In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [13]:
data = pd.read_csv('drug_discovery_virtual_screening.csv')
data

Unnamed: 0,compound_id,protein_id,molecular_weight,logp,h_bond_donors,h_bond_acceptors,rotatable_bonds,polar_surface_area,compound_clogp,protein_length,protein_pi,hydrophobicity,binding_site_size,mw_ratio,logp_pi_interaction,binding_affinity,active
0,CID_00000,PID_361,499.671415,2.487233,1,7,4,113.350817,4.050696,678,6.019657,0.812534,12.512165,0.736978,14.972288,5.996665,0
1,CID_00001,PID_165,436.173570,3.283222,3,4,4,71.981132,3.704408,876,6.447408,0.651417,11.538420,0.497915,21.168271,6.445742,0
2,CID_00002,PID_168,514.768854,,2,11,11,83.936307,1.869610,658,3.925837,0.633467,13.155702,0.782323,9.074061,5.689583,0
3,CID_00003,PID_226,602.302986,3.038058,0,5,5,79.868125,2.451909,312,7.597056,0.513038,12.071822,1.930458,23.080293,6.043438,0
4,CID_00004,PID_224,426.584663,0.659578,2,4,5,88.198676,1.771936,1418,4.249454,0.613571,15.850448,0.300835,2.802846,4.845055,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CID_01995,PID_425,557.015024,3.542686,4,5,11,102.329688,1.470121,240,7.862185,0.770925,21.308909,2.320896,27.853256,7.209850,1
1996,CID_01996,PID_100,447.347874,0.383282,1,8,5,52.321214,3.739447,718,7.401501,0.644721,16.578658,0.623047,2.836864,9.029696,1
1997,CID_01997,PID_265,361.812535,3.019553,0,4,9,86.722469,3.122027,1228,5.551238,0.717044,16.829751,0.294636,16.762259,5.728118,0
1998,CID_01998,PID_125,433.693304,5.965067,0,4,10,86.932238,3.866579,1244,5.680128,0.661245,10.604194,0.348628,33.882348,7.324152,1


In [14]:
X = data.drop(columns=['compound_id', 'protein_id', 'active'])
y = data['active']

In [15]:
numerical_cols = X.columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols)
])

X_processed = preprocessor.fit_transform(X)

In [16]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [17]:
print("Résultats de la validation croisée (5-fold accuracy):\n")
for name, model in models.items():
    scores = cross_val_score(model, X_processed, y, cv=5, scoring='accuracy')
    print(f"{name}:")
    print(f"  Moyenne: {scores.mean():.4f}")
    print(f"  Écart-type: {scores.std():.4f}")
    print(f"  Scores individuels: {scores}\n")

Résultats de la validation croisée (5-fold accuracy):

Logistic Regression:
  Moyenne: 0.9840
  Écart-type: 0.0082
  Scores individuels: [0.9875 0.97   0.995  0.985  0.9825]

Decision Tree:
  Moyenne: 1.0000
  Écart-type: 0.0000
  Scores individuels: [1. 1. 1. 1. 1.]

Random Forest:
  Moyenne: 1.0000
  Écart-type: 0.0000
  Scores individuels: [1. 1. 1. 1. 1.]

XGBoost:
  Moyenne: 0.9970
  Écart-type: 0.0037
  Scores individuels: [1.     1.     0.9975 0.9975 0.99  ]



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
