In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [12]:
df = pd.read_csv("drug_discovery_virtual_screening.csv")
df.head()

Unnamed: 0,compound_id,protein_id,molecular_weight,logp,h_bond_donors,h_bond_acceptors,rotatable_bonds,polar_surface_area,compound_clogp,protein_length,protein_pi,hydrophobicity,binding_site_size,mw_ratio,logp_pi_interaction,binding_affinity,active
0,CID_00000,PID_361,499.671415,2.487233,1,7,4,113.350817,4.050696,678,6.019657,0.812534,12.512165,0.736978,14.972288,5.996665,0
1,CID_00001,PID_165,436.17357,3.283222,3,4,4,71.981132,3.704408,876,6.447408,0.651417,11.53842,0.497915,21.168271,6.445742,0
2,CID_00002,PID_168,514.768854,,2,11,11,83.936307,1.86961,658,3.925837,0.633467,13.155702,0.782323,9.074061,5.689583,0
3,CID_00003,PID_226,602.302986,3.038058,0,5,5,79.868125,2.451909,312,7.597056,0.513038,12.071822,1.930458,23.080293,6.043438,0
4,CID_00004,PID_224,426.584663,0.659578,2,4,5,88.198676,1.771936,1418,4.249454,0.613571,15.850448,0.300835,2.802846,4.845055,0


In [13]:
X = df.drop(columns=['compound_id', 'protein_id', 'active'])
y = df['active']

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

In [18]:
results = {}
for name, model in models.items():
    if name in ["SVM", "Logistic Regression", "KNN"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(name)
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

best_model = max(results, key=results.get)
print("Meilleur modèle :", best_model, "avec une accuracy de", results[best_model])

Logistic Regression
Accuracy: 0.9800
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       267
           1       0.98      0.95      0.97       133

    accuracy                           0.98       400
   macro avg       0.98      0.97      0.98       400
weighted avg       0.98      0.98      0.98       400

Random Forest
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00       133

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Gradient Boosting
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00       133

    accuracy                           1.00       400
   macro avg       1.00  

In [19]:
new_data = pd.DataFrame([{
    'molecular_weight': 350.0,
    'logp': 3.2,
    'h_bond_donors': 2,
    'h_bond_acceptors': 6,
    'rotatable_bonds': 4,
    'polar_surface_area': 75.0,
    'compound_clogp': 2.9,
    'protein_length': 400,
    'protein_pi': 6.5,
    'hydrophobicity': 0.2,
    'binding_site_size': 120,
    'mw_ratio': 0.875,
    'logp_pi_interaction': 0.3,
    'binding_affinity': -8.5
}])

In [None]:
best_model_instance = models[best_model]

new_data_imputed = imputer.transform(new_data)

if best_model in ["SVM", "Logistic Regression", "KNN"]:
    new_data_prepared = scaler.transform(new_data_imputed)
else:
    new_data_prepared = new_data_imputed

prediction = best_model_instance.predict(new_data_prepared)
prediction_proba = None

if hasattr(best_model_instance, "predict_proba"):
    prediction_proba = best_model_instance.predict_proba(new_data_prepared)

print("Prédiction :", prediction[0])
if prediction_proba is not None:
    print("Probabilités :", prediction_proba[0])

Prédiction : 0
Probabilités : [0.96 0.04]
