In [2]:
import pandas as pd
import numpy as np
import joblib

# Seed NumPy's global random generator so np.random calls repeat across runs
SEED = 42
np.random.seed(SEED)


In [3]:
# Read the descriptor matrix CSV from the data folder into a DataFrame
descriptor_csv = '../data/descriptor_matrix.csv'
desc_df = pd.read_csv(descriptor_csv)

# Show the first five rows as the cell output
desc_df.head(5)


Unnamed: 0,compound_id,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,RingCount,ECFP_0,ECFP_1,...,ECFP_1014,ECFP_1015,ECFP_1016,ECFP_1017,ECFP_1018,ECFP_1019,ECFP_1020,ECFP_1021,ECFP_1022,ECFP_1023
0,CMPD001,124.139,0.4325,37.3,1,2,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,CMPD002,168.236,2.2874,37.3,1,1,1,2,0,0,...,0,0,0,0,0,1,0,0,0,1
2,CMPD003,192.258,2.9981,37.3,1,1,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Create alternating dummy labels (0, 1, 0, 1, ...), one per row of X.
# Row index modulo 2 keeps an integer dtype for every length; the previous
# np.append(y, []) step promoted the labels to float64 whenever len(X) was even.
# With an odd number of rows the last label is 0, so class 0 gets one extra
# sample and the labels are only approximately balanced.
y = np.arange(len(X)) % 2

# Confirm label distribution
unique, counts = np.unique(y, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))


Label distribution: {np.int64(0): np.int64(2), np.int64(1): np.int64(1)}


In [7]:
from sklearn.model_selection import train_test_split

# Split features and dummy labels with test_size=0.2; the fixed random_state
# makes the shuffle, and therefore the split, repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Forest settings: 200 trees, depth capped at 10, fixed seed for repeatability
rf_params = {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}

# Build the classifier, then fit it on the training split
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

print("Model trained successfully 💼🧬")


Model trained successfully 💼🧬


In [9]:
from pathlib import Path

# Save the trained model to the models/ folder.
# joblib is already imported in the first cell, so it is not re-imported here.
# Create the folder first: joblib.dump does not create missing parent
# directories, so the save would fail if ../models did not exist yet.
model_path = '../models/random_forest_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_model, model_path)

print("Model saved successfully 💾💼")


Model saved successfully 💾💼


In [10]:
# Build the screening feature matrix from the descriptor table.
# Drop the identifier column and any 'Predicted_Probability' column left by an
# earlier run of this cell; without the second drop, re-running the cell would
# hand the old predictions to the model as an extra feature.
# NOTE(review): X is built in a cell not shown here — confirm X_screen has the
# same columns, in the same order, as the X the model was trained on.
X_screen = desc_df.drop(
    columns=['compound_id', 'Predicted_Probability'], errors='ignore'
)

# Keep the second predict_proba column, i.e. the probability of
# rf_model.classes_[1] (label 1 when both dummy labels appear in training)
desc_df['Predicted_Probability'] = rf_model.predict_proba(X_screen)[:, 1]

# Preview the first rows (desc_df is not yet sorted by probability here)
desc_df[['compound_id', 'Predicted_Probability']].head()


Unnamed: 0,compound_id,Predicted_Probability
0,CMPD001,0.525
1,CMPD002,0.77
2,CMPD003,0.275


In [12]:
from pathlib import Path

# Sort compounds by predicted probability (highest first)
desc_df = desc_df.sort_values(by='Predicted_Probability', ascending=False)

# Save the full ranked table (every row, not only the top hits) to the
# results folder. Create the folder first: to_csv raises OSError when the
# target directory does not exist.
results_path = '../results/screening_results.csv'
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
desc_df.to_csv(results_path, index=False)

print("Screening results saved successfully 💾🧬")


Screening results saved successfully 💾🧬
