<a href="https://colab.research.google.com/github/gopika20nair/Solubility-Prediction-of-Drug-like-Compounds-Using-Multi-Data-Representational-Modeling/blob/main/ML_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os

os.makedirs('data', exist_ok=True)

# Create dummy dataset
data = pd.DataFrame({
    'Mol_ID': [f'Mol_{i}' for i in range(1, 11)],
    'Mordred_1': np.random.rand(10),
    'RDKit_1': np.random.rand(10),
    'Morgan_1': np.random.rand(10),
    'MACCS_1': np.random.rand(10),
    'Avalon_1': np.random.rand(10),
    'ErG_1': np.random.rand(10),
    'Solubility': np.random.rand(10) * 10  # target variable
})

# Save CSV in 'data/' folder
data.to_csv('data/example_data.csv', index=False)

data.head()


Unnamed: 0,Mol_ID,Mordred_1,RDKit_1,Morgan_1,MACCS_1,Avalon_1,ErG_1,Solubility
0,Mol_1,0.530866,0.794129,0.716986,0.833831,0.149328,0.167289,2.419205
1,Mol_2,0.041354,0.503599,0.96748,0.352165,0.511913,0.620927,6.261698
2,Mol_3,0.546102,0.373752,0.694152,0.361797,0.543751,0.653226,8.661554
3,Mol_4,0.747928,0.382816,0.541318,0.528213,0.323952,0.41711,9.897116
4,Mol_5,0.855294,0.193308,0.829126,0.661586,0.343339,0.175042,6.785721


In [None]:
import pandas as pd
import numpy as np
import os

In [2]:
# Step 2: Linear models demo

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

# Load dummy data
df = pd.read_csv('data/example_data.csv')
X = df.drop(['Mol_ID', 'Solubility'], axis=1)
y = df['Solubility']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        'R2': r2_score(y_test, y_pred),
        'RMSE': mean_squared_error(y_test, y_pred, squared=False)
    }

pd.DataFrame(results).T

TypeError: got an unexpected keyword argument 'squared'

In [None]:
# Step 3: CatBoost demo
!pip install catboost --quiet

from catboost import CatBoostRegressor

# Train CatBoost
cat_model = CatBoostRegressor(
    iterations=100, learning_rate=0.1, depth=3, verbose=0, random_state=42
)
cat_model.fit(X_train, y_train)

# Predict & evaluate
y_pred = cat_model.predict(X_test)
print("CatBoost R2:", r2_score(y_test, y_pred))
print("CatBoost RMSE:", mean_squared_error(y_test, y_pred, squared=False))



In [None]:
# Step 4: SHAP-based feature selection + CatBoost demo
import shap
from catboost import CatBoostRegressor

# 1️⃣ Train CatBoost on all dummy features
cat_model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=3, verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

# 2️⃣ SHAP explainer to rank features
explainer = shap.Explainer(cat_model, X_train)
shap_values = explainer(X_train)

# Get mean absolute SHAP values per feature
shap_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.abs(shap_values.values).mean(axis=0)
}).sort_values(by='importance', ascending=False)

# Select top 3 features (for demo)
top_features = shap_importance['feature'].head(3).tolist()
print("Top features selected by SHAP:", top_features)

# 3️⃣ Train CatBoost again with selected features
cat_model_selected = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=3, verbose=0, random_state=42)
cat_model_selected.fit(X_train[top_features], y_train)

# Evaluate
y_pred = cat_model_selected.predict(X_test[top_features])
print("CatBoost (top features) R2:", r2_score(y_test, y_pred))
print("CatBoost (top features) RMSE:", mean_squared_error(y_test, y_pred, squared=False))

# 4️⃣ SHAP summary plot
shap.summary_plot(explainer(X_train[top_features]), X_train[top_features], show=True)
