In [None]:
!pip install pandas
!pip install scikit-learn
!pip install lightgbm
!pip install xgboost
!pip install catboost

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Func 1: TTV, Xy Splits - will need to add numpy.stack() capability for mol fingerprints
# Function
def splits(df, mode= 1):
    if df.shape[0] <= 1000:
        raise ValueError(f"Not enough data: {df.shape[0]} datapoints")
    required_cols = {"SMILES", "Class", "Value", "Kinase", "Group", "Fingerprint"}
    if set(df.columns) != required_cols:
        raise TypeError(f"Incorrect dataframe structure: \nProvided {list(df.columns)} \nExpected {list(required_cols)}")
    if mode == 1:
        y = df["Class"]
    elif mode == 0:
        y = df["Value"]
    else:
        raise ValueError(f"{mode} is not a valid key. Use mode= 1 (clf) or mode= 0 (reg)")
    X = df["Fingerprint"]
    X_remainder, X_test, y_remainder, y_test = train_test_split(X, y, test_size= 0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_remainder, y_remainder, test_size= 0.25)

    return X_train, X_test, X_val, y_train, y_test, y_val
"""
# Usage (regression)
X_train, X_test, X_val, y_train, y_test, y_val = splits(df, mode= 0)
# Usage (classification)
X_train, X_test, X_val, y_train, y_test, y_val = splits(df, mode= 1)
"""

'\n# Usage (regression)\nX_train, X_test, X_val, y_train, y_test, y_val = splits(df, mode= 0)\n# Usage (classification)\nX_train, X_test, X_val, y_train, y_test, y_val = splits(df, mode= 1)\n'

In [3]:
df1 = pd.read_csv(r"Representative_kinases (1)\Representative_kinases\Rep_kinases_dataset.csv", sep= "	")
df2 = pd.read_csv(r"Dark Matter (I'm not running descriptors again).csv")

In [16]:
df1.head()

Unnamed: 0.1,Unnamed: 0,CPD_ID,NonstereoAromaticSMILES,p_standard_type,p_value,Full_name,Kinase_name,Kinase_family,Kinase_group,UniProt_ID,Source,PAINS_bool
0,17,19,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,pIC50,6.45,glycogen synthase kinase 3 alpha,GSK3A,GSK,CMGC,P49840,"BindingDB, DTC",False
1,18,19,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,pIC50,6.45,glycogen synthase kinase 3 beta,GSK3B,GSK,CMGC,P49841,"BindingDB, DTC",False
2,24,25,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,pIC50,7.81,protein kinase C alpha,PKCa,PKC,AGC,P17252,BindingDB,False
3,25,25,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,pIC50,8.75,protein kinase cAMP-activated catalytic subuni...,PKACa,PKA,AGC,P17612,BindingDB,False
4,26,25,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,pIC50,7.86,"adrenergic, beta, receptor kinase 1",BARK1,GRK,AGC,P25098,BindingDB,False


In [4]:
df1 = df1[df1["p_standard_type"] == "pIC50"]
df1["SMILES"] = df1["NonstereoAromaticSMILES"]
df1["Value"] = df1["p_value"]
df1["Class"] = 1
df1["Kinase"] = df1["Kinase_name"]
df1["Group"] = df1["Kinase_group"]

df2["Value"] = 0
df2["Kinase"] = "n/a"
df2["Group"]= "n/a"

dfx = df1[["SMILES", "Class", "Value", "Kinase", "Group"]]
dfy = df2[["SMILES", "Class", "Value", "Kinase", "Group"]]
del df1, df2
df = pd.concat([dfx, dfy], axis= 0)
del dfx, dfy
df.head(99999)
# For now, FP = 1

Unnamed: 0,SMILES,Class,Value,Kinase,Group
0,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,1,6.45,GSK3A,CMGC
1,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,1,6.45,GSK3B,CMGC
2,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,7.81,PKCa,AGC
3,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,8.75,PKACa,AGC
4,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,7.86,BARK1,AGC
...,...,...,...,...,...
15156,CN1CCNC[C@@H]1c1noc(-c2nn(C)c3ccccc3c2=O)n1,0,0.00,,
15157,CN1CCNC[C@H]1c1noc(-c2nn(C)c3ccccc3c2=O)n1,0,0.00,,
15158,CN1CCNC[C@@H]1c1noc(-c2ccccc2SCC(N)=O)n1,0,0.00,,
15159,CN1CCNC[C@H]1c1noc(-c2ccccc2SCC(N)=O)n1,0,0.00,,


In [5]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

def gen_fps(df, fingerprint_type="Morgan", resolution=1024):
    if "SMILES" not in df.columns:
        raise ValueError("DataFrame must contain a 'SMILES' column.")

    smiles = df["SMILES"].tolist()
    mols = [Chem.MolFromSmiles(s) for s in smiles if Chem.MolFromSmiles(s)]
    del smiles
    
    if fingerprint_type == 'Morgan':
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=resolution)
    elif fingerprint_type == 'AtomPair':
        generator = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=resolution)
    else:
        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")

    fingerprints = generator.GetFingerprints(mols)
    del mols
    return fingerprints
    del fingerprints

In [6]:
df["Fingerprint"] = gen_fps(df, "Morgan", 1024)
X_train, X_test, X_val, y_train, y_test, y_val = splits(df, mode= 0)

In [77]:
# Test run for above script
dfz = df.iloc[0:100, 0:-1]
fingerprint_types = ["Morgan", "AtomPair"]
resolutions = [1024, 2048, 4098]
dfz["Fingerprint"] = gen_fps(dfz, fingerprint_type= fingerprint_types[0], resolution = resolutions[0])
dfz.head()

Unnamed: 0,SMILES,Class,Value,Kinase,Group,Fingerprint
0,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,1,6.45,GSK3A,CMGC,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CCCC(=O)Nc1n[nH]c2ncc(B3OC(C)(C)C(C)(C)O3)cc12,1,6.45,GSK3B,CMGC,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,7.81,PKCa,AGC,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,8.75,PKACa,AGC,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,Cc1c(Oc2ccc(C(C)N)cc2F)ccc2c1B(O)OC2,1,7.86,BARK1,AGC,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [7]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from catboost import CatBoostClassifier, CatBoostRegressor

In [8]:
# These should work with either a clf or reg as input (scoring technique will vary), using clf as a placeholder
def hyper_params(clf, params, score, X, y):
    grid = GridSearchCV(clf, params, cv= 5, scoring= score)
    grid.fit(X, y)
    return grid.best_params_, grid.best_score_
    del grid

def tune_params(clf, params, metric, X, y, clf_title= "CLF"):
    param, score= hyper_params(clf, params, metric, X, y)
    print(f"{clf_title}: {param, score}, Metric: {metric}")

# Light Gradient Boosting (LGB)
clfLGB = LGBMClassifier(random_state = 42)
regLGB = LGBMRegressor(random_state = 42)
LGB_params = {
    "num_leaves": [10, 50, 100],
    "max_depth": [-1, 2, 5],
    "learning_rate": [0.01, 0.1, 0.2],
    "feature_fraction": [0.1, 0.5, 1]
}
# 3x3x3x3 training grid - 81 runs

# Extreme Gradient Boosting (XGB)
clfXGB = XGBClassifier(random_state = 42)
regXGB = XGBRegressor(random_state = 42)
XGB_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [-1, 2, 5],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.1, 0.5, 1.0]
}
# 3x3x3x3 training grid - 81 runs

# Multilayer Perceptron (MLP)
clfMLP = MLPClassifier(random_state = 42)
regMLP = MLPRegressor(random_state = 42)
MLP_params = {
    "hidden_layer_sizes": [(100,), (10,10)],
    "learning_rate_init": [0.001, 0.002, 0.005],
    "alpha": [0.0001, 0.0002]
}
# 2x3x2 training grid - 12 runs

# Random Forest (RF)
clfRF = RandomForestClassifier(random_state = 42)
regRF = RandomForestRegressor(random_state = 42)
RF_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 2, 5],
    "max_features": [10, 100],
    "min_samples_split": [2, 5, 7]
}
# 3x3x3x3 training grid - 81 runs

# Support Vector Machine (SVM)
clfSVM = SVC()
regSVM = SVR()
SVM_params = {
    "C": [1, 0.5, 2, 5],
    "kernel": ["linear", "rbf"]
}
# 4x2 training grid - 8 runs

# CatBoost (CB)
clfCB = CatBoostClassifier(random_seed = 42)
regCB = CatBoostRegressor(random_seed = 42)
CB_params = {
    "iterations": [500, 1000, 2000],
    "depth": [2, 5, 7],
    "learning_rate": [0.01, 0.02, 0.03],
    "l2_leaf_reg": [1.0, 2.0, 3.0]
}
# 3x3x3x3 training grid - 81 runs

In [15]:
# Test Case
RF_params = {
    "n_estimators": [50, 100],
    "max_depth": [None],
    "max_features": [1, 100],
    "min_samples_split": [2]
}
tune_params(regRF, RF_params, "r2", np.stack(X_val), y_val, "Random Forest Regressor")

# Note: Must stack X variable (Fingerprint), designate a metric to tune based off, and designate the correct regressor / classifier and param dict

Random Forest Regressor: ({'max_depth': None, 'max_features': 100, 'min_samples_split': 2, 'n_estimators': 100}, np.float64(0.9293446270932499)), Metric: r2


In [21]:
from sklearn.metrics import accuracy_score
def create_ensemble_model(models_dict):
    estimators = [(name, clf) for name, clf in models_dict.items()]
    ensemble_clf = VotingClassifier(
        estimators=estimators,
        voting="soft"
    )
    return ensemble_clf

# Dummy Data
X = np.random.rand(100, 10)  
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Untuned, but in-use for now
tuned_classifiers = {
    'RF': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'SVC': SVC(kernel='rbf', probability=True, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, random_state=42)
}

# Basic Functionalisation
ensemble_model = create_ensemble_model(
    models_dict=tuned_classifiers
)

ensemble_model.fit(X_train, y_train)
print("Training complete.")
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Evaluation ---")
print(f"Ensemble Accuracy on Test Set: {accuracy:.4f}")

Training complete.

--- Evaluation ---
Ensemble Accuracy on Test Set: 0.5667


