In this notebook we reproduce one of the classifiers demonstrated in the paper [In Silico Prediction of Cytochrome P450-Drug Interaction: QSARs for CYP3A4 and CYP2C9](https://www.mdpi.com/1422-0067/17/6/914): the k-nearest-neighbours classifier for CYP2C9.

In [77]:
import os
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Directory that holds the CYP dataset CSV files. It is a relative path, so it
# is resolved against the kernel's current working directory.
path_data = "../datasets/CYP/"

Load the training data, separate the labels from the features, and select the features used in the paper for the k-nearest-neighbours classifier.

In [78]:
# Read the CYP2C9 training split; the file writes decimals with a comma.
train_csv_path = os.path.join(path_data, "CYP2C9_dataset_training.csv")
train_data = pd.read_csv(train_csv_path, decimal=",")

# Binary target: 1 where Class is "Active", 0 for every other value.
train_labels = train_data["Class"].eq("Active").to_numpy().astype(int)

# Remove the target column, then pick the classifier inputs by position
# (columns 5 through 10 of the remaining frame).
train_data = train_data.drop(columns=["Class"])
train_data_knn = train_data.iloc[:, list(range(5, 11))]

Load the testing data, separate the labels from the features, and select the features used in the paper for the k-nearest-neighbours classifier.

In [53]:
# Read the CYP2C9 testing split; the file writes decimals with a comma.
test_csv_path = os.path.join(path_data, "CYP2C9_dataset_testing.csv")
test_data = pd.read_csv(test_csv_path, decimal=",")

# Binary target: 1 where Class is "Active", 0 for every other value.
test_labels = test_data["Class"].eq("Active").to_numpy().astype(int)

# Remove the target column, then pick the classifier inputs by position
# (columns 5 through 10 of the remaining frame).
test_data = test_data.drop(columns=["Class"])
test_data_knn = test_data.iloc[:, list(range(5, 11))]

Assign appropriate data types to the features.

In [57]:
# One dtype per column, shared by both splits so they cannot drift apart.
# NOTE(review): train_data_knn / test_data_knn were sliced before this cast and
# are not rebuilt here, so they keep whatever dtypes read_csv inferred.
column_dtypes = {
    "SMILES": object,
    "Sp": float,
    "nBM": int,
    "ARR": float,
    "nPyrimidines": int,
    "HyWi_B(m)": float,
    "GATS2i": float,
    "Eta_betaP_A": float,
    "nRNR2": int,
    "F01[C-N]": int,
    "MLOGP": float,
}
test_data = test_data.astype(column_dtypes)
train_data = train_data.astype(column_dtypes)

In [70]:
# k-nearest-neighbours classifier: 14 neighbours, ball-tree neighbour search.
nbrs = KNeighborsClassifier(n_neighbors=14, algorithm='ball_tree')
# Fit on the selected training columns and their 0/1 labels. The label array is
# `train_labels`; the previous spelling `train_lables` is not defined anywhere
# and raised NameError on a fresh kernel.
nbrs.fit(train_data_knn, train_labels)
# Predicted 0/1 class for every row of the test split.
predicted = nbrs.predict(test_data_knn)

In [71]:
# Rows are the true classes and columns the predicted classes, so ravel() on a
# binary matrix yields (tn, fp, fn, tp). labels=[0, 1] pins the matrix to 2x2;
# without it the matrix shrinks when only one class occurs in both arrays and
# the four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(test_labels, predicted, labels=[0, 1]).ravel()

In [73]:
# Sensitivity = TP / (TP + FN): fraction of actual positives predicted positive.
# Specificity = TN / (TN + FP): fraction of actual negatives predicted negative.
Sn, Sp = tp / (tp + fn), tn / (tn + fp)

# NER is the arithmetic mean of sensitivity and specificity.
ner = (Sn + Sp) / 2

print("NER:", ner, "Sensitivity:", Sn, "Specificity:", Sp)

NER: 0.7336992681304059 Sensitivity: 0.6111111111111112 Specificity: 0.8562874251497006
