## Import Data

In [5]:
import os
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
import joblib

from eosce.models import ErsiliaCompoundEmbeddings

## Ersilia Descriptor Model

In [36]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [37]:
train_df

Unnamed: 0,DrugBankID,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,Compound::DB07558,[H]C(=O)[C@H](CCCC)NC(=O)[C@H](CC(C)C)NC(=O)[C...,0.555787,0.684139,0.512719,-0.529718,0.262826,-0.655200,-0.660008,-0.542136,...,-0.374233,-0.247684,-0.171424,0.751738,0.813677,0.581156,0.583023,0.023468,-0.481377,0.333508
1,Compound::DB00642,NC1=NC(=O)C2=C(NC=C2CCC2=CC=C(C=C2)C(=O)N[C@@H...,-0.392229,-0.408550,-0.678823,-0.117495,0.345655,-0.461185,0.179226,-0.390624,...,0.103734,0.404542,-0.495886,-0.024562,0.537487,-0.723340,-0.179744,-0.217128,0.518866,-0.603604
2,Compound::DB07687,[H][C@]1(N)CC[C@@]([H])(CC1)NC1=NC2=NC=NN2C(NC...,-0.595988,0.566669,0.453570,0.568219,0.629722,-0.558329,-0.581300,0.621745,...,0.524033,-0.575679,-0.583070,-0.175447,0.595868,-0.497308,-0.425932,-0.623063,0.635332,0.630280
3,Compound::DB01116,O=C1N(CC2=CC=CC=C2)C2C[S+]3CCCC3C2N1CC1=CC=CC=C1,0.284446,-0.204648,-0.366842,0.314053,0.005129,0.716794,0.401084,-0.772718,...,0.255622,-0.406962,-0.638894,-0.250029,-0.332250,-0.635454,-0.518743,0.546009,0.286829,-0.728248
4,Compound::DB09212,CC(C(O)=O)C1=CC=C(CC2CCCC2=O)C=C1,0.610174,0.601935,-0.689303,-0.590422,0.741201,-0.624510,-0.437523,-0.668031,...,0.433412,-0.537336,-0.696722,-0.370152,-0.215580,-0.317241,-0.527324,-0.550422,0.400996,-0.569671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7040,Compound::DB07838,OC1=C(\C=C2/SC(=S)N(CC3=CC=CC=C3)C2=O)C=CC=C1[...,0.576086,0.531252,-0.397937,0.656185,0.546268,-0.548028,-0.635353,0.616000,...,-0.635258,-0.471979,-0.337811,0.298309,0.678311,0.549808,0.158899,0.603398,-0.587207,0.505850
7041,Compound::DB07267,CC1=CC=CC(=N1)C1=NC(NC2=CC=NC=C2)=C2C=CC=CC2=N1,-0.628725,0.552413,-0.509549,0.533964,0.572574,0.556859,-0.627744,0.587102,...,-0.640469,-0.655582,0.653508,0.587660,0.500923,-0.557616,-0.524073,-0.415384,-0.608517,-0.602539
7042,Compound::DB07473,COC(=O)[C@@H]1CS[C@]2(N1C(=O)C1=CC=CC=C21)C1=C...,0.547474,0.594530,0.598974,0.588209,-0.518837,-0.532142,-0.621101,-0.575629,...,-0.571024,-0.679230,0.520724,0.540047,0.583776,0.552496,-0.426420,-0.364009,0.507542,-0.616206
7043,Compound::DB00977,[H][C@@]12CC[C@@](O)(C#C)[C@@]1(C)CC[C@]1([H])...,-0.274505,-0.261260,-0.074179,0.472879,-0.036972,0.161962,-0.306623,0.011028,...,0.371580,0.385515,-0.458153,0.033768,0.068725,-0.652143,-0.418417,0.122688,0.749156,-0.826212


In [9]:
# Function to calculate Ersilia descriptors
def calculate_ersilia_descriptors(smiles):
    model = ErsiliaCompoundEmbeddings()
    embeddings = model.transform([smiles])
    return embeddings

# Function to preprocess data and create embeddings
def preprocess_data(df):
    # Get the target embeddings
    embeddings = df.iloc[:, 2:].values
    # create Ersilia descriptor columns
    df['ersilia_descriptors'] = df['SMILES'].apply(calculate_ersilia_descriptors)
    df = df.dropna()
    # Extract the descriptors as a NumPy array
    ersilia_descriptors = np.vstack(df['ersilia_descriptors'].values)
    return ersilia_descriptors, embeddings

In [10]:
# The ersilia descriptor is X, our features
# The embeddings is y, our target variable
X_train, y_train = preprocess_data(train_df)
X_test, y_test = preprocess_data(test_df)

print("The length of X_train is:", len(X_train))
print("The length of X_test is:", len(X_test))

[10:41:41] Unusual charge on atom 0 number of radical electrons set to zero


The length of X_train is: 7045
The length of X_test is: 1762


Save the ersilia embeddings since it takes a long time to convert them.

In [12]:
# # Specify the directory
directory = 'data/ersilia_embeddings/'

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the variables as text files
np.savetxt(os.path.join(directory, 'X_train_ersilia1.txt'), X_train)
np.savetxt(os.path.join(directory, 'y_train_ersilia1.txt'), y_train)
np.savetxt(os.path.join(directory, 'X_test_ersilia1.txt'), X_test)
np.savetxt(os.path.join(directory, 'y_test_ersilia1.txt'), y_test)

In [14]:
# # Load the variables as NumPy arrays
# X_train = np.loadtxt(os.path.join(directory, 'X_train_ersilia1.txt'))
# y_train = np.loadtxt(os.path.join(directory, 'y_train_ersilia1.txt'))
# X_test = np.loadtxt(os.path.join(directory, 'X_test_ersilia1.txt'))
# y_test = np.loadtxt(os.path.join(directory, 'y_test_ersilia1.txt'))

In [13]:
print(X_train.shape)
print(X_train)

(7045, 1024)
[[-0.02310181 -0.04397583 -0.00270081 ... -0.00564575 -0.0881958
   0.04522705]
 [ 0.01737976 -0.01235962 -0.06063843 ... -0.01861572 -0.05303955
  -0.0249176 ]
 [ 0.14355469  0.14587402 -0.13378906 ... -0.04837036  0.01075745
  -0.10162354]
 ...
 [ 0.27685547  0.19360352 -0.2286377  ... -0.02882385 -0.01132202
  -0.08850098]
 [-0.00341797  0.03546143  0.10424805 ... -0.01483154  0.01533508
   0.04705811]
 [ 0.12420654 -0.15637207 -0.07702637 ...  0.03060913 -0.12634277
   0.03120422]]


## Train Model with KNN

In [27]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(n_neighbors=3, metric='euclidean')
knn_model.fit(X_train)

# Find the indices of the three closest neighbors
distances, indices = knn_model.kneighbors(X_test[0:1, :])

# Extract y outputs of the three closest records
closest_y = y_train[indices]

print("Distances:", distances)
print("Indices:", indices)
print("Closest Embeddings:", closest_y)
# Calculate the average along the first axis (axis=0)
average_output = np.mean(closest_y, axis=1)
print("Length of output:", len(average_output[0]))
print(average_output)

Distances: [[2.0264864  2.19537783 2.23343897]]
Indices: [[2369 5277 2436]]
Closest Embeddings: [[[-0.5735711   0.37308925 -0.5972502  ... -0.44339147 -0.3090781
    0.5482032 ]
  [-0.5945403  -0.44518626 -0.5611646  ...  0.625268    0.5491931
   -0.51047724]
  [ 0.56813157  0.5638532  -0.63853717 ... -0.5590035  -0.53747356
    0.50896156]]]
Length of output: 400
[[-0.19999328  0.16391873 -0.59898399  0.56114909  0.65665456 -0.62799144
  -0.30286559  0.31078055  0.56411586  0.1807971   0.55732662  0.23367964
  -0.18237691 -0.09352232  0.17632443  0.41520052 -0.15366588  0.31306577
  -0.17567227  0.56511272  0.28451493  0.19100693 -0.15944883 -0.19927127
   0.22459325 -0.1947426   0.08593253 -0.0937753  -0.50835407  0.24905544
   0.60398447  0.21182925  0.62380188 -0.25798699  0.57470514 -0.5668905
  -0.30021788  0.26165003 -0.1312384   0.3809584  -0.15695113 -0.61281192
  -0.14000219 -0.18989841  0.53619286 -0.59154709 -0.38337433 -0.23912089
   0.22025827  0.10729546  0.6419343  -0.2

## Evaluate model on the test set
#### Mean Squared Error

In [34]:
# Use the trained KNN model to find the neighbors for each sample in x_test
distances_test, indices_test = knn_model.kneighbors(X_test)

# Extract y outputs of the three closest records for each sample in x_test
closest_y_test = y_train[indices_test]

# Calculate the average along the first axis for each sample
average_output_test = np.mean(closest_y_test, axis=1)

# Evaluate the performance using mean squared error
mse = mean_squared_error(y_test, average_output_test)
print(f"Mean Squared Error on Test Set: {mse}")

Mean Squared Error on Test Set: 0.2708968508285105


In [35]:
print("Closest y_test:", closest_y_test)

Closest y_test: [[[-0.5735711   0.37308925 -0.5972502  ... -0.44339147 -0.3090781
    0.5482032 ]
  [-0.5945403  -0.44518626 -0.5611646  ...  0.625268    0.5491931
   -0.51047724]
  [ 0.56813157  0.5638532  -0.63853717 ... -0.5590035  -0.53747356
    0.50896156]]

 [[-0.6005462   0.6055812  -0.5242582  ...  0.6410329  -0.52004665
    0.47353834]
  [ 0.595301    0.6533095  -0.62112886 ...  0.59556204 -0.55555624
    0.49803704]
  [-0.72417855  0.5839263  -0.4494264  ...  0.6064595  -0.40337744
    0.4793027 ]]

 [[-0.40796453  0.17764696  0.5150937  ...  0.1562344   0.7907612
   -0.2759412 ]
  [-0.5956661   0.7130561   0.44248554 ...  0.33546528  0.69543463
    0.43060935]
  [-0.7231395   0.7241269   0.53797907 ...  0.12707949  0.52756506
   -0.34128043]]

 ...

 [[ 0.49792987  0.5125827   0.3092664  ...  0.25743964  0.93057483
   -0.29174167]
  [ 0.57928836  0.48944658  0.44959563 ...  0.2764773   0.8786604
   -0.3164748 ]
  [-0.39553514 -0.17724572  0.04777418 ...  0.46698603  0.54444

In [32]:
print("Average Ouptut Test:", average_output_test)

Average Ouptut Test: [[-0.19999328  0.16391873 -0.59898399 ... -0.12570899 -0.09911952
   0.18222917]
 [-0.24314125  0.61427233 -0.53160449 ...  0.61435148 -0.49299344
   0.48362603]
 [-0.57559004  0.53827665  0.49851944 ...  0.20625972  0.67125363
  -0.06220409]
 ...
 [ 0.2272277   0.27492785  0.26887874 ...  0.33363432  0.78455851
  -0.38418842]
 [-0.04708442  0.52745792 -0.02126018 ... -0.51215395  0.59735809
  -0.4060195 ]
 [ 0.40970762  0.31380822  0.43841271 ...  0.08163132 -0.38014714
  -0.28559785]]


#### R Squared Score

In [40]:
# Calculate the R2 score for the entire test set
r2 = r2_score(y_test, average_output_test)
print(f"Overall R2 Score: {r2}")

Overall R2 Score: -0.10868162736446807


#### Cosine Similarity

In [42]:
cosine_sim = cosine_similarity(y_test, average_output_test)
print(f"Cosine Similarity: {cosine_sim}")

mean_cosine_similarity = np.mean(cosine_sim)
print(f"Mean Cosine Similarity: {mean_cosine_similarity}")

Cosine Similarity: [[ 0.30205464  0.42833475  0.25077799 ...  0.00319881  0.03728955
   0.34556184]
 [ 0.36386092  0.60158339  0.28988368 ... -0.04585789 -0.01264661
   0.31092235]
 [ 0.35871463  0.29136447  0.6384677  ...  0.05018722  0.0711093
   0.20928497]
 ...
 [-0.056573   -0.09113579  0.12126339 ...  0.94739698  0.64582504
  -0.01570938]
 [-0.06007854 -0.07827573  0.09031833 ...  0.54104194  0.61866196
   0.02193535]
 [ 0.3095099   0.34641646  0.38122138 ...  0.05298846  0.09540129
   0.41717689]]
Mean Cosine Similarity: 0.16484680698992094


#### Euclidean Distance

In [43]:
euclidean_dist = distance.euclidean(y_test.flatten(), average_output_test.flatten())
print(f"Euclidean Distance: {euclidean_dist}")

Euclidean Distance: 436.953201686331


Smaller values indicate that the vectors are closer in the Euclidean space, and larger values indicate they are farther apart.

### Save Model

In [33]:
# Save the trained model to a file
joblib.dump(knn_model, 'knn_model.joblib')
# loaded_knn_model = joblib.load('knn_model.joblib')

['knn_model.joblib']