In [113]:
# import packages
import pickle 
import numpy as np
import pandas as pd

from Mold2_pywrapper import Mold2
from rdkit import Chem

In [114]:
# check xgboost version to make sure it is 1.6.2 to be able to lead the model

import xgboost as xgb

print(xgb.__version__)

1.6.2


In [118]:
def load_model(model):
    '''Function to load the model'''
    with open(model, 'rb') as file:
        model = pickle.load(file)
    return model

In [123]:
def load_mask(mask):
    '''Function to load the variance threshold mask'''
    with open(mask, 'rb') as file:
        mask = pickle.load(file)
    return mask

In [124]:
# load model
model = load_model('pkl/classifier1')

# load mask
mask = load_mask('pkl/mask1')

In [125]:
# load sample data
df = pd.read_csv("sample.csv")
print(df.shape)

# apply mask to sample data
df = df.loc[:, mask]

print(df.shape)

(2, 777)
(2, 645)


In [126]:
pred = model.predict(df)
print(pred)

[1 1]


## Wrapper Accuracy/Precision Check  

Can we use the wrapper instead of the command line tool (gui)?  
Based on the experiments below, Yes We Can. The wrappers outputs are basically the same as the gui outputs.

In [141]:
smiles_list = ['CC(=O)NC1=CC=C(C=C1)O', 'C(C(C1C(=C(C(=O)O1)O)O)O)O']

mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

mold2 = Mold2()
mold2.calculate(mols, show_banner=False)

# returns a dataframe 

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,3.90689,-0.108058,0.545455,1.05973
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,3.80735,2.71667,0.0,-2.73537


Instantiating a Mold2 object ensures that the executables for your platform are accessible. If this is not the case, an attempt to download them from the website of the FDA is made.

Should one have downloaded the original ZIP file available from the website of the FDA, the executables can be installed using the following:

In [151]:
# path_to_zipfile = '...'  # Replace by the path to the ZIP file on your machine
# mold2 = Mold2.from_executable(path_to_zipfile)
# print(mold2.calculate(mols))

Executables will be installed for future use. From then on, default instanciation may be carried out

In [152]:
mold2 = Mold2()
results = mold2.calculate(mols, show_banner=False)

In [153]:
# convert to float
results = results.astype(float)

# convert to 3 decimal places
results = results.round(3)
results

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.807,2.717,0.0,-2.735


In [154]:
# check for NaN values
results.isnull().values.any()

False

So I'm thinking when setting it up on a server, I can just run the above code to install the executables and leave it on the server. Then when I run the code to generate the descriptors, it will use the executables that are already installed.

In [155]:
results.to_csv('desc/wrapper_output.csv', index=False)

In [156]:
# read output from mold2 gui to compare with results from mold2_pywrapper
df_gui = pd.read_csv("desc/gui_output.txt", sep='\t')
df_gui.drop(['ReadIn_ID', 'USER_ID'], inplace=True, axis=1)
df_gui = df_gui.astype(float)
df_gui = df_gui.round(3)
df_gui

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06


In [157]:
# read second output from mold2 gui to compare with results from mold2_pywrapper
df_gui2 = pd.read_csv("desc/gui_output_2.txt", sep='\t')
df_gui2.drop(['ReadIn_ID', 'USER_ID'], inplace=True, axis=1)
df_gui2 = df_gui2.astype(float)
df_gui2 = df_gui2.round(3)
df_gui = pd.concat([df_gui, df_gui2], axis=0)
df_gui

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.807,2.717,0.0,-2.735


In [158]:
# check for NaN values
df_gui.isnull().values.any()

False

In [159]:
# compare results from mold2_pywrapper and mold2 gui
df_gui['D775']

0   -0.108
0    2.717
Name: D775, dtype: float64

In [160]:
results['D775']

0   -0.108
1    2.717
Name: D775, dtype: float64

In [161]:
# compare results from mold2_pywrapper and mold2 gui
df_gui.equals(results)

False

In [196]:
# compare the first rows of the two dataframes
check = df_gui.head(1) == results.head(1)

# which columns are not equal
whichfalse = check[check == False].stack()[0].index
whichfalse

# compare the second rows of the two dataframes
check2 = df_gui.iloc[1] == results.iloc[1]
check2 = pd.DataFrame(check2).T
whichfalse2 = check2[check2 == False].stack()[0].index
whichfalse2

whichfalse = whichfalse.append(whichfalse2)
whichfalse

Index(['D151', 'D243', 'D315', 'D415', 'D433', 'D438', 'D465', 'D151', 'D197',
       'D210', 'D252', 'D253', 'D318', 'D323', 'D333', 'D440'],
      dtype='object')

In [197]:
df_gui

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.807,2.717,0.0,-2.735


In [198]:
results

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.807,2.717,0.0,-2.735


In [199]:
df_gui[whichfalse]

Unnamed: 0,D151,D243,D315,D415,D433,D438,D465,D151.1,D197,D210,D252,D253,D318,D323,D333,D440
0,14.413,5.141,45.527,12.793,39.889,6.175,0.697,14.413,45.93,0.139,2.0,0.629,60.719,0.0,0.0,24.998
0,13.351,5.833,51.778,15.107,51.246,0.875,0.578,13.351,29.573,0.117,8.0,2.39,99.751,27.421,27.421,19.579


In [200]:
results[whichfalse]

Unnamed: 0,D151,D243,D315,D415,D433,D438,D465,D151.1,D197,D210,D252,D253,D318,D323,D333,D440
0,14.414,5.14,45.526,12.794,39.89,6.176,0.698,14.414,45.93,0.139,2.0,0.629,60.719,0.0,0.0,24.998
1,13.352,5.833,51.778,15.107,51.246,0.875,0.578,13.352,29.572,0.118,7.0,2.091,99.75,26.56,26.56,19.578


The wrapper does the calculation up to 6 decimal places whilst the gui does it up to 3. Rounding the values to 3 decimal places in the wrapper gives very similar results to the gui.  
The difference is basically negigible as seen above.  
The two compounds used were acetaminophin and vitamin C

## Backend

The backend logic that will be used for the web app is described below.

1. User inputs a SMILES string
2. Molecular Descriptors are generated using Mold2
3. The descriptors are stored in a database
4. The mask is used to reduce the number of descriptors
5. The new descriptors are passed to the model
6. The model returns a prediction, the activity and the confidence
7. The prediction is stored in the database
8. The prediction is returned to the user

## Frontend

The frontend logic that will be used for the web app is described below.

1. User inputs a SMILES string
2. The SMILES string is passed to the backend
3. The prediction is returned to the user