In [113]:
# import packages
import pickle 
import numpy as np
import pandas as pd

from Mold2_pywrapper import Mold2
from rdkit import Chem

In [114]:
# check that the installed xgboost version is 1.6.2 so that the pickled model can be loaded
import warnings

import xgboost as xgb

EXPECTED_XGB_VERSION = '1.6.2'

print(xgb.__version__)

# Make a version mismatch visible instead of relying on someone reading the
# printed value: the model is restored with pickle further down.
if xgb.__version__ != EXPECTED_XGB_VERSION:
    warnings.warn(
        f"xgboost {xgb.__version__} is installed but {EXPECTED_XGB_VERSION} is expected; "
        "loading the pickled model may fail"
    )

1.6.2


In [118]:
def load_model(model):
    """Read the pickled model stored at the path ``model`` and return it.

    Note: unpickling can execute arbitrary code, so only pass paths to
    trusted files.
    """
    with open(model, 'rb') as pickled_model:
        return pickle.load(pickled_model)

In [123]:
def load_mask(mask):
    """Read the pickled variance threshold mask stored at the path ``mask`` and return it.

    Note: unpickling can execute arbitrary code, so only pass paths to
    trusted files.
    """
    with open(mask, 'rb') as pickled_mask:
        return pickle.load(pickled_mask)

In [124]:
# locations of the pickled classifier and of the variance threshold mask
classifier_path = 'pkl/classifier1'
mask_path = 'pkl/mask1'

# restore both objects (classifier first, then the mask)
model = load_model(classifier_path)
mask = load_mask(mask_path)

In [125]:
# read the sample descriptors and report their shape before masking
sample_raw = pd.read_csv("sample.csv")
print(sample_raw.shape)

# keep only the columns selected by the mask and report the reduced shape
df = sample_raw.loc[:, mask]
print(df.shape)

(2, 777)
(2, 645)


In [126]:
# run the loaded model on the masked sample descriptors and show the predictions
pred = model.predict(df)
print(pred)

[1 1]


In [127]:
smiles_list = ['CC(=O)NC1=CC=C(C=C1)O']

# RDKit returns None (rather than raising) for a SMILES string it cannot
# parse, so validate here instead of handing None on to Mold2.
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
invalid_smiles = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
if invalid_smiles:
    raise ValueError(f"Could not parse SMILES: {invalid_smiles}")

mold2 = Mold2()
# returns a dataframe (displayed because it is the last expression of the cell)
mold2.calculate(mols, show_banner=False)

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,3.90689,-0.108058,0.545455,1.05973


Instantiating a Mold2 object ensures that the executables for your platform are accessible. If this is not the case, an attempt to download them from the website of the FDA is made.

Should one have downloaded the original ZIP file available from the website of the FDA, the executables can be installed using the following:

In [128]:
# path_to_zipfile = '...'  # Replace by the path to the ZIP file on your machine
# mold2 = Mold2.from_executable(path_to_zipfile)
# print(mold2.calculate(mols))

The executables are then installed for future use. From then on, the default instantiation can be used:

In [129]:
# default instantiation, then compute the descriptors for `mols` and keep them in `results`
mold2 = Mold2()
results = mold2.calculate(mols, show_banner=False)

In [130]:
# cast every descriptor to float, then round to 3 decimal places
results = results.astype(float).round(3)
results

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06


In [131]:
# True if at least one descriptor value in `results` is missing
results.isna().to_numpy().any()

False

Deployment plan: when setting this up on a server, the installation code above only needs to be run once, and the executables can then stay on the server. Any later run that generates descriptors will use the executables that are already installed.

In [132]:
# write the wrapper descriptors to CSV without the index column
results.to_csv('desc/wrapper_output.csv', index=False)

In [133]:
# Load the descriptors exported by the Mold2 GUI so they can be compared with
# the mold2_pywrapper results: drop the two ID columns, cast to float and
# round to 3 decimal places.
df_gui = (
    pd.read_csv("desc/gui_output.txt", sep='\t')
    .drop(columns=['ReadIn_ID', 'USER_ID'])
    .astype(float)
    .round(3)
)
df_gui

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.907,-0.108,0.545,1.06


In [134]:
# True if at least one descriptor value in `df_gui` is missing
df_gui.isna().to_numpy().any()

False

In [135]:
# spot check: descriptor D775 as produced by the mold2 gui
df_gui.loc[:, 'D775']

0   -0.108
Name: D775, dtype: float64

In [136]:
# spot check: descriptor D775 as produced by mold2_pywrapper
results.loc[:, 'D775']

0   -0.108
Name: D775, dtype: float64

In [137]:
# compare results from mold2_pywrapper and mold2 gui
# (whole-frame check: a single differing value is enough to make this False)
df_gui.equals(results)

False

In [138]:
# element-wise comparison of the GUI descriptors with the wrapper descriptors
check = df_gui == results

# Names of the columns in which at least one row differs.
# Reducing column-wise covers every row, yields an empty Index when nothing
# differs, and does not depend on stack() dropping NaN values.
# NOTE(review): exact equality on values rounded to 3 decimals also flags
# differences of a single unit in the last decimal; consider comparing the
# unrounded values with a tolerance (e.g. np.isclose) instead.
whichfalse = check.columns[~check.all(axis=0)]

In [139]:
# mold2 gui values of the columns that differ
df_gui.loc[:, whichfalse]

Unnamed: 0,D151,D243,D315,D415,D433,D438,D465
0,14.413,5.141,45.527,12.793,39.889,6.175,0.697


In [140]:
# mold2_pywrapper values of the columns that differ
results.loc[:, whichfalse]

Unnamed: 0,D151,D243,D315,D415,D433,D438,D465
0,14.414,5.14,45.526,12.794,39.89,6.176,0.698
