In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
dataset = pd.read_csv('data/schneider50k.tsv', sep='\t', index_col=0)

# drop columns 'original_rxn' and 'source'
dataset = dataset.drop(columns=['original_rxn', 'source'])

In [3]:
dataset.describe()

Unnamed: 0,rxn_class,rxn,split
count,50000,50000,50000
unique,50,43491,2
top,6.1.5,CCCCCC.COc1cc(C(=O)O)ccc1OCCCCl.ClCCl.O=S(Cl)C...,test
freq,1000,19,40000


In [4]:
# extract only 'test' from the dataset
train_dataset = dataset[dataset['split'] == 'train'].drop(columns=['split'])
test_dataset = dataset[dataset['split'] == 'test'].drop(columns=['split'])

In [10]:
rxn_class_set = set(dataset['rxn_class'])
# create a dictionary to map rxn_class to an integer
rxn_class_dict = {rxn_class: i for i, rxn_class in enumerate(rxn_class_set)}

# add a new column 'reaction_type' to the dataset and assign the integer value
train_dataset['reaction_type'] = train_dataset['rxn_class'].map(rxn_class_dict)
test_dataset['reaction_type'] = test_dataset['rxn_class'].map(rxn_class_dict)

# split the rxn into input and output
train_dataset['input'] = train_dataset['rxn'].apply(lambda x: x.split('>>')[0])
train_dataset['output'] = train_dataset['rxn'].apply(lambda x: x.split('>>')[1])

test_dataset['input'] = test_dataset['rxn'].apply(lambda x: x.split('>>')[0])
test_dataset['output'] = test_dataset['rxn'].apply(lambda x: x.split('>>')[1])


In [21]:
from variables import *

In [27]:
mapping = {}
mpg = {}

In [43]:
for i in rxn_class_dict:
    mapping[rxn_class_dict[i]] = i
    mpg[rxn_class_dict[i]] = REACTION_CLASSES[i].lower()

In [54]:
mapped11 = pd.read_csv('mapped.csv')

In [60]:
map_dict_store = mapped11[['Reaction_encoding', 'Scalability']].set_index('Reaction_encoding').T.to_dict('list')

In [62]:
for i in map_dict_store:
    map_dict_store[i] = map_dict_store[i][0]
map_dict_store

{0: 8,
 1: 8,
 2: 9,
 3: 2,
 4: 8,
 5: 9,
 6: 3,
 7: 9,
 8: 10,
 9: 10,
 10: 7,
 11: 8,
 12: 0,
 13: 2,
 14: 9,
 15: 9,
 16: 10,
 17: 7,
 18: 7,
 19: 9,
 20: 1,
 21: 8,
 22: 8,
 23: 8,
 24: 9,
 25: 7,
 26: 9,
 27: 8,
 28: 9,
 29: 7,
 30: 9,
 31: 8,
 32: 9,
 33: 8,
 34: 9,
 35: 9,
 36: 5,
 37: 10,
 38: 10,
 39: 8,
 40: 9,
 41: 2,
 42: 10,
 43: 8,
 44: 8,
 45: 9,
 46: 7,
 47: 8,
 48: 9,
 49: 10}

In [45]:
data_provided = pd.read_csv('scalability.csv')
# convert 'reaction' to lower case
data_provided['Reaction'] = data_provided['Reaction'].str.lower()

In [46]:
dataaa = data_provided.set_index('Reaction').T.to_dict('list')

In [47]:
dataaa

{'acetal hydrolysis to aldehyde': [9],
 'acetal hydrolysis to diol': [9],
 'acetylation': [10],
 'acyl chloride esterification': [8],
 'acyl chloride with ammonia to amide': [9],
 'acylation of nitrogen nucleophiles by carboxylic acids': [9],
 'addition of primary amines to aldehydes/thiocarbonyls': [9],
 'addition of secondary amines to aldehydes/thiocarbonyls': [8],
 'alcohol deprotection from silyl ethers': [9],
 'alcohol deprotection from silyl ethers (diol)': [9],
 'alcohol deprotection from silyl ethers (double)': [9],
 'alcohol protection with silyl ethers': [9],
 'alcohol to azide': [2],
 'alcohol to bromide with hbr': [8],
 'alcohol to chloride_ch2cl2': [1],
 'alcohol to chloride_chcl3': [1],
 'alcohol to chloride_hcl': [8],
 'alcohol to chloride_pocl3': [7],
 'alcohol to chloride_socl2': [7],
 'aldol condensation': [2],
 'alkene epoxidation': [4],
 'alkyl bromides from alcohols': [8],
 'alkyl chlorides from alcohols': [8],
 'alkylation of amines': [9],
 'alkyne bromination': 

In [48]:
df_data = []
for i in mpg:
    try:
        df_data.append([i, mpg[i], dataaa[mpg[i]]])
    except:
        print(i, mpg[i])

0 amide to amine reduction
1 carboxylic acid to alcohol reduction
3 alcohol to ketone oxidation
4 chlorination
5 o-bn deprotection
6 nitration
7 n-bn deprotection
8 carboxylic acid to acid chloride
9 bromo suzuki-type coupling
10 chloro n-arylation
11 ketone reductive amination
12 n-cbz deprotection
13 alcohol to aldehyde oxidation
14 isocyanate + amine reaction
15 amide schotten-baumann
16 n-acetylation
17 iodo n-alkylation
18 chloro n-alkylation
19 thioether synthesis
20 stille reaction
21 methoxy to hydroxy
22 co2h-tbu deprotection
23 hydroxy to chloro
24 hydroxy to methoxy
25 bromo n-arylation
26 co2h-me deprotection
27 eschweiler-clarke methylation
28 nitrile reduction
29 bromo n-alkylation
30 nitro to amino
31 co2h-et deprotection
32 ester schotten-baumann
33 bromination
34 n-boc deprotection
35 aldehyde reductive amination
36 mitsunobu aryl ether synthesis
37 methyl esterification
38 fischer-speier esterification
39 sonogashira coupling
40 sulfonamide schotten-baumann
41 wohl-zi

In [49]:
df_data

[[2, 'williamson ether synthesis', [9]]]

In [11]:
# find max length of rxn split
max_len = 0
for rxn in train_dataset['rxn']:
    max_len = max(len(rxn.split('>>')), max_len)
max_len

2

In [12]:
train_dataset

Unnamed: 0,rxn_class,rxn,reaction_type,input,output,input_fingerprint,output_fingerprint
3,2.2.3,CCS(=O)(=O)Cl.CN(C(=O)N(C)[C@@H]1CN(C(=O)C2CCN...,40,CCS(=O)(=O)Cl.CN(C(=O)N(C)[C@@H]1CN(C(=O)C2CCN...,CCS(=O)(=O)N1CCC(C(=O)N2C[C@@H](c3ccc(F)cc3)[C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,6.2.2,C1CCOC1.COC(=O)c1ccc(-c2c(C)cccc2C)c(C)c1.Cl.[...,26,C1CCOC1.COC(=O)c1ccc(-c2c(C)cccc2C)c(C)c1.Cl.[...,Cc1cc(C(=O)O)ccc1-c1c(C)cccc1C,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11,1.6.8,CI.CN(C)C=O.ClCCl.O.O=c1[nH]c2ccc(Br)cc2c(=O)o...,17,CI.CN(C)C=O.ClCCl.O.O=c1[nH]c2ccc(Br)cc2c(=O)o...,Cn1c(=O)oc(=O)c2cc(Br)ccc21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14,1.6.2,CC#N.CCOC(C)=O.CCOCC.N#CBr.Nc1ccc2c3c(cccc13)C...,29,CC#N.CCOC(C)=O.CCOCC.N#CBr.Nc1ccc2c3c(cccc13)CC2,N#CNc1ccc2c3c(cccc13)CC2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16,1.6.4,CCN(CC)CC.COc1ccccc1CCl.Cc1ccc(Nc2cc(N(C)C)nc(...,18,CCN(CC)CC.COc1ccccc1CCl.Cc1ccc(Nc2cc(N(C)C)nc(...,COc1ccccc1CN1CCN(c2nc(Nc3ccc(C)cc3)cc(N(C)C)n2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
49972,7.3.1,CC(=O)O.CCO.CO[C@@H]1CN(C(=O)OC(C)(C)C)C[C@H]1...,28,CC(=O)O.CCO.CO[C@@H]1CN(C(=O)OC(C)(C)C)C[C@H]1...,CC(=O)O.CO[C@@H]1CN(C(=O)OC(C)(C)C)C[C@H]1CN,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
49984,7.3.1,CC(C)OP(=O)(CCC#N)C(C)(C)C.CCO.N>>CC(C)OP(=O)(...,28,CC(C)OP(=O)(CCC#N)C(C)(C)C.CCO.N,CC(C)OP(=O)(CCCN)C(C)(C)C,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
49986,7.3.1,CC(=O)O.N#CCc1ncc(C(F)(F)F)cc1Cl.[Pd]>>CC(=O)O...,28,CC(=O)O.N#CCc1ncc(C(F)(F)F)cc1Cl.[Pd],CC(=O)O.NCCc1ncc(C(F)(F)F)cc1Cl,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
49994,7.3.1,CCCCOc1ccc(C#N)cc1.CCOCC.O.[Al+3].[H-].[H-].[H...,28,CCCCOc1ccc(C#N)cc1.CCOCC.O.[Al+3].[H-].[H-].[H...,CCCCOc1ccc(CN)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [13]:
def compute_fingerprint(smiles, radius=2, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol,
                                                        radius,
                                                        nBits=nBits)
    return list(fingerprint)

In [None]:
train_dataset['input_fingerprint'] = train_dataset['input'].apply(
    compute_fingerprint)
train_dataset['output_fingerprint'] = train_dataset['output'].apply(
    compute_fingerprint)
train_dataset = train_dataset.dropna(
    subset=['input_fingerprint', 'output_fingerprint'])
X_train = train_dataset.apply(
    lambda row: row['input_fingerprint'] + row['output_fingerprint'], axis=1)
X_train = list(X_train)
y_train = train_dataset['reaction_type']
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

test_dataset['input_fingerprint'] = test_dataset['input'].apply(
    compute_fingerprint)
test_dataset['output_fingerprint'] = test_dataset['output'].apply(
    compute_fingerprint)
X_test = test_dataset.apply(
    lambda row: row['input_fingerprint'] + row['output_fingerprint'], axis=1)
X_test = list(X_test)
y_test = test_dataset['reaction_type']
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)

In [15]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [17]:
# Save model
import joblib

joblib.dump(clf, 'rfc.pkl')

['rfc.pkl']

In [16]:
y_pred = clf.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", acc_score)
print("Classification Report:\n", report)

Accuracy: 0.9106
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88       800
           1       0.90      0.98      0.94       800
           2       0.82      0.69      0.75       800
           3       0.90      0.89      0.89       800
           4       0.94      0.81      0.87       800
           5       0.94      0.97      0.95       800
           6       0.95      0.99      0.97       800
           7       0.97      0.97      0.97       800
           8       0.88      0.99      0.93       800
           9       0.88      0.98      0.93       800
          10       0.82      0.81      0.81       800
          11       0.87      0.89      0.88       800
          12       0.88      0.99      0.94       800
          13       0.95      0.99      0.97       800
          14       0.97      1.00      0.99       800
          15       0.78      0.94      0.86       800
          16       0.94      0.96      0

In [None]:
print(report)

In [1]:
import lxml.etree as ET
import pandas as pd

# Path to the DrugBank XML file
xml_file = '/Users/shreyasv/Downloads/drugbank.xml'

# Parse the XML file
tree = ET.parse(xml_file)
root = tree.getroot()

# Namespace dictionary to handle the XML namespaces
ns = {
    'db': 'http://www.drugbank.ca',
}

# Extract SMILES strings
smiles_list = []
for drug in root.findall('db:drug', ns):
    smiles = drug.find(
        'db:calculated-properties/db:property[db:kind="SMILES"]/db:value', ns)
    if smiles is not None:
        smiles_list.append(smiles.text)

# Create a DataFrame from the list of SMILES strings
smiles_df = pd.DataFrame(smiles_list, columns=['SMILES'])

# Export the DataFrame to a CSV file
csv_file = '/Users/shreyasv/Downloads/drugbank_smiles_strings.csv'
smiles_df.to_csv(csv_file, index=False)

print(f"CSV file with SMILES strings has been saved to {csv_file}")


CSV file with SMILES strings has been saved to /Users/shreyasv/Downloads/drugbank_smiles_strings.csv


In [2]:
smiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11925 entries, 0 to 11924
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  11925 non-null  object
dtypes: object(1)
memory usage: 93.3+ KB
