In [2]:
import pandas as pd
import numpy as np
from smileFunctionalGroup import functional_group_exists
import rdkit
from rdkit import Chem

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
#Read the data
df_all = pd.read_json('data.json')

In [4]:
df = df_all.dropna() #drop items we could not get smiles codes for

In [5]:
#add the carboxcylic acid label to each spectra
#1 = contains a carboxcylic acid 0 = does not contain a carboxcylic acid
df['carboxcylicAcid'] = [functional_group_exists(x, '[CX3](=O)[OX1H0-,OX2H1]') for x in df.smiles]

In [6]:
#build a dataframe with the wavenumbers as the columns and abs values as the elements
absorbances = df.y_n.to_list()
wavenumbers = df.x_[0]
df2 = pd.DataFrame(data=absorbances,columns=wavenumbers)
#add the classifier
df2['carboxcylicAcid']=df.carboxcylicAcid

In [7]:
df2.head()

Unnamed: 0,555.0,558.7440273038,562.4880546075,566.2320819113,569.976109215,573.7201365188,577.4641638225,581.2081911263,584.95221843,588.6962457338,...,3816.04778157,3819.7918088737,3823.5358361775,3827.2798634812,3831.023890785,3834.7679180887,3838.5119453925,3842.2559726962,3846.0,carboxcylicAcid
0,0.001257,0.000767,2.6e-05,0.00024,0.00073,0.001513,0.003204,0.004885,0.006861,0.008144,...,0.001413,0.001268,0.001112,0.001182,0.001547,0.001781,0.001767,0.001656,0.001362,0
1,0.019457,0.002503,0.006072,0.003486,0.003016,0.002266,0.001948,0.002633,0.003775,0.002967,...,0.006446,0.006916,0.006142,0.005523,0.005652,0.006025,0.005974,0.005624,0.005624,0
2,0.011879,0.00459,0.002293,0.002298,0.0038,0.004275,0.00405,0.002804,0.001988,0.003156,...,0.004166,0.004456,0.004973,0.005682,0.005357,0.004554,0.004387,0.004918,0.005012,0
3,0.002493,0.010488,0.017283,0.021403,0.022252,0.022538,0.021164,0.007701,0.002945,0.013754,...,0.001392,0.002042,0.001595,0.002055,0.002125,0.003149,0.002391,0.002164,0.00141,0
4,0.050332,0.042429,0.038806,0.029132,0.019194,0.027235,0.023681,0.019342,0.021757,0.023747,...,0.002149,0.004904,0.005664,0.004724,0.003624,0.002766,0.002386,0.000901,0.000901,0


### Testing a SVM methond on a single label
Below we are testing the SVM model on predicting the carboxcylic acid functional group

In [8]:
# Split the data into training and testing sets
X = df2.drop('carboxcylicAcid',axis=1)  # Features
y = df2['carboxcylicAcid']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the classification model
classifier = SVC(kernel='linear', random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set and evaluate the model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9836639439906651


### Adding more Labels

In [9]:
#define a dictionary of functional groups to add
functional_groups = {
    'alcohol': '[OH]',
    'alkane': '[CX4]',
    'alkene': '[CX3]=[CX3]',
    'alkyne': '[CX2]#[CX2]',
    'ether': '[OD2]([#6])[#6]',
    'ketone': '[CX3](=[OX1])[!#0]',
    'carbonyl': '[CX3](=[OX1])[#6]',
    'nitrile': '[NX1]#[CX2]',
    'amine': '[NX3;!$(NC=O)]',
    'aromatic': '[c]',
    'ester': '[CX3](=[OX1])[OX2H0][#6;!$(C=O)]',
    'aldehyde': '[CX3H1](=O)[#6]',
    'thiol': '[SH]'
}
#get a list of keys
kys = functional_groups.keys()


In [10]:
for key in kys:
    df[key] = [functional_group_exists(x, functional_groups[key]) for x in df.smiles]

In [11]:
df.head()

Unnamed: 0,jcamp,x,y,x_,y_,y_n,smiles,carboxcylicAcid,alcohol,alkane,...,alkyne,ether,ketone,carbonyl,nitrile,amine,aromatic,ester,aldehyde,thiol
0,"{'title': '4-Chlorophenyl phenyl ether', 'jcam...","[550.0, 554.0, 558.0, 562.0, 566.0, 570.0, 574...","[6.08e-05, 9.879999999999999e-05, 6.84e-05, 0....","[555.0, 558.7440273038, 562.4880546075, 566.23...","[9.12e-05, 5.56771e-05, 1.8546e-06, 1.74048e-0...","[0.0012570538, 0.0007674249, 2.556300000000000...",C1=CC=C(C=C1)OC2=CC=C(C=C2)Cl,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,"{'title': '12-Chloro-5-dodecyne', 'jcamp-dx': ...","[550.0, 554.0, 558.0, 562.0, 566.0, 570.0, 574...","[0.0027632, 0.0011132, 7.04e-05, 0.0002816, 0....","[555.0, 558.7440273038, 562.4880546075, 566.23...","[0.0008525, 0.0001096846, 0.0002660311, 0.0001...","[0.0194569516, 0.0025033769, 0.006071734200000...",CCCCC#CCCCCCCCl,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,"{'title': '4,7-Dimethyl-4-octanol', 'jcamp-dx'...","[550.0, 554.0, 558.0, 562.0, 566.0, 570.0, 574...","[0.0005828, 0.0004371, 0.0001581, 7.13e-05, 6....","[555.0, 558.7440273038, 562.4880546075, 566.23...","[0.00036735, 0.0001419546, 7.092180000000001e-...","[0.011878962600000001, 0.0045903729, 0.0022933...",CCCC(C)(CCC(C)C)O,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,"{'title': 'Carbonic acid, dimethyl ester', 'jc...","[450.0, 454.0, 458.0, 462.0, 466.0, 470.0, 474...","[0.026084576, 0.008485344, 0.01547789600000000...","[555.0, 558.7440273038, 562.4880546075, 566.23...","[0.001944558, 0.008181798, 0.01348326100000000...","[0.0024925806, 0.010487622700000001, 0.0172831...",COC(=O)OC,0,0,1,...,0,1,1,0,0,0,0,1,0,0
4,"{'title': 'Hexane, 2-bromo-', 'jcamp-dx': 4.24...","[450.0, 454.0, 458.0, 462.0, 466.0, 470.0, 474...","[0.008516064, 0.007822896000000001, 0.00717924...","[555.0, 558.7440273038, 562.4880546075, 566.23...","[0.02487978, 0.0209735705, 0.0191825626, 0.014...","[0.0503316171, 0.0424293831, 0.038806187000000...",CCCCC(C)Br,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
for key in kys:    
    print(df[key].value_counts())

0    6646
1    1924
Name: alcohol, dtype: int64
1    6815
0    1755
Name: alkane, dtype: int64
0    7488
1    1082
Name: alkene, dtype: int64
0    8359
1     211
Name: alkyne, dtype: int64
0    6576
1    1994
Name: ether, dtype: int64
0    5656
1    2914
Name: ketone, dtype: int64
0    5779
1    2791
Name: carbonyl, dtype: int64
0    8231
1     339
Name: nitrile, dtype: int64
0    7192
1    1378
Name: amine, dtype: int64
1    4647
0    3923
Name: aromatic, dtype: int64
0    7627
1     943
Name: ester, dtype: int64
0    8369
1     201
Name: aldehyde, dtype: int64
0    8458
1     112
Name: thiol, dtype: int64


In [13]:
k = list(kys)

In [14]:
df[k]

Unnamed: 0,alcohol,alkane,alkene,alkyne,ether,ketone,carbonyl,nitrile,amine,aromatic,ester,aldehyde,thiol
0,0,0,0,0,1,0,0,0,0,1,0,0,0
1,0,1,0,1,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,1,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8565,0,1,0,0,0,0,0,0,0,1,0,0,0
8566,0,1,0,0,0,0,0,0,0,0,0,0,0
8567,0,1,0,0,0,0,0,0,0,0,0,0,0
8568,0,1,0,0,0,1,1,1,0,1,0,0,0


In [15]:
#to hold all of the models
models = {}
# gather x and y data
X = df2.drop('carboxcylicAcid',axis=1)  # Features
ys = {}
for key in k:
    y = df[key] # Target variable
    ys[key] = y

yt = {}
for key in k:
    #split the data
    X_train, X_test, y_train, y_test = train_test_split(X, ys[key], test_size=0.2, random_state=42)
    # Build the classification model
    classifier = SVC(kernel='linear', random_state=42)
    classifier.fit(X_train, y_train)
    #store model
    models[key]=classifier
    #store testing data 
    yt[key] = y_test



In [16]:
#test and report the accuracy of the models
for key in k:
    # Make predictions on the test set and evaluate the model
    y_pred = models[key].predict(X_test)
    accuracy = accuracy_score(yt[key], y_pred)
    print("Accuracy for ",key,':', accuracy)

Accuracy for  alcohol : 0.9649941656942824
Accuracy for  alkane : 0.8570595099183197
Accuracy for  alkene : 0.9072345390898483
Accuracy for  alkyne : 0.985414235705951
Accuracy for  ether : 0.9212368728121354
Accuracy for  ketone : 0.9504084014002334
Accuracy for  carbonyl : 0.9364060676779463
Accuracy for  nitrile : 0.9579929988331388
Accuracy for  amine : 0.9364060676779463
Accuracy for  aromatic : 0.9095682613768962
Accuracy for  ester : 0.956242707117853
Accuracy for  aldehyde : 0.9906651108518086
Accuracy for  thiol : 0.9906651108518086
