## Machine Learning Models

The following libraries must be installed before running this notebook:
- PyTorch
- DeepChem
- transformers (install it only if importing DeepChem's SMILES tokenizer raises an error about it)

In [None]:
# Optional one-time setup: uncomment and run these lines only if the libraries
# listed above are missing from the environment this kernel runs in.
# NOTE(review): consider `%pip install ...` instead of `!pip install ...` so the
# install targets the running kernel's environment, and pin versions if possible.
# !pip install torch
# !pip install deepchem
# !pip install transformers

In [25]:
import pandas as pd
import numpy as np
from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer

The data is read from a CSV export of the `overalldf` DataFrame that was produced in the milestones notebook.

In [29]:
# Rebuild the overall DataFrame from its CSV export.
# The path is relative to the notebook's working directory.
overalldf = pd.read_csv(filepath_or_buffer="final-data-csv.csv")

# Display the loaded frame as the cell output.
overalldf

Unnamed: 0,CID,SMILES,explosive,flammable,oxidizer,pressurized,corrosive,toxic,irritant,health hazard,environmental hazard
0,4,CC(CN)O,0,0,0,0,1,0,1,0,0
1,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,0,0,0,0,1,1,1,1,1
2,8,CCC(C)(C(C(=O)O)O)O,0,0,0,0,0,0,1,0,0
3,11,C(CCl)Cl,0,1,0,0,0,1,1,1,1
4,12,C1=C(C=C(C(=C1O)O)O)O,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
223255,171395742,CC1(C2CCC1(C(=O)C2=C(C(F)(F)F)[O-])C)C.CC1(C2C...,0,0,0,0,0,0,1,0,0
223256,171395937,CCCN1CC(CC2C1CC3=CNC4=CC=CC2=C34)CS(=O)(=O)C,0,0,0,0,0,0,1,0,0
223257,171395972,C1=NC(=NN1C2C(C(C(O2)CO)O)O)C(=O)O,0,0,0,0,0,1,0,0,0
223258,171396137,CC(C)[O-].CC(C)[O-].CC(C)[O-].[Fe],0,1,0,0,0,0,1,0,0


In [30]:
# Create the SMILES tokenizer and split every SMILES string into its tokens
# (atoms, bonds, ring-closure digits, brackets, ...).
tokenizer = BasicSmilesTokenizer()
tokenized_list = [tokenizer.tokenize(smile) for smile in overalldf["SMILES"]]

# Add the tokens as a new column at position 2, i.e. directly after "SMILES".
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail whenever it was run a second time. Overwriting the existing
# column instead keeps the cell idempotent and leaves the column order intact
# (the label columns must keep starting at position 3 for later cells).
if "Tokenized_SMILES" in overalldf.columns:
    overalldf["Tokenized_SMILES"] = tokenized_list
else:
    overalldf.insert(2, "Tokenized_SMILES", tokenized_list, False)
overalldf

Unnamed: 0,CID,SMILES,Tokenized_SMILES,explosive,flammable,oxidizer,pressurized,corrosive,toxic,irritant,health hazard,environmental hazard
0,4,CC(CN)O,"[C, C, (, C, N, ), O]",0,0,0,0,1,0,1,0,0
1,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,"[C, 1, =, C, C, (, =, C, (, C, =, C, 1, [N+], ...",0,0,0,0,1,1,1,1,1
2,8,CCC(C)(C(C(=O)O)O)O,"[C, C, C, (, C, ), (, C, (, C, (, =, O, ), O, ...",0,0,0,0,0,0,1,0,0
3,11,C(CCl)Cl,"[C, (, C, Cl, ), Cl]",0,1,0,0,0,1,1,1,1
4,12,C1=C(C=C(C(=C1O)O)O)O,"[C, 1, =, C, (, C, =, C, (, C, (, =, C, 1, O, ...",0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
223255,171395742,CC1(C2CCC1(C(=O)C2=C(C(F)(F)F)[O-])C)C.CC1(C2C...,"[C, C, 1, (, C, 2, C, C, C, 1, (, C, (, =, O, ...",0,0,0,0,0,0,1,0,0
223256,171395937,CCCN1CC(CC2C1CC3=CNC4=CC=CC2=C34)CS(=O)(=O)C,"[C, C, C, N, 1, C, C, (, C, C, 2, C, 1, C, C, ...",0,0,0,0,0,0,1,0,0
223257,171395972,C1=NC(=NN1C2C(C(C(O2)CO)O)O)C(=O)O,"[C, 1, =, N, C, (, =, N, N, 1, C, 2, C, (, C, ...",0,0,0,0,0,1,0,0,0
223258,171396137,CC(C)[O-].CC(C)[O-].CC(C)[O-].[Fe],"[C, C, (, C, ), [O-], ., C, C, (, C, ), [O-], ...",0,1,0,0,0,0,1,0,0


In [31]:
# Turn the token lists into TF-IDF features and build the label matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

# Labels: every column from position 3 onwards, as a 2-D NumPy array
# (one row per compound, one column per hazard class).
y = overalldf.iloc[:, 3:].to_numpy()

# The SMILES strings are already tokenized, so the analyzer simply hands each
# token list through unchanged instead of re-splitting text.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)

# Features: sparse TF-IDF matrix fitted on the tokenized SMILES column.
X = vectorizer.fit_transform(overalldf["Tokenized_SMILES"])

In [32]:
# Split training and test data
from sklearn.model_selection import train_test_split

# A fixed random_state makes the 80/20 split (and therefore every score printed
# below) reproducible across kernel restarts; without it each run reports
# different numbers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform multilabel classification with Logistic Regression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# MultiOutputClassifier fits one independent LogisticRegression per label column.
model = MultiOutputClassifier(LogisticRegression(max_iter= 300000)).fit(X_train, y_train)
yhat_test = model.predict(X_test)
yhat_train = model.predict(X_train)
print(yhat_test)

# For multilabel targets accuracy_score is the subset accuracy: a sample counts
# as correct only if every one of its labels is predicted correctly.
print(f"Test Accuracy Score: {accuracy_score(y_test, yhat_test)}")
print(f"Train Accuracy Score: {accuracy_score(y_train, yhat_train)}")

# zero_division=0 reports 0 for metrics that are undefined (e.g. samples with no
# true or no predicted labels), exactly as before, but without emitting the
# UndefinedMetricWarning noise that cluttered the cell output.
print(classification_report(y_test, yhat_test, zero_division=0))


[[0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]
Test Accuracy Score: 0.6025038072202813
Train Accuracy Score: 0.5985678133118337
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        36
           1       0.48      0.11      0.18      2633
           2       1.00      0.27      0.43        85
           3       0.67      0.03      0.05        72
           4       0.45      0.04      0.07      7015
           5       0.66      0.05      0.08      3116
           6       0.85      0.99      0.92     37787
           7       0.75      0.06      0.11      3246
           8       0.65      0.04      0.08      3611

   micro avg       0.84      0.67      0.74     57601
   macro avg       0.63      0.18      0.22     57601
weighted avg       0.76      0.67      0.63     57601
 samples avg       0.84      0.72      0.76     57601



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
