# Gradient Boost Classifier

 - Goal: To Apply Gradient Boost Classifier to several models to test Recall, Accuracy, F1 score and ROC AUC Score, using diffrent features inputs 

## Import Statemenets

In [48]:
import pandas as pd
import numpy as np

#import matplotlib as plt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from rdkit import Chem

# FEATURES
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem # For Morgan Fingerprint (Circular Fingerprints)
from rdkit.Chem import MACCSkeys # For MACCS keys

# SCALING DATA
from sklearn.preprocessing import scale

# For splitting data into training and test sets.
from sklearn.model_selection import train_test_split

# For processing how well our methods have classified our data
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score,roc_auc_score

# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Logistic Regression ML Model
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier

## User Defined Helper Functions

In [49]:
####################################################################
###
###    User Defined Helper Functions
###
####################################################################

# FUNCTION LOAD_DATA
# Load the data from the csv file in the [data] folder. We separate 
# the molecule data from the "explosive" labes and return the labels
# an a molecules from SMILES representation for each molecule in the 
# dataset
def load_data(data_file):
    mol_df = pd.read_csv(data_file)

    smiles = np.array(mol_df['smiles'])
    labels = np.array(mol_df['labels']) 
    
    mols = np.array([Chem.MolFromSmiles(smile) for smile in smiles])
    
    return mols, labels

# FUNCTION GEN_FPRINTS
# Generate molecule fingerprints for each molecule in an input list
def gen_fprints(mols):
    f_prints = np.array([Chem.RDKFingerprint(mol) for mol in mols])  
    return f_prints

# FUNCTION gen_MACCS 
# Generate the MACCS keys for each molecule in an input list of molecules
def gen_MACCS(mols):
    MACCS_keys = np.array([MACCSkeys.GenMACCSKeys(mol) for mol in mols])
    return MACCS_keys

# FUNCTION gen_morgan_prints
# Generates Morgan finger prints for each molecule in an input list of molecules
def gen_morgan_prints(mols,radius):
    morgan_prints = np.array([AllChem.GetMorganFingerprintAsBitVect(mol,radius,nBits=1024) for mol in mols])
    return morgan_prints

## Loading Data

In [50]:
data_file = 'molecule_data.csv'
mols, labels = load_data(data_file)

## Using Morgan Features

In [51]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
morgan_features = gen_morgan_prints(mols,radius=18)

In [52]:
# Split data for [training] and [testing]
morgan_train, morgan_test, label_train, label_test = train_test_split(morgan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [53]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_morgan_proj = PCA(n_components=4)
morgan_proj_train = PC_morgan_proj.fit_transform(morgan_train)
morgan_proj_test = PC_morgan_proj.transform(morgan_test)

### Gradient Boosting with PCA

In [54]:
#########################################################################
#
# Model with PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
GR = GradientBoostingClassifier(n_estimators = 100)
model = GR.fit(morgan_proj_train,label_train)
pred = model.predict(morgan_proj_test)


In [55]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1170    7]
 [   9   71]]

Recall
0.8875

Accuracy
0.9872712808273667

F1 Score
0.8987341772151898

ROC AUC Score
0.9407763381478335


### Gradient Boosting [WITHOUT] PCA

In [56]:
#########################################################################
#
# Gradient Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features

GR = GradientBoostingClassifier(n_estimators = 100)
model = GR.fit(morgan_train,label_train)
pred = model.predict(morgan_test)


In [57]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1169    8]
 [   6   74]]

Recall
0.925

Accuracy
0.9888623707239459

F1 Score
0.9135802469135802

ROC AUC Score
0.9591015293118097


## Using MACCS Features

In [58]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)


In [59]:
# Split data for [training] and [testing]
morgan_train, morgan_test, label_train, label_test = train_test_split(morgan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [60]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_maccs_proj = PCA(n_components=4)
maccs_proj_train = PC_maccs_proj.fit_transform(maccs_train)
maccs_proj_test = PC_maccs_proj.transform(maccs_test)

### Gradient Boosting with PCA

In [61]:
#########################################################################
#
# Gradient Boosting with PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
GR = GradientBoostingClassifier(n_estimators = 100)
model = GR.fit(maccs_proj_train,label_train)
pred = model.predict(maccs_proj_test)

In [62]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1160    7]
 [  89    1]]

Recall
0.011111111111111112

Accuracy
0.9236276849642004

F1 Score
0.020408163265306124

ROC AUC Score
0.5025564124535846


### Gradient Boost [WITHOUT] PCA

In [63]:
#########################################################################
#
# Gradient Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features

GR = GradientBoostingClassifier(n_estimators = 100)
model = GR.fit(maccs_train,label_train)
pred = model.predict(maccs_test)

In [64]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1164    3]
 [  90    0]]

Recall
0.0

Accuracy
0.9260143198090692

F1 Score
0.0

ROC AUC Score
0.4987146529562982
