# LogisticReg

In [1]:
import pandas as pd
import numpy as np

#import matplotlib as plt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from rdkit import Chem

# FEATURES
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem # For Morgan Fingerprint (Circular Fingerprints)
from rdkit.Chem import MACCSkeys # For MACCS keys

# SCALING DATA
from sklearn.preprocessing import scale

# For splitting data into training and test sets.
from sklearn.model_selection import train_test_split

# For processing how well our methods have classified our data
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score,roc_auc_score

# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Logistic Regression ML Model
from sklearn.linear_model import LogisticRegression

In [2]:
####################################################################
###
###    User Defined Helper Functions
###
####################################################################

# FUNCTION LOAD_DATA
# Load the data from the csv file in the [data] folder. We separate 
# the molecule data from the "explosive" labes and return the labels
# an a molecules from SMILES representation for each molecule in the 
# dataset
def load_data(data_file):
    mol_df = pd.read_csv(data_file)

    smiles = np.array(mol_df['smiles'])
    labels = np.array(mol_df['labels']) 
    
    mols = np.array([Chem.MolFromSmiles(smile) for smile in smiles])
    
    return mols, labels

# FUNCTION GEN_FPRINTS
# Generate molecule fingerprints for each molecule in an input list
def gen_fprints(mols):
    f_prints = np.array([Chem.RDKFingerprint(mol) for mol in mols])  
    return f_prints

# FUNCTION gen_MACCS 
# Generate the MACCS keys for each molecule in an input list of molecules
def gen_MACCS(mols):
    MACCS_keys = np.array([MACCSkeys.GenMACCSKeys(mol) for mol in mols])
    return MACCS_keys

# FUNCTION gen_morgan_prints
# Generates Morgan finger prints for each molecule in an input list of molecules
def gen_morgan_prints(mols,radius):
    morgan_prints = np.array([AllChem.GetMorganFingerprintAsBitVect(mol,radius,nBits=1024) for mol in mols])
    return morgan_prints

In [4]:
data_file = 'molecule_data.csv'
mols, labels = load_data(data_file)

In [5]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
morgan_features = gen_morgan_prints(mols,radius=18)

In [6]:
# Split data for [training] and [testing]
morgan_train, morgan_test, label_train, label_test = train_test_split(morgan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [7]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_morgan_proj = PCA(n_components=4)
morgan_proj_train = PC_morgan_proj.fit_transform(morgan_train)
morgan_proj_test = PC_morgan_proj.transform(morgan_test)
morgan_proj_features = PC_morgan_proj.fit_transform(morgan_features)

In [8]:
#########################################################################
#
# Logistic Regression with PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features

LRmodelPC = LogisticRegression()
LRmodelPC.fit(morgan_proj_train, label_train)
pred = LRmodelPC.predict(morgan_proj_test)

In [8]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1176    4]
 [   8   69]]

Recall
0.8961038961038961

Accuracy
0.9904534606205251

F1 Score
0.9199999999999999

ROC AUC Score
0.9463570327977108


In [None]:
Model1_df = CrossValidation(LRmodelPC,morgan_proj_features,labels)
Model1_df

In [9]:
#########################################################################
#
# Logistic Regression [WITHOUT] PCA
#
#########################################################################

# Train Logistic Regression model using PC transformed Morgan features
LRmodel = LogisticRegression()
LRmodel.fit(morgan_train, label_train)
pred = LRmodel.predict(morgan_test)

In [10]:
# Model Generalizability Analysis
accuracy = accuracy_score(label_test, pred)
conf_matrix = confusion_matrix(label_test, pred)
F1Score = f1_score(label_test,pred)
roc_auc = roc_auc_score(label_test,pred)
recall = recall_score(label_test,pred)

print('Confusion Matrix')
print(conf_matrix)

print('\nRecall')
print(recall)  

print('\nAccuracy')
print(accuracy)  

print('\nF1 Score')
print(F1Score)

print('\nROC AUC Score')
print(roc_auc)

Confusion Matrix
[[1173    7]
 [   7   70]]

Recall
0.9090909090909091

Accuracy
0.9888623707239459

F1 Score
0.9090909090909091

ROC AUC Score
0.9515793528505393


In [None]:
Model2_df = CrossValidation(LRmodel,morgan_features,labels)
Model2_df

In [None]:
result = pd.concat([Model1_df,Model2_df], axis=1, join='inner')
final = result.transpose()
final.to_csv("LogisticReg_Result.csv")
final