# Ada Boost

 - Goal: To Apply Ada Boost to several models to test Recall, Accuracy, F1 score and ROC AUC Score, using diffrent features inputs 

## Import Statements

In [239]:
import pandas as pd
import numpy as np

#import matplotlib as plt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from rdkit import Chem

# FEATURES
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem # For Morgan Fingerprint (Circular Fingerprints)
from rdkit.Chem import MACCSkeys # For MACCS keys

# SCALING DATA
from sklearn.preprocessing import scale

# For splitting data into training and test sets.
from sklearn.model_selection import train_test_split

# For processing how well our methods have classified our data
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score,roc_auc_score

# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Logistic Regression ML Model
from sklearn.linear_model import LogisticRegression

# Ada Boost Classifier
from sklearn.ensemble import AdaBoostClassifier

# Cross Validation
from sklearn.model_selection import cross_val_score

#CV
from sklearn.model_selection import cross_validate

## User Defined Helper Functions

In [304]:
# FUNCTION LOAD_DATA
# Load the data from the csv file in the [data] folder. We separate 
# the molecule data from the "explosive" labes and return the labels
# an a molecules from SMILES representation for each molecule in the 
# dataset
def load_data(data_file):
    mol_df = pd.read_csv(data_file)

    smiles = np.array(mol_df['smiles'])
    labels = np.array(mol_df['labels']) 
    
    mols = np.array([Chem.MolFromSmiles(smile) for smile in smiles])
    
    return mols, labels

# FUNCTION GEN_FPRINTS
# Generate molecule fingerprints for each molecule in an input list
def gen_fprints(mols):
    f_prints = np.array([Chem.RDKFingerprint(mol) for mol in mols])  
    return f_prints

# FUNCTION gen_MACCS 
# Generate the MACCS keys for each molecule in an input list of molecules
def gen_MACCS(mols):
    MACCS_keys = np.array([MACCSkeys.GenMACCSKeys(mol) for mol in mols])
    return MACCS_keys

# FUNCTION gen_morgan_prints
# Generates Morgan finger prints for each molecule in an input list of molecules
def gen_morgan_prints(mols,radius):
    morgan_prints = np.array([AllChem.GetMorganFingerprintAsBitVect(mol,radius,nBits=1024) for mol in mols])
    return morgan_prints

# FUNCTION CrossValidation
# Runs Cross Validation and outputs the results in a dataframe
def CrossValidation(ML_model,X,Y):
    model_scores = []
    model_scores.append(cross_validate(ML_model, X , Y , cv=5, scoring=('recall','accuracy', 'f1', 'roc_auc'), return_train_score=True))
    models_df = pd.DataFrame(model_scores, columns = ['fit_time','score_time','test_recall','train_recall','test_accuracy','train_accuracy','test_f1','train_f1','test_roc_auc','train_roc_auc'],index=["AdaModel_PCA"])
    av_column = models_df.mean(axis = 0)
    return pd.DataFrame(av_column)

# FUNCTION Output
# Prints out the Scores of the model
def Output(pred,label_test):
    # Model Generalizability Analysis
    accuracy = accuracy_score(label_test, pred)
    conf_matrix = confusion_matrix(label_test, pred)
    F1Score = f1_score(label_test,pred)
    roc_auc = roc_auc_score(label_test,pred)
    recall = recall_score(label_test,pred)

    print('\033[1m' + 'Confusion Matrix' + '\033[0m') # printing in bold
    print(conf_matrix)
    
    print('\033[1m' + '\nRecall' + '\033[0m')
    print(recall)  

    print('\033[1m' + '\nAccuracy' + '\033[0m')
    print(accuracy)  
    
    print('\033[1m' + '\nF1 Score' + '\033[0m')
    print(F1Score)

    print('\033[1m' + '\nROC AUC Score' + '\033[0m')
    print(roc_auc)

    
def getKeyFromBond(bond):
    atom1 = int(bond.GetBeginAtom().GetAtomicNum())
    atom2 = int(bond.GetEndAtom().GetAtomicNum())

    if atom1 > atom2:
        atom1, atom2 = atom2, atom1

    bondType = int(bond.GetBondTypeAsDouble() * 2 - 2)
    key = atom1 | (atom2 << 8) | (bondType << 16)

    return key

def formatMolecule(ID):
    pt = Chem.GetPeriodicTable()
    atom1 = pt.GetElementSymbol(int(ID & 255))
    atom2 = pt.GetElementSymbol(int((ID >> 8) & 255))
    bondType = ['-', ':', '=', 'err', '#'][(ID >> 16) & 255]
    return '%s%s%s' % (atom1,bondType,atom2)

def gen_nathan_prints(mols):
    # this dictionary is responsible for assigning a unique ID (index) to
    # every unique bond.  The bonds are assigned incrementing IDs as discovered
    bondIDs = dict()
    numUniqueBonds = 0
    
    for molecule in mols:
        for bond in molecule.GetBonds():
            key = getKeyFromBond(bond)
            if key not in bondIDs:
                bondIDs[key] = numUniqueBonds
                numUniqueBonds += 1
    
    # list of numpy byte arrays representing the feature vector of each molecule
    fingerprints = []

    for molecule in mols:
        # One molecule has 105 carbon-carbon single bonds, so the fingerprint
        # format is set to preserve up to that many occurances of any unique bond
            
        # each fingerprint requires 1 byte to store the count of each unique bond type
        # plus the extra 2 bytes store the molecule's molar mass
        fingerprint = np.zeros(numUniqueBonds + 2, np.uint8)
        
        for bond in molecule.GetBonds():
            key = getKeyFromBond(bond)
            index = bondIDs[key]
            fingerprint[index] += 1
        
        # the heaviest molecule in our dataset weights 3431.9089999999887 g/mol
        # encode the weight with 1/5 increments of fractional value
        # in the last 2 bytes of the finger print / feature vector
        weight = Descriptors.MolWt(molecule)
        intWeight = round(weight * 5)
        fingerprint[numUniqueBonds] = intWeight & 255
        fingerprint[numUniqueBonds + 1] = (intWeight >> 8) & 255
        
        fingerprints.append(fingerprint)
    
    # create a mapping from bond ID to bond key to aid feature importance
    sourceMap = np.zeros(numUniqueBonds, np.uint32)
    for k, v in bondIDs.items():
        sourceMap[v] = k
    
    return fingerprints, sourceMap

## Loading data

In [191]:
data_file = 'molecule_data.csv'
mols, labels = load_data(data_file)

## Using Morgan Features

In [192]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
morgan_features = gen_morgan_prints(mols,radius=16)

In [193]:
# Split data for [training] and [testing]
morgan_train, morgan_test, label_train, label_test = train_test_split(morgan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [295]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_morgan_proj = PCA(n_components=961)
morgan_proj_train = PC_morgan_proj.fit_transform(morgan_train)
morgan_proj_test = PC_morgan_proj.transform(morgan_test)
morgan_proj_features = PC_morgan_proj.fit_transform(morgan_features)

### Ada Boost with PCA

In [282]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(morgan_proj_train,label_train)
pred1 = Ada.predict(morgan_proj_test)

#### Scores

In [283]:
Scores1 = Output(pred1,label_test)
Scores1

Confusion Matrix
[[1165    7]
 [  10   75]]

Recall
0.8823529411764706

Accuracy
0.9864757358790772

F1 Score
0.8982035928143712

ROC AUC Score
0.9381901224653684


#### Cross Validation

In [296]:
M_av_df1 = CrossValidation(AdaModel_PCA,morgan_proj_features,labels)
M_av_df1

Unnamed: 0,0
fit_time,91.961993
score_time,1.447982
test_recall,0.830189
train_recall,1.0
test_accuracy,0.98568
train_accuracy,1.0
test_f1,0.88
train_f1,1.0
test_roc_auc,0.993366
train_roc_auc,1.0


### Ada Boost [WITHOUT] PCA

In [None]:
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(morgan_train,label_train)
pred2 = Ada.predict(morgan_test)

#### Scores

In [285]:
Scores2 = Output(pred2,label_test)
Scores2

Confusion Matrix
[[1163    9]
 [  11   74]]

Recall
0.8705882352941177

Accuracy
0.9840891010342084

F1 Score
0.880952380952381

ROC AUC Score
0.9314545272033729


#### Cross Validation

In [286]:
M_av_df2 = CrossValidation(AdaModel_,morgan_features,labels)
M_av_df2

Unnamed: 0,0
fit_time,17.772021
score_time,1.584419
test_recall,0.830189
train_recall,0.995215
test_accuracy,0.98568
train_accuracy,0.999403
test_f1,0.88
train_f1,0.995215
test_roc_auc,0.986492
train_roc_auc,0.999995


### Ada Boost with PCA using Logistic regression

In [199]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel, learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(morgan_proj_train,label_train)
pred3 = Ada.predict(morgan_proj_test)

#### Scores

In [200]:
Scores3 = Output(pred3,label_test)
Scores3

Confusion Matrix
[[1161   11]
 [  10   75]]

Recall
0.8823529411764706

Accuracy
0.9832935560859188

F1 Score
0.8771929824561403

ROC AUC Score
0.9364836378237302


#### Cross Validation

In [None]:
M_av_df3 = CrossValidation(AdaModel_LR_PCA,morgan_proj_features,labels)
M_av_df3

### Ada Boost [WITHOUT] PCA using Logistic regression

In [287]:
#########################################################################
#
# Ada Boost without PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel, learning_rate = 1)
Ada = AdaModel_LR_.fit(morgan_train,label_train)
pred4 = Ada.predict(morgan_test)

#### Scores

In [289]:
Scores4 = Output(pred4,label_test)
Scores4

Confusion Matrix
[[1166    6]
 [   6   79]]

Recall
0.9294117647058824

Accuracy
0.9904534606205251

F1 Score
0.9294117647058824

ROC AUC Score
0.9621461553904839


#### Cross Validation

In [293]:
M_av_df4 = CrossValidation(AdaModel_LR_,morgan_features,labels)
M_av_df4

Unnamed: 0,0
fit_time,13.220868
score_time,1.756825
test_recall,0.830189
train_recall,0.995215
test_accuracy,0.988067
train_accuracy,0.999403
test_f1,0.897959
train_f1,0.995215
test_roc_auc,0.99565
train_roc_auc,0.999988


### Ada Boost with PCA using Random Forests

In [203]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf, learning_rate = 1)
RF = AdaModel_RF_PCA.fit(morgan_proj_train,label_train)
pred5 = RF.predict(morgan_proj_test)

#### Scores

In [204]:
Scores5 = Output(pred5,label_test)
Scores5

Confusion Matrix
[[1165    7]
 [   8   77]]

Recall
0.9058823529411765

Accuracy
0.9880668257756563

F1 Score
0.9112426035502958

ROC AUC Score
0.9499548283477212


#### Cross Validation

In [None]:
M_av_df5 = CrossValidation(AdaModel_RF_PCA,morgan_proj_features,labels)
M_av_df5

### Ada Boost [WITHOUT] PCA using Random Forests

In [298]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf, learning_rate = 1)
RF = AdaModel_RF_.fit(morgan_train,label_train)
pred6 = RF.predict(morgan_test)

#### Scores

In [299]:
Scores6 = Output(pred6,label_test)
Scores6

Confusion Matrix
[[1168    4]
 [   6   79]]

Recall
0.9294117647058824

Accuracy
0.9920445505171042

F1 Score
0.9404761904761904

ROC AUC Score
0.9629993977113028


#### Cross Validation

In [294]:
M_av_df6 = CrossValidation(AdaModel_RF_,morgan_features,labels)
av_df6

Unnamed: 0,0
fit_time,13.489129
score_time,1.873122
test_recall,0.830189
train_recall,0.995215
test_accuracy,0.988067
train_accuracy,0.999403
test_f1,0.897959
train_f1,0.995215
test_roc_auc,0.99565
train_roc_auc,0.999988


### M_Final_df

In [None]:
M_result = pd.concat([M_av_df1, M_av_df2, M_av_df3, M_av_df4, M_av_df5, M_av_df6], axis=1, join='inner')
M_result.columns = ['M_Model1_df','M_Model2_df','M_Model3_df','M_Model4_df','M_Model5_df','M_Model6_df']
M_final = M_result.transpose()
M_final

## Using MACCS Features

In [158]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)

In [159]:
# Split data for [training] and [testing]
maccs_train, maccs_test, label_train, label_test = train_test_split(maccs_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [297]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_maccs_proj = PCA(n_components=114)
maccs_proj_train = PC_maccs_proj.fit_transform(maccs_train)
maccs_proj_test = PC_maccs_proj.transform(maccs_test)
maccs_proj_features = PC_maccs_proj.fit_transform(maccs_features)

### Ada Boost with PCA

In [161]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(maccs_proj_train,label_train)
pred1 = Ada.predict(maccs_proj_test)

#### Scores

In [162]:
Scores1 = Output(pred1,label_test)
Scores1

Confusion Matrix
[[1161   17]
 [  21   58]]

Recall
0.7341772151898734

Accuracy
0.9697692919649961

F1 Score
0.7532467532467533

ROC AUC Score
0.8598729879005393


#### Cross Validation

In [None]:
MAC_av_df1 = CrossValidation(AdaModel_PCA,maccs_proj_features,labels)
av_df1

### Ada Boost [WITHOUT] PCA

In [163]:
### Ada Boost [WITHOUT] PCA

#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(maccs_train,label_train)
pred2 = Ada.predict(maccs_test)

#### Scores

In [164]:
Scores2 = Output(pred2,label_test)
Scores2

Confusion Matrix
[[1174    4]
 [   7   72]]

Recall
0.9113924050632911

Accuracy
0.9912490055688147

F1 Score
0.9290322580645162

ROC AUC Score
0.9539984096623756


#### Cross Validation

In [None]:
MAC_av_df2 = CrossValidation(AdaModel_,maccs_features,labels)
av_df2

### Ada Boost with PCA using Logistic regression

In [165]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(maccs_proj_train,label_train)
pred3 = Ada.predict(maccs_proj_test)

#### Scores

In [166]:
Scores3 = Output(pred3,label_test)
Scores3

Confusion Matrix
[[1169    9]
 [  18   61]]

Recall
0.7721518987341772

Accuracy
0.9785202863961814

F1 Score
0.8187919463087248

ROC AUC Score
0.8822559154112313


#### Cross Validation

In [None]:
MAC_av_df3 = CrossValidation(AdaModel_LR_PCA,maccs_proj_features,labels)
MAC_av_df3

### Ada Boost [WITHOUT] PCA using Logistic regression

In [167]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(maccs_train,label_train)
pred4 = Ada.predict(maccs_test)

#### Scores

In [168]:
Scores4 = Output(pred4,label_test)
Scores4

Confusion Matrix
[[1177    1]
 [   6   73]]

Recall
0.9240506329113924

Accuracy
0.994431185361973

F1 Score
0.954248366013072

ROC AUC Score
0.9616008682383788


#### Cross Validation


In [None]:
MAC_av_df4 = CrossValidation(AdaModel_LR_,maccs_features,labels)
MAC_av_df4

### Ada Boost with PCA using Random Forests

In [169]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_RF_PCA.fit(maccs_proj_train,label_train)
pred5 = Ada.predict(maccs_proj_test)

#### Scores

In [170]:
Scores5 = Output(pred5,label_test)
Scores5

Confusion Matrix
[[1169    9]
 [  18   61]]

Recall
0.7721518987341772

Accuracy
0.9785202863961814

F1 Score
0.8187919463087248

ROC AUC Score
0.8822559154112313


#### Cross Validation

In [None]:
MAC_av_df5 = CrossValidation(AdaModel_RF_PCA,maccs_proj_features,labels)
MAC_av_df5

### Ada Boost [WITHOUT] PCA using Random Forests

In [171]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(maccs_train,label_train)
pred6 = RF.predict(maccs_test)

#### Scores

In [172]:
Scores6 = Output(pred6,label_test)
Scores6

Confusion Matrix
[[1177    1]
 [   6   73]]

Recall
0.9240506329113924

Accuracy
0.994431185361973

F1 Score
0.954248366013072

ROC AUC Score
0.9616008682383788


#### Cross Validation

In [None]:
MAC_av_df6 = CrossValidation(AdaModel_RF_,maccs_features,labels)
MAC_av_df6

### MAC_Final_df

In [None]:
MAC_result = pd.concat([MAC_av_df1, MAC_av_df2, MAC_av_df3, MAC_av_df4, MAC_av_df5, MAC_av_df6], axis=1, join='inner')
MAC_result.columns = ['MAC_Model1_df','MAC_Model2_df','MAC_Model3_df','MAC_Model4_df','MAC_Model5_df','MAC_Model6_df']
MAC_final = MAC_result.transpose()
MAC_final

## Using RDKit Features

In [173]:
# load different molecular features separately
##############################################
rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)

In [174]:
# Split data for [training] and [testing]
rdk_train, rdk_test, label_train, label_test = train_test_split(rdk_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [175]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_rdk_proj = PCA(n_components = 1570)
rdk_proj_train = PC_rdk_proj.fit_transform(rdk_train)
rdk_proj_test = PC_rdk_proj.transform(rdk_test)
rdk_proj_features = PC_rdk_proj.fit_transform(rdk_features)

### Ada Boost with PCA

In [176]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(rdk_proj_train,label_train)
pred1 = Ada.predict(rdk_proj_test)

#### Scores

In [177]:
Scores1 = Output(pred1,label_test)
Scores1

Confusion Matrix
[[1163   17]
 [  44   33]]

Recall
0.42857142857142855

Accuracy
0.9514717581543357

F1 Score
0.5196850393700788

ROC AUC Score
0.7070823244552058


#### Cross Validation

In [None]:
RDK_av_df1 = CrossValidation(AdaModel_PCA,rdk_proj_features,labels)
RDK_av_df1

### Ada Boost [WITHOUT] PCA

In [178]:
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(rdk_train,label_train)
pred2 = Ada.predict(rdk_test)

#### Scores

In [179]:
Scores2 = Output(pred2,label_test)
Scores2

Confusion Matrix
[[1169   11]
 [  13   64]]

Recall
0.8311688311688312

Accuracy
0.9809069212410502

F1 Score
0.8421052631578949

ROC AUC Score
0.9109233986352631


#### Cross Validation

In [None]:
RDK_av_df2 = CrossValidation(AdaModel_,rdk_features,labels)
RDK_av_df2

### Ada Boost with PCA using Logistic regression

In [180]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(rdk_proj_train,label_train)
pred3 = Ada.predict(rdk_proj_test)

#### Scores

In [181]:
Scores3 = Output(pred3,label_test)
Scores3

Confusion Matrix
[[1169   11]
 [  68    9]]

Recall
0.11688311688311688

Accuracy
0.9371519490851233

F1 Score
0.18556701030927833

ROC AUC Score
0.5537805414924059


#### Cross Validation

In [None]:
RDK_av_df3 = CrossValidation(AdaModel_LR_PCA,rdk_proj_features,labels)
RDK_av_df3

### Ada Boost [WITHOUT] PCA using Logistic regression

In [182]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(rdk_train,label_train)
pred4 = Ada.predict(rdk_test)

#### Scores

In [183]:
Scores4 = Output(pred4,label_test)
Scores4

Confusion Matrix
[[1173    7]
 [  10   67]]

Recall
0.8701298701298701

Accuracy
0.9864757358790772

F1 Score
0.8874172185430463

ROC AUC Score
0.9320988333700199


#### Cross Validation

In [None]:
RDK_av_df4 = CrossValidation(AdaModel_LR_,rdk_features,labels)
RDK_av_df4

### Ada Boost with PCA using Random Forests

In [184]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_PCA.fit(rdk_proj_train,label_train)
pred5 = RF.predict(rdk_proj_test)

#### Scores

In [300]:
Scores5 = Output(pred5,label_test)
Scores5

NameError: name 'pred5' is not defined

#### Cross Validation

In [None]:
RDK_av_df5 = CrossValidation(AdaModel_RF_PCA,rdk_proj_features,labels)
RDK_av_df5

### Ada Boost [WITHOUT] PCA using Random Forests

In [186]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(rdk_train,label_train)
pred6 = RF.predict(rdk_test)

#### Scores

In [187]:
Scores6 = Output(pred6,label_test)
Scores6

Confusion Matrix
[[1173    7]
 [  10   67]]

Recall
0.8701298701298701

Accuracy
0.9864757358790772

F1 Score
0.8874172185430463

ROC AUC Score
0.9320988333700199


#### Cross Validation

In [None]:
RDK_av_df6 = CrossValidation(AdaModel_RF_,rdk_features,labels)
RDK_av_df6

### RDK_Final_df|

In [None]:
RDK_result = pd.concat([RDK_av_df1, RDK_av_df2, RDK_av_df3, RDK_av_df4, RDK_av_df5,RDK_av_df6], axis=1, join='inner')
RDK_result.columns = ['RDK_Model1_df','RDK_Model2_df','RDK_Model3_df','RDK_Model4_df','RDK_Model5_df','RDK_Model6_df']
RDK_final = RDK_result.transpose()
RDK_final

## Using Nathan Features

In [305]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)
nathan_features, bondIDSourceMap = gen_nathan_prints(mols)

In [306]:
# Split data for [training] and [testing]
nathan_train, nathan_test, label_train, label_test = train_test_split(nathan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [307]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_nathan_proj = PCA(n_components = 3)
nathan_proj_train = PC_nathan_proj.fit_transform(nathan_train)
nathan_proj_test = PC_nathan_proj.transform(nathan_test)
nathan_proj_features = PC_nathan_proj.fit_transform(nathan_features)

### Ada Boost with PCA

In [308]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Nathan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(nathan_proj_train,label_train)
pred1 = Ada.predict(nathan_proj_test)

[1mConfusion Matrix[0m
[[1174   27]
 [  49    7]]
[1m
Recall[0m
0.125
[1m
Accuracy[0m
0.939538583929992
[1m
F1 Score[0m
0.15555555555555556
[1m
ROC AUC Score[0m
0.551259367194005


#### Scores

In [None]:
Scores1 = Output(pred1,label_test)
Scores1

#### Cross Validation

In [None]:
N_av_df1 = CrossValidation(AdaModel_PCA,nathan_proj_features,labels)
N_av_df1

### Ada Boost [WITHOUT] PCA

In [309]:
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Nathan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(nathan_train,label_train)
pred2 = Ada.predict(nathan_test)

[1mConfusion Matrix[0m
[[1199    2]
 [   1   55]]
[1m
Recall[0m
0.9821428571428571
[1m
Accuracy[0m
0.9976133651551312
[1m
F1 Score[0m
0.9734513274336283
[1m
ROC AUC Score[0m
0.9902387891043178


#### Scores

In [None]:
Scores2 = Output(pred2,label_test)
Scores2

#### Cross Validation

In [None]:
N_av_df2 = CrossValidation(AdaModel_,nathan_features,labels)
N_av_df2

### Ada Boost with PCA using Logistic regression

In [314]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(nathan_proj_train,label_train)
pred3 = Ada.predict(nathan_proj_test)

[1mConfusion Matrix[0m
[[1201    0]
 [  56    0]]
[1m
Recall[0m
0.0
[1m
Accuracy[0m
0.9554494828957836
[1m
F1 Score[0m
0.0
[1m
ROC AUC Score[0m
0.5


#### Scores

In [None]:
Scores3 = Output(pred3,label_test)
Scores3

#### Cross Validation

In [None]:
N_av_df3 = CrossValidation(AdaModel_LR_PCA,nathan_proj_features,labels)
N_av_df3

### Ada Boost [WITHOUT] PCA using Logistic regression

In [317]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression(max_iter=10000)

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(nathan_train,label_train)
pred4 = Ada.predict(nathan_test)

[1mConfusion Matrix[0m
[[1196    5]
 [   2   54]]
[1m
Recall[0m
0.9642857142857143
[1m
Accuracy[0m
0.994431185361973
[1m
F1 Score[0m
0.9391304347826087
[1m
ROC AUC Score[0m
0.9800612584750803


#### Scores

In [None]:
Scores4 = Output(pred4,label_test)
Scores4

#### Cross Validation

In [None]:
N_av_df4 = CrossValidation(AdaModel_LR_,nathan_features,labels)
N_av_df4

### Ada Boost with PCA using Random Forests

In [312]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_PCA.fit(nathan_proj_train,label_train)
pred5 = RF.predict(nathan_proj_test)

[1mConfusion Matrix[0m
[[1193    8]
 [  56    0]]
[1m
Recall[0m
0.0
[1m
Accuracy[0m
0.949085123309467
[1m
F1 Score[0m
0.0
[1m
ROC AUC Score[0m
0.49666944213155706


#### Scores

In [None]:
Scores5 = Output(pred5,label_test)
Scores5

#### Cross Validation

In [None]:
N_av_df5 = CrossValidation(AdaModel_RF_PCA,nathan_proj_features,labels)
N_av_df5

### Ada Boost [WITHOUT] PCA using Random Forests

In [316]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(nathan_train,label_train)
pred6 = RF.predict(nathan_test)

[1mConfusion Matrix[0m
[[1175   26]
 [  56    0]]
[1m
Recall[0m
0.0
[1m
Accuracy[0m
0.9347653142402546
[1m
F1 Score[0m
0.0
[1m
ROC AUC Score[0m
0.48917568692756036


#### Scores

In [None]:
Scores6 = Output(pred6,label_test)
Scores6

#### Cross Validation

In [None]:
N_av_df6 = CrossValidation(AdaModel_RF_,nathan_features,labels)
N_av_df6

### N_Final_df

In [None]:
N_result = pd.concat([N_av_df1, N_av_df2, N_av_df3, N_av_df4, N_av_df5,N_av_df6], axis=1, join='inner')
N_result.columns = ['N_Model1_df','N_Model2_df','N_Model3_df','N_Model4_df','N_Model5_df','N_Model6_df']
N_final = N_result.transpose()
N_final

## FULL DataFrame

In [321]:
M_result = pd.concat([M_result, MAC_result,RDK_result,N_result,av_df5,av_df6], axis=1, join='inner')
final = M_result.transpose()
final.to_csv("ADA_Result.csv")
final

NameError: name 'M_Model1_df' is not defined