# Ada Boost

 - Goal: To Apply Ada Boost to several models to test Recall, Accuracy, F1 score and ROC AUC Score, using diffrent features inputs 

## Import Statements

In [1]:
import pandas as pd
import numpy as np

#import matplotlib as plt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from rdkit import Chem

# FEATURES
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem # For Morgan Fingerprint (Circular Fingerprints)
from rdkit.Chem import MACCSkeys # For MACCS keys

# SCALING DATA
from sklearn.preprocessing import scale

# For splitting data into training and test sets.
from sklearn.model_selection import train_test_split

# For processing how well our methods have classified our data
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score,roc_auc_score

# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Logistic Regression ML Model
from sklearn.linear_model import LogisticRegression

# Ada Boost Classifier
from sklearn.ensemble import AdaBoostClassifier

# Cross Validation
from sklearn.model_selection import cross_val_score

#CV
from sklearn.model_selection import cross_validate

## User Defined Helper Functions

In [2]:
# FUNCTION LOAD_DATA
# Load the data from the csv file in the [data] folder. We separate 
# the molecule data from the "explosive" labes and return the labels
# an a molecules from SMILES representation for each molecule in the 
# dataset
def load_data(data_file):
    mol_df = pd.read_csv(data_file)

    smiles = np.array(mol_df['smiles'])
    labels = np.array(mol_df['labels']) 
    
    mols = np.array([Chem.MolFromSmiles(smile) for smile in smiles])
    
    return mols, labels

# FUNCTION GEN_FPRINTS
# Generate molecule fingerprints for each molecule in an input list
def gen_fprints(mols):
    f_prints = np.array([Chem.RDKFingerprint(mol) for mol in mols])  
    return f_prints

# FUNCTION gen_MACCS 
# Generate the MACCS keys for each molecule in an input list of molecules
def gen_MACCS(mols):
    MACCS_keys = np.array([MACCSkeys.GenMACCSKeys(mol) for mol in mols])
    return MACCS_keys

# FUNCTION gen_morgan_prints
# Generates Morgan finger prints for each molecule in an input list of molecules
def gen_morgan_prints(mols,radius):
    morgan_prints = np.array([AllChem.GetMorganFingerprintAsBitVect(mol,radius,nBits=1024) for mol in mols])
    return morgan_prints

# FUNCTION CrossValidation
# Runs Cross Validation and outputs the results in a dataframe
def CrossValidation(ML_model,X,Y):
    model_scores = []
    model_scores.append(cross_validate(ML_model, X , Y , cv=5, scoring=('recall','accuracy', 'f1', 'roc_auc','precision'), return_train_score=True))
    models_df = pd.DataFrame(model_scores, columns = ['fit_time','score_time','test_recall','train_recall','test_accuracy','train_accuracy','test_f1','train_f1','test_roc_auc','train_roc_auc','test_precision','train_precision'])
    av_column = models_df.mean(axis = 0)
    return pd.DataFrame(av_column)

# FUNCTION Output
# Prints out the Scores of the model
def Output(pred,label_test):
    # Model Generalizability Analysis
    accuracy = accuracy_score(label_test, pred)
    conf_matrix = confusion_matrix(label_test, pred)
    F1Score = f1_score(label_test,pred)
    roc_auc = roc_auc_score(label_test,pred)
    recall = recall_score(label_test,pred)

    print('\033[1m' + 'Confusion Matrix' + '\033[0m') # printing in bold
    print(conf_matrix)
    
    print('\033[1m' + '\nRecall' + '\033[0m')
    print(recall)  

    print('\033[1m' + '\nAccuracy' + '\033[0m')
    print(accuracy)  
    
    print('\033[1m' + '\nF1 Score' + '\033[0m')
    print(F1Score)

    print('\033[1m' + '\nROC AUC Score' + '\033[0m')
    print(roc_auc)

    
def getKeyFromBond(bond):
    atom1 = int(bond.GetBeginAtom().GetAtomicNum())
    atom2 = int(bond.GetEndAtom().GetAtomicNum())

    if atom1 > atom2:
        atom1, atom2 = atom2, atom1

    bondType = int(bond.GetBondTypeAsDouble() * 2 - 2)
    key = atom1 | (atom2 << 8) | (bondType << 16)

    return key

def formatMolecule(ID):
    pt = Chem.GetPeriodicTable()
    atom1 = pt.GetElementSymbol(int(ID & 255))
    atom2 = pt.GetElementSymbol(int((ID >> 8) & 255))
    bondType = ['-', ':', '=', 'err', '#'][(ID >> 16) & 255]
    return '%s%s%s' % (atom1,bondType,atom2)

def gen_nathan_prints(mols):
    # this dictionary is responsible for assigning a unique ID (index) to
    # every unique bond.  The bonds are assigned incrementing IDs as discovered
    bondIDs = dict()
    numUniqueBonds = 0
    
    for molecule in mols:
        for bond in molecule.GetBonds():
            key = getKeyFromBond(bond)
            if key not in bondIDs:
                bondIDs[key] = numUniqueBonds
                numUniqueBonds += 1
    
    # list of numpy byte arrays representing the feature vector of each molecule
    fingerprints = []

    for molecule in mols:
        # One molecule has 105 carbon-carbon single bonds, so the fingerprint
        # format is set to preserve up to that many occurances of any unique bond
            
        # each fingerprint requires 1 byte to store the count of each unique bond type
        # plus the extra 2 bytes store the molecule's molar mass
        fingerprint = np.zeros(numUniqueBonds + 2, np.uint8)
        
        for bond in molecule.GetBonds():
            key = getKeyFromBond(bond)
            index = bondIDs[key]
            fingerprint[index] += 1
        
        # the heaviest molecule in our dataset weights 3431.9089999999887 g/mol
        # encode the weight with 1/5 increments of fractional value
        # in the last 2 bytes of the finger print / feature vector
        weight = Descriptors.MolWt(molecule)
        intWeight = round(weight * 5)
        fingerprint[numUniqueBonds] = intWeight & 255
        fingerprint[numUniqueBonds + 1] = (intWeight >> 8) & 255
        
        fingerprints.append(fingerprint)
    
    # create a mapping from bond ID to bond key to aid feature importance
    sourceMap = np.zeros(numUniqueBonds, np.uint32)
    for k, v in bondIDs.items():
        sourceMap[v] = k
    
    return fingerprints, sourceMap

## Loading data

In [3]:
data_file = 'data/molecule_data.csv'
mols, labels = load_data(data_file)



## Using Morgan Features

In [4]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
morgan_features = gen_morgan_prints(mols,radius=16)

In [5]:
# Split data for [training] and [testing]
morgan_train, morgan_test, label_train, label_test = train_test_split(morgan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [6]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_morgan_proj = PCA(n_components=4)
morgan_proj_train = PC_morgan_proj.fit_transform(morgan_train)
morgan_proj_test = PC_morgan_proj.transform(morgan_test)
morgan_proj_features = PC_morgan_proj.fit_transform(morgan_features)

### Ada Boost with PCA

In [7]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(morgan_proj_train,label_train)
pred1 = Ada.predict(morgan_proj_test)

#### Scores

In [8]:
Scores1 = Output(pred1,label_test)
Scores1

[1mConfusion Matrix[0m
[[1174   10]
 [  11   62]]
[1m
Recall[0m
0.8493150684931506
[1m
Accuracy[0m
0.9832935560859188
[1m
F1 Score[0m
0.8551724137931035
[1m
ROC AUC Score[0m
0.9204345612736022


#### Cross Validation

In [9]:
av_df1 = CrossValidation(AdaModel_PCA,morgan_proj_features,labels)
av_df1

Unnamed: 0,0
fit_time,0.254583
score_time,0.033232
test_recall,0.773585
train_recall,0.961722
test_accuracy,0.9821
train_accuracy,0.996118
test_f1,0.845361
train_f1,0.968675
test_roc_auc,0.98505
train_roc_auc,0.999868


### Ada Boost [WITHOUT] PCA

In [10]:
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(morgan_train,label_train)
pred2 = Ada.predict(morgan_test)

#### Scores

In [11]:
Scores2 = Output(pred2,label_test)
Scores2

[1mConfusion Matrix[0m
[[1182    2]
 [  11   62]]
[1m
Recall[0m
0.8493150684931506
[1m
Accuracy[0m
0.9896579156722355
[1m
F1 Score[0m
0.9051094890510949
[1m
ROC AUC Score[0m
0.9238129396519807


#### Cross Validation

In [12]:
av_df2 = CrossValidation(AdaModel_,morgan_features,labels)
av_df2

Unnamed: 0,0
fit_time,5.684126
score_time,0.365156
test_recall,0.849057
train_recall,0.995215
test_accuracy,0.988067
train_accuracy,0.999403
test_f1,0.9
train_f1,0.995215
test_roc_auc,0.99065
train_roc_auc,0.999989


### Ada Boost with PCA using Logistic regression

In [13]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel, learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(morgan_proj_train,label_train)
pred3 = Ada.predict(morgan_proj_test)

#### Scores

In [14]:
Scores3 = Output(pred3,label_test)
Scores3

[1mConfusion Matrix[0m
[[1175    9]
 [  10   63]]
[1m
Recall[0m
0.863013698630137
[1m
Accuracy[0m
0.984884645982498
[1m
F1 Score[0m
0.8689655172413793
[1m
ROC AUC Score[0m
0.9277061736393928


#### Cross Validation

In [15]:
av_df3 = CrossValidation(AdaModel_LR_PCA,morgan_proj_features,labels)
av_df3

Unnamed: 0,0
fit_time,0.425511
score_time,0.032931
test_recall,0.830189
train_recall,0.885167
test_accuracy,0.980907
train_accuracy,0.986563
test_f1,0.846154
train_f1,0.891566
test_roc_auc,0.985843
train_roc_auc,0.994269


### Ada Boost [WITHOUT] PCA using Logistic regression

In [16]:
#########################################################################
#
# Ada Boost without PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel, learning_rate = 1)
Ada = AdaModel_LR_.fit(morgan_train,label_train)
pred4 = Ada.predict(morgan_test)

#### Scores

In [17]:
Scores4 = Output(pred4,label_test)
Scores4

[1mConfusion Matrix[0m
[[1180    4]
 [   8   65]]
[1m
Recall[0m
0.8904109589041096
[1m
Accuracy[0m
0.9904534606205251
[1m
F1 Score[0m
0.9154929577464788
[1m
ROC AUC Score[0m
0.9435162902628655


#### Cross Validation

In [18]:
av_df4 = CrossValidation(AdaModel_LR_,morgan_features,labels)
av_df4

Unnamed: 0,0
fit_time,4.586882
score_time,0.192853
test_recall,0.830189
train_recall,0.995215
test_accuracy,0.988067
train_accuracy,0.999403
test_f1,0.897959
train_f1,0.995215
test_roc_auc,0.995674
train_roc_auc,0.999988


### Ada Boost with PCA using Random Forests

In [19]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf, learning_rate = 1)
RF = AdaModel_RF_PCA.fit(morgan_proj_train,label_train)
pred5 = RF.predict(morgan_proj_test)

#### Scores

In [20]:
Scores5 = Output(pred5,label_test)
Scores5

[1mConfusion Matrix[0m
[[1179    5]
 [   7   66]]
[1m
Recall[0m
0.9041095890410958
[1m
Accuracy[0m
0.9904534606205251
[1m
F1 Score[0m
0.9166666666666666
[1m
ROC AUC Score[0m
0.9499433080340615


#### Cross Validation

In [21]:
av_df5 = CrossValidation(AdaModel_RF_PCA,morgan_proj_features,labels)
av_df5

Unnamed: 0,0
fit_time,0.336995
score_time,0.033284
test_recall,0.811321
train_recall,1.0
test_accuracy,0.986874
train_accuracy,1.0
test_f1,0.886598
train_f1,1.0
test_roc_auc,0.97415
train_roc_auc,1.0


### Ada Boost [WITHOUT] PCA using Random Forests

In [22]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf, learning_rate = 1)
RF = AdaModel_RF_.fit(morgan_train,label_train)
pred6 = RF.predict(morgan_test)

#### Scores

In [23]:
Scores6 = Output(pred6,label_test)
Scores6

[1mConfusion Matrix[0m
[[1180    4]
 [  10   63]]
[1m
Recall[0m
0.863013698630137
[1m
Accuracy[0m
0.9888623707239459
[1m
F1 Score[0m
0.9
[1m
ROC AUC Score[0m
0.9298176601258792


#### Cross Validation

In [24]:
av_df6 = CrossValidation(AdaModel_RF_,morgan_features,labels)
av_df6

Unnamed: 0,0
fit_time,0.661244
score_time,0.043801
test_recall,0.849057
train_recall,1.0
test_accuracy,0.988067
train_accuracy,1.0
test_f1,0.9
train_f1,1.0
test_roc_auc,0.976337
train_roc_auc,1.0


## Using MACCS Features

In [25]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)

In [26]:
# Split data for [training] and [testing]
maccs_train, maccs_test, label_train, label_test = train_test_split(maccs_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [27]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_maccs_proj = PCA(n_components=5)
maccs_proj_train = PC_maccs_proj.fit_transform(maccs_train)
maccs_proj_test = PC_maccs_proj.transform(maccs_test)
maccs_proj_features = PC_maccs_proj.fit_transform(maccs_features)

### Ada Boost with PCA

In [28]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(maccs_proj_train,label_train)
pred1 = Ada.predict(maccs_proj_test)

#### Scores

In [29]:
Scores1 = Output(pred1,label_test)
Scores1

[1mConfusion Matrix[0m
[[1170   13]
 [  14   60]]
[1m
Recall[0m
0.8108108108108109
[1m
Accuracy[0m
0.9785202863961814
[1m
F1 Score[0m
0.8163265306122449
[1m
ROC AUC Score[0m
0.8999108999109


#### Cross Validation

In [30]:
av_df1 = CrossValidation(AdaModel_PCA,maccs_proj_features,labels)
av_df1

Unnamed: 0,0
fit_time,0.270112
score_time,0.035535
test_recall,0.830189
train_recall,0.956938
test_accuracy,0.9821
train_accuracy,0.993431
test_f1,0.854369
train_f1,0.947867
test_roc_auc,0.972203
train_roc_auc,0.999479


### Ada Boost [WITHOUT] PCA

In [31]:
### Ada Boost [WITHOUT] PCA

#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(maccs_train,label_train)
pred2 = Ada.predict(maccs_test)

#### Scores

In [32]:
Scores2 = Output(pred2,label_test)
Scores2

[1mConfusion Matrix[0m
[[1173   10]
 [   8   66]]
[1m
Recall[0m
0.8918918918918919
[1m
Accuracy[0m
0.9856801909307876
[1m
F1 Score[0m
0.88
[1m
ROC AUC Score[0m
0.9417194032578649


#### Cross Validation

In [33]:
av_df2 = CrossValidation(AdaModel_,maccs_features,labels)
av_df2

Unnamed: 0,0
fit_time,0.568489
score_time,0.057114
test_recall,0.886792
train_recall,0.980861
test_accuracy,0.988067
train_accuracy,0.99791
test_f1,0.903846
train_f1,0.983213
test_roc_auc,0.993438
train_roc_auc,0.99993


### Ada Boost with PCA using Logistic regression

In [34]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(maccs_proj_train,label_train)
pred3 = Ada.predict(maccs_proj_test)

#### Scores

In [35]:
Scores3 = Output(pred3,label_test)
Scores3

[1mConfusion Matrix[0m
[[1165   18]
 [  10   64]]
[1m
Recall[0m
0.8648648648648649
[1m
Accuracy[0m
0.9777247414478918
[1m
F1 Score[0m
0.8205128205128206
[1m
ROC AUC Score[0m
0.9248246555938864


#### Cross Validation

In [36]:
av_df3 = CrossValidation(AdaModel_LR_PCA,maccs_proj_features,labels)
av_df3

Unnamed: 0,0
fit_time,0.343084
score_time,0.036466
test_recall,0.735849
train_recall,0.76555
test_accuracy,0.970167
train_accuracy,0.974619
test_f1,0.757282
train_f1,0.790123
test_roc_auc,0.981949
train_roc_auc,0.987459


### Ada Boost [WITHOUT] PCA using Logistic regression

In [37]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(maccs_train,label_train)
pred4 = Ada.predict(maccs_test)

#### Scores

In [38]:
Scores4 = Output(pred4,label_test)
Scores4

[1mConfusion Matrix[0m
[[1171   12]
 [   2   72]]
[1m
Recall[0m
0.972972972972973
[1m
Accuracy[0m
0.9888623707239459
[1m
F1 Score[0m
0.9113924050632912
[1m
ROC AUC Score[0m
0.9814146352607892


#### Cross Validation


In [39]:
av_df4 = CrossValidation(AdaModel_LR_,maccs_features,labels)
av_df4

Unnamed: 0,0
fit_time,1.061454
score_time,0.056118
test_recall,0.924528
train_recall,0.976077
test_accuracy,0.99284
train_accuracy,0.996417
test_f1,0.942308
train_f1,0.971429
test_roc_auc,0.99327
train_roc_auc,0.999855


### Ada Boost with PCA using Random Forests

In [40]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_RF_PCA.fit(maccs_proj_train,label_train)
pred5 = Ada.predict(maccs_proj_test)

#### Scores

In [41]:
Scores5 = Output(pred5,label_test)
Scores5

[1mConfusion Matrix[0m
[[1165   18]
 [  10   64]]
[1m
Recall[0m
0.8648648648648649
[1m
Accuracy[0m
0.9777247414478918
[1m
F1 Score[0m
0.8205128205128206
[1m
ROC AUC Score[0m
0.9248246555938864


#### Cross Validation

In [42]:
av_df5 = CrossValidation(AdaModel_RF_PCA,maccs_proj_features,labels)
av_df5

Unnamed: 0,0
fit_time,0.325749
score_time,0.035991
test_recall,0.735849
train_recall,0.76555
test_accuracy,0.970167
train_accuracy,0.974619
test_f1,0.757282
train_f1,0.790123
test_roc_auc,0.981949
train_roc_auc,0.987459


### Ada Boost [WITHOUT] PCA using Random Forests

In [43]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(maccs_train,label_train)
pred6 = RF.predict(maccs_test)

#### Scores

In [44]:
Scores6 = Output(pred6,label_test)
Scores6

[1mConfusion Matrix[0m
[[1178    5]
 [   3   71]]
[1m
Recall[0m
0.9594594594594594
[1m
Accuracy[0m
0.9936356404136834
[1m
F1 Score[0m
0.9466666666666667
[1m
ROC AUC Score[0m
0.9776164583856891


#### Cross Validation

In [45]:
av_df6 = CrossValidation(AdaModel_RF_,maccs_features,labels)
av_df6

Unnamed: 0,0
fit_time,0.230146
score_time,0.026292
test_recall,0.924528
train_recall,1.0
test_accuracy,0.99284
train_accuracy,1.0
test_f1,0.942308
train_f1,1.0
test_roc_auc,0.985639
train_roc_auc,1.0


## Using RDKit Features

In [46]:
# load different molecular features separately
##############################################
rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)

In [47]:
# Split data for [training] and [testing]
rdk_train, rdk_test, label_train, label_test = train_test_split(rdk_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [48]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_rdk_proj = PCA(n_components = 5)
rdk_proj_train = PC_rdk_proj.fit_transform(rdk_train)
rdk_proj_test = PC_rdk_proj.transform(rdk_test)
rdk_proj_features = PC_rdk_proj.fit_transform(rdk_features)

### Ada Boost with PCA

In [49]:
#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(rdk_proj_train,label_train)
pred1 = Ada.predict(rdk_proj_test)

#### Scores

In [50]:
Scores1 = Output(pred1,label_test)
Scores1

[1mConfusion Matrix[0m
[[1162    9]
 [  40   46]]
[1m
Recall[0m
0.5348837209302325
[1m
Accuracy[0m
0.9610182975338106
[1m
F1 Score[0m
0.652482269503546
[1m
ROC AUC Score[0m
0.7635989911226739


#### Cross Validation

In [51]:
av_df1 = CrossValidation(AdaModel_PCA,rdk_proj_features,labels)
av_df1

Unnamed: 0,0
fit_time,0.285064
score_time,0.035544
test_recall,0.509434
train_recall,0.712919
test_accuracy,0.953461
train_accuracy,0.975515
test_f1,0.580645
train_f1,0.784211
test_roc_auc,0.964319
train_roc_auc,0.992004


### Ada Boost [WITHOUT] PCA

In [52]:
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Morgan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(rdk_train,label_train)
pred2 = Ada.predict(rdk_test)

#### Scores

In [53]:
Scores2 = Output(pred2,label_test)
Scores2

[1mConfusion Matrix[0m
[[1168    3]
 [   9   77]]
[1m
Recall[0m
0.8953488372093024
[1m
Accuracy[0m
0.9904534606205251
[1m
F1 Score[0m
0.927710843373494
[1m
ROC AUC Score[0m
0.9463934621571705


#### Cross Validation

In [54]:
av_df2 = CrossValidation(AdaModel_,rdk_features,labels)
av_df2

Unnamed: 0,0
fit_time,14.67323
score_time,0.716082
test_recall,0.811321
train_recall,0.985646
test_accuracy,0.983294
train_accuracy,0.998806
test_f1,0.86
train_f1,0.990385
test_roc_auc,0.989797
train_roc_auc,0.999974


### Ada Boost with PCA using Logistic regression

In [55]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(rdk_proj_train,label_train)
pred3 = Ada.predict(rdk_proj_test)

#### Scores

In [56]:
Scores3 = Output(pred3,label_test)
Scores3

[1mConfusion Matrix[0m
[[1161   10]
 [  74   12]]
[1m
Recall[0m
0.13953488372093023
[1m
Accuracy[0m
0.9331742243436754
[1m
F1 Score[0m
0.22222222222222218
[1m
ROC AUC Score[0m
0.5654975870355291


#### Cross Validation

In [57]:
av_df3 = CrossValidation(AdaModel_LR_PCA,rdk_proj_features,labels)
av_df3

Unnamed: 0,0
fit_time,0.264704
score_time,0.033743
test_recall,0.207547
train_recall,0.23445
test_accuracy,0.934368
train_accuracy,0.943267
test_f1,0.285714
train_f1,0.340278
test_roc_auc,0.923927
train_roc_auc,0.916109


### Ada Boost [WITHOUT] PCA using Logistic regression

In [58]:
#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(rdk_train,label_train)
pred4 = Ada.predict(rdk_test)

#### Scores

In [59]:
Scores4 = Output(pred4,label_test)
Scores4

[1mConfusion Matrix[0m
[[1170    1]
 [   4   82]]
[1m
Recall[0m
0.9534883720930233
[1m
Accuracy[0m
0.9960222752585521
[1m
F1 Score[0m
0.9704142011834319
[1m
ROC AUC Score[0m
0.9763172005640179


#### Cross Validation

In [60]:
av_df4 = CrossValidation(AdaModel_LR_,rdk_features,labels)
av_df4

Unnamed: 0,0
fit_time,9.464251
score_time,1.053659
test_recall,0.830189
train_recall,0.995215
test_accuracy,0.98926
train_accuracy,0.999403
test_f1,0.907216
train_f1,0.995215
test_roc_auc,0.991924
train_roc_auc,0.999993


### Ada Boost with PCA using Random Forests

In [61]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_PCA.fit(rdk_proj_train,label_train)
pred5 = RF.predict(rdk_proj_test)

#### Scores

In [62]:
Scores5 = Output(pred5,label_test)
Scores5

[1mConfusion Matrix[0m
[[1166    5]
 [  38   48]]
[1m
Recall[0m
0.5581395348837209
[1m
Accuracy[0m
0.9657915672235481
[1m
F1 Score[0m
0.6906474820143885
[1m
ROC AUC Score[0m
0.7769348400293926


#### Cross Validation

In [63]:
av_df5 = CrossValidation(AdaModel_RF_PCA,rdk_proj_features,labels)
av_df5

Unnamed: 0,0
fit_time,32.749951
score_time,2.396834
test_recall,0.641509
train_recall,1.0
test_accuracy,0.972554
train_accuracy,0.999403
test_f1,0.747253
train_f1,0.995238
test_roc_auc,0.980435
train_roc_auc,0.999995


### Ada Boost [WITHOUT] PCA using Random Forests

In [64]:
#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(rdk_train,label_train)
pred6 = RF.predict(rdk_test)

#### Scores

In [65]:
Scores6 = Output(pred6,label_test)
Scores6

[1mConfusion Matrix[0m
[[1170    1]
 [  16   70]]
[1m
Recall[0m
0.813953488372093
[1m
Accuracy[0m
0.9864757358790772
[1m
F1 Score[0m
0.89171974522293
[1m
ROC AUC Score[0m
0.9065497587035529


#### Cross Validation

In [66]:
av_df6 = CrossValidation(AdaModel_RF_,rdk_features,labels)
av_df6

Unnamed: 0,0
fit_time,80.766676
score_time,3.842605
test_recall,0.849057
train_recall,0.995215
test_accuracy,0.98926
train_accuracy,0.999403
test_f1,0.909091
train_f1,0.995215
test_roc_auc,0.987718
train_roc_auc,0.999995


## Using Nathan Features

In [67]:
# load different molecular features separately
##############################################
#rdk_features    = gen_fprints(mols)
#maccs_features  = gen_MACCS(mols)
#morgan_features = gen_morgan_prints(mols,radius=18)
nathan_features, bondIDSourceMap = gen_nathan_prints(mols)

In [68]:
# Split data for [training] and [testing]
nathan_train, nathan_test, label_train, label_test = train_test_split(nathan_features, labels, \
                                                                test_size=0.3, shuffle=True)

In [69]:
######################
# Principal Components
######################

# Determine principal components using [training data] and apply the transformation to the [test data]
PC_nathan_proj = PCA(n_components = 10)
nathan_proj_train = PC_nathan_proj.fit_transform(nathan_train)
nathan_proj_test = PC_nathan_proj.transform(nathan_test)
nathan_proj_features = PC_nathan_proj.fit_transform(nathan_features)

In [70]:
### Ada Boost with PCA

#########################################################################
#
# Ada Boost with PCA
#
#########################################################################
# Train Ada Boost model using PC transformed Nathan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
AdaModel_PCA = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_PCA.fit(nathan_proj_train,label_train)
pred1 = Ada.predict(nathan_proj_test)

#### Scores

Scores1 = Output(pred1,label_test)
Scores1

#### Cross Validation

av_df1 = CrossValidation(AdaModel_PCA,nathan_proj_features,labels)
av_df1

[1mConfusion Matrix[0m
[[1188    1]
 [   9   59]]
[1m
Recall[0m
0.8676470588235294
[1m
Accuracy[0m
0.9920445505171042
[1m
F1 Score[0m
0.9218749999999999
[1m
ROC AUC Score[0m
0.933403007965171


Unnamed: 0,0
fit_time,0.666419
score_time,0.061276
test_recall,0.886792
train_recall,1.0
test_accuracy,0.986874
train_accuracy,1.0
test_f1,0.895238
train_f1,1.0
test_roc_auc,0.971061
train_roc_auc,1.0


In [71]:
### Ada Boost [WITHOUT] PCA
#########################################################################
#
# Ada Boost [WITHOUT] PCA
#
#########################################################################
# Train Logistic Regression model using PC transformed Nathan features
AdaModel_ = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)
Ada = AdaModel_.fit(nathan_train,label_train)
pred2 = Ada.predict(nathan_test)

#### Scores

Scores2 = Output(pred2,label_test)
Scores2

#### Cross Validation

av_df2 = CrossValidation(AdaModel_,nathan_features,labels)
av_df2

[1mConfusion Matrix[0m
[[1186    3]
 [   9   59]]
[1m
Recall[0m
0.8676470588235294
[1m
Accuracy[0m
0.9904534606205251
[1m
F1 Score[0m
0.9076923076923077
[1m
ROC AUC Score[0m
0.9325619650719835


Unnamed: 0,0
fit_time,0.437255
score_time,0.07855
test_recall,0.886792
train_recall,1.0
test_accuracy,0.990453
train_accuracy,1.0
test_f1,0.921569
train_f1,1.0
test_roc_auc,0.995313
train_roc_auc,1.0


In [72]:
### Ada Boost with PCA using Logistic regression

#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression()

AdaModel_LR_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_PCA.fit(nathan_proj_train,label_train)
pred3 = Ada.predict(nathan_proj_test)

#### Scores

Scores3 = Output(pred3,label_test)
Scores3


[1mConfusion Matrix[0m
[[1189    0]
 [   8   60]]
[1m
Recall[0m
0.8823529411764706
[1m
Accuracy[0m
0.9936356404136834
[1m
F1 Score[0m
0.9375
[1m
ROC AUC Score[0m
0.9411764705882353


In [73]:
#### Cross Validation

av_df3 = CrossValidation(AdaModel_LR_PCA,nathan_proj_features,labels)
av_df3

Unnamed: 0,0
fit_time,0.769023
score_time,0.032443
test_recall,0.792453
train_recall,0.832536
test_accuracy,0.98568
train_accuracy,0.988653
test_f1,0.875
train_f1,0.901554
test_roc_auc,0.969355
train_roc_auc,0.991226


In [74]:
### Ada Boost [WITHOUT] PCA using Logistic regression

#########################################################################
#
# Ada Boost with PCA using Logistic regression
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.linear_model import LogisticRegression 

mylogregmodel = LogisticRegression(max_iter=10000)

AdaModel_LR_ = AdaBoostClassifier(n_estimators = 100,base_estimator = mylogregmodel ,learning_rate = 1)
Ada = AdaModel_LR_.fit(nathan_train,label_train)
pred4 = Ada.predict(nathan_test)

#### Scores

Scores4 = Output(pred4,label_test)
Scores4

[1mConfusion Matrix[0m
[[1186    3]
 [   4   64]]
[1m
Recall[0m
0.9411764705882353
[1m
Accuracy[0m
0.994431185361973
[1m
F1 Score[0m
0.9481481481481482
[1m
ROC AUC Score[0m
0.9693266709543363


In [75]:
#### Cross Validation

av_df4 = CrossValidation(AdaModel_LR_,nathan_features,labels)
av_df4

Unnamed: 0,0
fit_time,1.929114
score_time,0.067948
test_recall,0.849057
train_recall,0.913876
test_accuracy,0.988067
train_accuracy,0.992535
test_f1,0.9
train_f1,0.938575
test_roc_auc,0.981324
train_roc_auc,0.99859


In [76]:
### Ada Boost with PCA using Random Forests

#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_PCA = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_PCA.fit(nathan_proj_train,label_train)
pred5 = RF.predict(nathan_proj_test)

#### Scores

Scores5 = Output(pred5,label_test)
Scores5

[1mConfusion Matrix[0m
[[1189    0]
 [  10   58]]
[1m
Recall[0m
0.8529411764705882
[1m
Accuracy[0m
0.9920445505171042
[1m
F1 Score[0m
0.9206349206349206
[1m
ROC AUC Score[0m
0.9264705882352942


In [77]:
#### Cross Validation

av_df5 = CrossValidation(AdaModel_RF_PCA,nathan_proj_features,labels)
av_df5

Unnamed: 0,0
fit_time,0.395118
score_time,0.023188
test_recall,0.849057
train_recall,1.0
test_accuracy,0.986874
train_accuracy,1.0
test_f1,0.891089
train_f1,1.0
test_roc_auc,0.983932
train_roc_auc,1.0


In [78]:
### Ada Boost [WITHOUT] PCA using Random Forests

#########################################################################
#
# Ada Boost with PCA using Random Forests
#
#########################################################################
# Train Ada Boost model using PC transformed Morgan features

# n_estimators -> number of weak learners to train iteratively
# learning_rate -> It contributes to the weight of weak learners. It uses 1 as a default value
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100,criterion='gini',bootstrap = True, warm_start = False)

AdaModel_RF_ = AdaBoostClassifier(n_estimators = 100,base_estimator = rf ,learning_rate = 1)
RF = AdaModel_RF_.fit(nathan_train,label_train)
pred6 = RF.predict(nathan_test)

#### Scores

Scores6 = Output(pred6,label_test)
Scores6


[1mConfusion Matrix[0m
[[1189    0]
 [   7   61]]
[1m
Recall[0m
0.8970588235294118
[1m
Accuracy[0m
0.994431185361973
[1m
F1 Score[0m
0.9457364341085273
[1m
ROC AUC Score[0m
0.9485294117647058


In [79]:
#### Cross Validation

av_df6 = CrossValidation(AdaModel_RF_,nathan_features,labels)
av_df6

Unnamed: 0,0
fit_time,0.2952
score_time,0.025778
test_recall,0.849057
train_recall,1.0
test_accuracy,0.988067
train_accuracy,1.0
test_f1,0.9
train_f1,1.0
test_roc_auc,0.986889
train_roc_auc,1.0


In [3]:
result = pd.concat([N_Model1_PCA, N_Model2_,N_Model3_PCA,N_Model4_,N_Model5_PCA,N_Model6_], axis=1, join='inner')
result.columns = ['RF_PCA','RF_','C','D','E','F']
results

NameError: name 'pd' is not defined