In [1]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display,clear_output,HTML
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential

import researchpy as rp

%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)

from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel,f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_val_score

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import metrics

from xgboost import XGBClassifier
from pandas_ml import ConfusionMatrix

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef
from imblearn.over_sampling import SMOTE

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import featuretools as ft

from scipy import stats
import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Using TensorFlow backend.


In [2]:
### Function to plot Feature Importance
def feat_impt_plot(feat_impt,y_labels,width_s=1000,height_s=1000):
    
    m=pd.DataFrame(y_labels,feat_impt).reset_index()
    m.columns=['Feature_Importance','Features']
    m.sort_values(by='Feature_Importance',inplace=True,ascending=False)
    m['Features']=m['Features'].str.replace("dom_function_","")
    m['Features']=m['Features'].str.replace("js_function_","")
    m['Features']=m['Features'].apply(lambda x: str(x).lstrip('.'))
    m['Features']=m['Features'].str.replace("(","")
    
    data = [go.Bar(x=m.Feature_Importance.values,y=m.Features.values,text=np.round(m.Feature_Importance,4),
            textposition = 'outside',
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6,
            orientation='h'
        )]
    layout = go.Layout(autosize=False,
    width=width_s,
    height=height_s,
    xaxis=dict(title='Feature Importances',
        tickfont=dict(
            size=12,
            color='black'
        )),
    yaxis=dict(automargin=True))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [3]:
def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):
    
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    
    print("Accuracy of the model for training data is:",acc_train)
    print("Confusion Matrix for training data is:")
    cm_train=ConfusionMatrix(Y_train,y_pred_train)
    display(cm_train)
    
    
    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    cm_test=ConfusionMatrix(Y_test,y_pred_test)
    display(cm_test)

    fpr, tpr, threshold = roc_curve(Y_test, y_pred_test)
    roc_auc =auc(fpr, tpr)
    mcc=matthews_corrcoef(Y_test,y_pred_test)
    
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
    if tree==True:
        feat_impt_plot(model.feature_importances_,feat)
        return model,cm_train,cm_test,acc_train,acc_test,roc_auc,model.feature_importances_,mcc,precision,recall
    
    return model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

def KNN_model(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'K-Nearest Neighbors' + '\033[0m')
    knn=KNeighborsClassifier()
    knn.fit(X_train,Y_train)

    return model_build(knn,X_train,Y_train,X_test,Y_test)

# def Logistic_model(X_train,Y_train,X_test,Y_test):
#     print('\033[1m' + 'Logistic Regression' + '\033[0m')
#     log=LogisticRegression()
#     log.fit(X_train,Y_train)
    
#     return model_build(log,X_train,Y_train,X_test,Y_test)

def RandomForest(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'RandomForest Classifier' + '\033[0m')
    rf=RandomForestClassifier(random_state=0)
    rf.fit(X_train,Y_train)

    
    return model_build(rf,X_train,Y_train,X_test,Y_test,True,feat)

def AdaBoost(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'Adaoost Classifier' + '\033[0m')
    ada=AdaBoostClassifier(random_state=0)
    ada.fit(X_train,Y_train)
    
    return model_build(ada,X_train,Y_train,X_test,Y_test,True,feat)

def GradientBoosting(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'GradientBoosting Classifier' + '\033[0m')
    gbc=GradientBoostingClassifier(random_state=0)
    gbc.fit(X_train,Y_train)
    
    return model_build(gbc,X_train,Y_train,X_test,Y_test,True,feat)

def ExtraTree(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'ExtraTree Classifier' + '\033[0m')
    ext_tree=ExtraTreesClassifier(random_state=0)
    ext_tree.fit(X_train,Y_train)
    
    return model_build(ext_tree,X_train,Y_train,X_test,Y_test,True,feat)

def XGB(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'XGB Classifier' + '\033[0m')
    xgb=XGBClassifier(random_state=0)
    xgb.fit(X_train,Y_train)
    
    return model_build(xgb,X_train,Y_train,X_test,Y_test,True,feat)

def Bagging(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Bagging Classifier' + '\033[0m')
    bagging=BaggingClassifier(random_state=0)
    bagging.fit(X_train,Y_train)
    
    return model_build(bagging,X_train,Y_train,X_test,Y_test)

def XGB(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'XGB Classifier' + '\033[0m')
    xgb=XGBClassifier(random_state=0)
    xgb.fit(X_train,Y_train)
    
    return model_build(xgb,X_train,Y_train,X_test,Y_test,True,feat)

def DeepLearning1(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Neural Network-1' + '\033[0m')
    np.random.seed(0)
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=X_train.shape[1]))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    deep_learning.fit(X_train,Y_train,batch_size=20,epochs=80,verbose=False)
    acc_train=deep_learning.evaluate(X_train,Y_train,verbose=False)[1]
    print("The accuracy of the model on training data is:",acc_train)
    cm_train=ConfusionMatrix(Y_train,deep_learning.predict_classes(X_train,batch_size=1,verbose=False).reshape(1,len(X_train))[0])
    cm_test=ConfusionMatrix(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
    display(cm_train)
    acc_test=accuracy_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    print("The accuracy of the model on test data is:",acc_test)
    display(cm_test)
    mcc=matthews_corrcoef(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    precision=precision_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    recall=recall_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    return  deep_learning,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

In [4]:
### Read HTML 
df_html=pd.read_csv('../../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)
df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)
count_columns=[col for col in df_html.columns if col.endswith('count')]
df_html['total_count']=df_html[count_columns].sum(axis=1)

href_columns=[col for col in df_html.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_html['total_href']=df_html[href_columns].sum(axis=1)

total_img_src=[col for col in df_html.columns if 'img_src_' in col]
df_html['total_img_src']=df_html[total_img_src].sum(axis=1)


### Read JS 
df_js=pd.read_csv('../Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)
df_js.rename(columns={'Malicious':'Target'},inplace=True)
print("Shape of JS,DOM dataset:",df_js.shape)
df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)


### Read HTTP
df_http=pd.read_csv('../HTTP.csv')
df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)
cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)
df_http=df_http[df_http['domain']!=0]
df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)
df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)



### Read URL
df_url=pd.read_csv('../URL.csv')
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

Shape of HTML dataset: (43491, 1461)
Shape of JS,DOM dataset: (43294, 401)
Shape of HTTP dataset: (45856, 672)



Columns (76,77,229,230,231,232,233,234,235,236,237,238,239,240,241,242,244,245,246,247,248,249,250,251,252,253,254,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,695,696,697,

Shape of URL dataset: (46771, 4194)


In [5]:
df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)
print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of combined dataset: (39183, 6726)


0    34742
1     4441
Name: Target, dtype: int64

In [6]:
columns=['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_out_of_domain',
 'img_src_out_of_domain',
 'img_srcset_out_of_domain',
 'link_href_out_of_domain',
 'object_data_out_of_domain',
 'script_src_out_of_domain',
 'source_src_out_of_domain',
 'source_srcset_out_of_domain',
 'video_src_out_of_domain',
 'http_header_cache-control_set_max-age',
 'http_header_cache-control_set_must-revalidate',
 'http_header_cache-control_set_no-cache',
 'http_header_cache-control_set_no-store',
 'http_header_cache-control_set_private',
 'http_header_cache-control_set_public',
 'http_header_content-encoding_gzip',
 'http_header_content-language_text/html',
 'http_header_content-length',
 'http_header_server_apache',
 'http_header_server_nginx',
 'url_char_-',
 'url_char_.',
 'url_char_a',
 'url_char_b',
 'url_char_c',
 'url_char_d',
 'url_char_e',
 'url_char_f',
 'url_char_g',
 'url_char_h',
 'url_char_i',
 'url_char_j',
 'url_char_k',
 'url_char_l',
 'url_char_m',
 'url_char_n',
 'url_char_o',
 'url_char_p',
 'url_char_r',
 'url_char_s',
 'url_char_t',
 'url_char_u',
 'url_char_v',
 'url_char_w',
 'url_char_x',
 'url_char_y',
 'url_char_z',
 'url_extension_.com',
 'url_extension_.i',
 'url_extension_.net',
 'url_extensions',
 'url_length',
 'url_tlds',
 'url_words_with_length_4',
 'url_words_with_length_5',
 'url_words_with_length_6',
 'url_words_with_length_7',
 'url_words_with_length_8',
 'Target']

print("features being used")
display(columns)

features being used


['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_o

### <a id='1' style="text-decoration : none; color : #000000;"> 1. Initial Models</a>

In [7]:
df_sel=df[columns].copy(deep=True)

In [8]:
train=df_sel.iloc[:31346,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_sel.iloc[31346:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','XGB','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,XGB,Bagging,DeepLearning1]

## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9808907037580552
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27723    82    27805
True         517  3024     3541
__all__    28240  3106    31346

Accuracy of the model for test data is: 0.9743524307770831
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6902    35     6937
True         166   734      900
__all__     7068   769     7837



[1mRandomForest Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 0.997894468193709
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27801     4    27805
True          62  3479     3541
__all__    27863  3483    31346

Accuracy of the model for test data is: 0.9765216281740462
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6900    37     6937
True         147   753      900
__all__     7047   790     7837



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9728514004976712
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27454   351    27805
True         500  3041     3541
__all__    27954  3392    31346

Accuracy of the model for test data is: 0.9706520352175577
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6847    90     6937
True         140   760      900
__all__     6987   850     7837



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9797741338607797
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27647   158    27805
True         476  3065     3541
__all__    28123  3223    31346

Accuracy of the model for test data is: 0.9756284292458849
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6883    54     6937
True         137   763      900
__all__     7020   817     7837



[1mExtraTree Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27805     0    27805
True           0  3541     3541
__all__    27805  3541    31346

Accuracy of the model for test data is: 0.9746076304708434
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6904    33     6937
True         166   734      900
__all__     7070   767     7837



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9791679959165444
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27639   166    27805
True         487  3054     3541
__all__    28126  3220    31346

Accuracy of the model for test data is: 0.9755008293990047
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6882    55     6937
True         137   763      900
__all__     7019   818     7837



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9974797422318638
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27801     4    27805
True          75  3466     3541
__all__    27876  3470    31346

Accuracy of the model for test data is: 0.978435625877249
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6893    44     6937
True         125   775      900
__all__     7018   819     7837



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9996490780322849


Predicted  False  True  __all__
Actual                         
False      27803     2    27805
True           9  3532     3541
__all__    27812  3534    31346

The accuracy of the model on test data is: 0.9729488324614011


Predicted  False  True  __all__
Actual                         
False       6842    95     6937
True         117   783      900
__all__     6959   878     7837





### <a id='1.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [9]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [10]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.980891,3024.0,27723.0,82.0,517.0,0.002949,0.146004
RandomForestClassifier,0.997894,3479.0,27801.0,4.0,62.0,0.000144,0.017509
AdaBoostClassifier,0.972851,3041.0,27454.0,351.0,500.0,0.012624,0.141203
GradientBoostingClassifier,0.979774,3065.0,27647.0,158.0,476.0,0.005682,0.134425
ExtraTreeClassifier,1.0,3541.0,27805.0,0.0,0.0,0.0,0.0
XGB,0.979168,3054.0,27639.0,166.0,487.0,0.00597,0.137532
BaggingClassifier,0.99748,3466.0,27801.0,4.0,75.0,0.000144,0.02118
Neural Network-1,0.999649,3532.0,27803.0,2.0,9.0,7.2e-05,0.002542


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.974352,734.0,6902.0,35.0,166.0,0.005045,0.184444,0.905255,0.868669,0.954486,0.815556
RandomForestClassifier,0.976522,753.0,6900.0,37.0,147.0,0.005334,0.163333,0.915666,0.880372,0.953165,0.836667
AdaBoostClassifier,0.970652,760.0,6847.0,90.0,140.0,0.012974,0.155556,0.915735,0.852509,0.894118,0.844444
GradientBoostingClassifier,0.975628,763.0,6883.0,54.0,137.0,0.007784,0.152222,0.919997,0.876402,0.933905,0.847778
ExtraTreeClassifier,0.974608,734.0,6904.0,33.0,166.0,0.004757,0.184444,0.905399,0.869987,0.956975,0.815556
XGB,0.975501,763.0,6882.0,55.0,137.0,0.007928,0.152222,0.919925,0.875778,0.932763,0.847778
BaggingClassifier,0.978436,775.0,6893.0,44.0,125.0,0.006343,0.138889,0.927384,0.890855,0.946276,0.861111
Neural Network-1,0.972949,783.0,6842.0,95.0,117.0,0.013695,0.13,0.927384,0.865596,0.8918,0.87


### <a id='1.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [11]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='2' style="text-decoration : none; color : #000000;"> 2. OverSampling </a>

In [12]:
sm = SMOTE(random_state=12, ratio = {1:27805})
X_train, Y_train = sm.fit_sample(scaled_X_train,train_Y)


ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','XGB','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,XGB,Bagging,DeepLearning1]

## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9778996583348318
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      26578   1227    27805
True           2  27803    27805
__all__    26580  29030    55610

Accuracy of the model for test data is: 0.9376036748755902
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6515   422     6937
True          67   833      900
__all__     6582  1255     7837



[1mRandomForest Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 0.9994065815500809
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      27802      3    27805
True          30  27775    27805
__all__    27832  27778    55610

Accuracy of the model for test data is: 0.9761388286334056
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6882    55     6937
True         132   768      900
__all__     7014   823     7837



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9720194209674519
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      26951    854    27805
True         702  27103    27805
__all__    27653  27957    55610

Accuracy of the model for test data is: 0.9552124537450555
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6693   244     6937
True         107   793      900
__all__     6800  1037     7837



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9825930588023737
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      27370    435    27805
True         533  27272    27805
__all__    27903  27707    55610

Accuracy of the model for test data is: 0.9695036365956361
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6811   126     6937
True         113   787      900
__all__     6924   913     7837



[1mExtraTree Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      27805      0    27805
True           0  27805    27805
__all__    27805  27805    55610

Accuracy of the model for test data is: 0.9766492280209264
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6888    49     6937
True         134   766      900
__all__     7022   815     7837



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9824132350296709
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      27363    442    27805
True         536  27269    27805
__all__    27899  27711    55610

Accuracy of the model for test data is: 0.9692484369018757
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6811   126     6937
True         115   785      900
__all__     6926   911     7837



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9988491278547024
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      27799      6    27805
True          58  27747    27805
__all__    27857  27753    55610

Accuracy of the model for test data is: 0.9746076304708434
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6861    76     6937
True         123   777      900
__all__     6984   853     7837



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9984714979320266


Predicted  False   True  __all__
Actual                          
False      27749     56    27805
True          29  27776    27805
__all__    27778  27832    55610

The accuracy of the model on test data is: 0.9678448385861937


Predicted  False  True  __all__
Actual                         
False       6784   153     6937
True          99   801      900
__all__     6883   954     7837





### <a id='2.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [13]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [14]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.9779,27803.0,26578.0,1227.0,2.0,0.044129,7.2e-05
RandomForestClassifier,0.999407,27775.0,27802.0,3.0,30.0,0.000108,0.001079
AdaBoostClassifier,0.972019,27103.0,26951.0,854.0,702.0,0.030714,0.025247
GradientBoostingClassifier,0.982593,27272.0,27370.0,435.0,533.0,0.015645,0.019169
ExtraTreeClassifier,1.0,27805.0,27805.0,0.0,0.0,0.0,0.0
XGB,0.982413,27269.0,27363.0,442.0,536.0,0.015896,0.019277
BaggingClassifier,0.998849,27747.0,27799.0,6.0,58.0,0.000216,0.002086
Neural Network-1,0.998471,27776.0,27749.0,56.0,29.0,0.002014,0.001043


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.937604,833.0,6515.0,422.0,67.0,0.060833,0.074444,0.932361,0.751767,0.663745,0.925556
RandomForestClassifier,0.976139,768.0,6882.0,55.0,132.0,0.007928,0.146667,0.922702,0.879203,0.933171,0.853333
AdaBoostClassifier,0.955212,793.0,6693.0,244.0,107.0,0.035174,0.118889,0.922969,0.795979,0.764706,0.881111
GradientBoostingClassifier,0.969504,787.0,6811.0,126.0,113.0,0.018163,0.125556,0.92814,0.850961,0.861993,0.874444
ExtraTreeClassifier,0.976649,766.0,6888.0,49.0,134.0,0.007064,0.148889,0.922024,0.881586,0.939877,0.851111
XGB,0.969248,785.0,6811.0,126.0,115.0,0.018163,0.127778,0.927029,0.849561,0.86169,0.872222
BaggingClassifier,0.974608,777.0,6861.0,76.0,123.0,0.010956,0.136667,0.926189,0.872595,0.910903,0.863333
Neural Network-1,0.967845,801.0,6784.0,153.0,99.0,0.022056,0.11,0.926189,0.846323,0.839623,0.89


### <a id='2.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [15]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='3' style="text-decoration : none; color : #000000;">3. UnderSampling </a>

In [16]:
df_non=train[train.Target==0]
df_mal=train[train.Target==1]

df_non=df_non.sample(frac=len(df_mal)/len(df_non),random_state=0)
df_u=df_non.append(df_mal)
df_u.Target.value_counts() 

X_train=df_u[columns]
X_train=scaler.transform(X_train)
Y_train=df_u.Target.values

print("Shape after undersampling",X_train.shape)

## Table to store training and test measures so we can compare later
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

Shape after undersampling (7082, 99)
[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9514261508048574
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3430   111     3541
True         233  3308     3541
__all__     3663  3419     7082

Accuracy of the model for test data is: 0.9470460635447238
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6608   329     6937
True          86   814      900
__all__     6694  1143     7837



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9959051115504095
Confusion Matrix for training data is:



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Predicted  False  True  __all__
Actual                         
False       3540     1     3541
True          28  3513     3541
__all__     3568  3514     7082

Accuracy of the model for test data is: 0.9584024499170601
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6684   253     6937
True          73   827      900
__all__     6757  1080     7837



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9385766732561424
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3312   229     3541
True         206  3335     3541
__all__     3518  3564     7082

Accuracy of the model for test data is: 0.9376036748755902
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6505   432     6937
True          57   843      900
__all__     6562  1275     7837



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9559446484044055
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3396   145     3541
True         167  3374     3541
__all__     3563  3519     7082

Accuracy of the model for test data is: 0.9526604568074518
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6621   316     6937
True          55   845      900
__all__     6676  1161     7837



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Predicted  False  True  __all__
Actual                         
False       3541     0     3541
True           0  3541     3541
__all__     3541  3541     7082

Accuracy of the model for test data is: 0.9580196503764196
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6679   258     6937
True          71   829      900
__all__     6750  1087     7837



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.954250211804575
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3388   153     3541
True         171  3370     3541
__all__     3559  3523     7082

Accuracy of the model for test data is: 0.9525328569605717
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6624   313     6937
True          59   841      900
__all__     6683  1154     7837



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9960463146003954
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3537     4     3541
True          24  3517     3541
__all__     3561  3521     7082

Accuracy of the model for test data is: 0.9569988516013781
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6667   270     6937
True          67   833      900
__all__     6734  1103     7837



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9998587969500141


Predicted  False  True  __all__
Actual                         
False       3540     1     3541
True           0  3541     3541
__all__     3540  3542     7082

The accuracy of the model on test data is: 0.9414316702819957


Predicted  False  True  __all__
Actual                         
False       6546   391     6937
True          68   832      900
__all__     6614  1223     7837





### <a id='3.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [17]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [18]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.951426,3308.0,3430.0,111.0,233.0,0.031347,0.065801
RandomForestClassifier,0.995905,3513.0,3540.0,1.0,28.0,0.000282,0.007907
AdaBoostClassifier,0.938577,3335.0,3312.0,229.0,206.0,0.064671,0.058176
GradientBoostingClassifier,0.955945,3374.0,3396.0,145.0,167.0,0.040949,0.047162
ExtraTreeClassifier,1.0,3541.0,3541.0,0.0,0.0,0.0,0.0
XGB,0.95425,3370.0,3388.0,153.0,171.0,0.043208,0.048291
BaggingClassifier,0.996046,3517.0,3537.0,4.0,24.0,0.00113,0.006778
Neural Network-1,0.999859,3541.0,3540.0,1.0,0.0,0.000282,0.0


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.947046,814.0,6608.0,329.0,86.0,0.047427,0.095556,0.928509,0.77416,0.712161,0.904444
RandomForestClassifier,0.958402,827.0,6684.0,253.0,73.0,0.036471,0.081111,0.941209,0.816192,0.765741,0.918889
AdaBoostClassifier,0.937604,843.0,6505.0,432.0,57.0,0.062275,0.063333,0.937196,0.755336,0.661176,0.936667
GradientBoostingClassifier,0.95266,845.0,6621.0,316.0,55.0,0.045553,0.061111,0.946668,0.801766,0.727821,0.938889
ExtraTreeClassifier,0.95802,829.0,6679.0,258.0,71.0,0.037192,0.078889,0.94196,0.815367,0.762649,0.921111
XGB,0.952533,841.0,6624.0,313.0,59.0,0.04512,0.065556,0.944662,0.800163,0.728769,0.934444
BaggingClassifier,0.956999,833.0,6667.0,270.0,67.0,0.038922,0.074444,0.943317,0.812881,0.755213,0.925556
Neural Network-1,0.941432,832.0,6546.0,391.0,68.0,0.056364,0.075556,0.943317,0.762643,0.680294,0.924444


### <a id='3.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [19]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='4' style="text-decoration : none; color : #000000;">4. Feature Transformation with feature selection </a>

In [8]:
df.reset_index(inplace=True)
columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

Built 19503 features
EntitySet scattered to workers in 8.780 seconds
Elapsed: 04:38 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


Task exception was never retrieved
future: <Task finished coro=<BaseTCPConnector.connect() done, defined at /nethomes/darshan/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py:341> exception=CommClosedError('in <distributed.comm.tcp.TCPConnector object at 0x7fdf98c7e358>: ConnectionRefusedError: [Errno 111] Connection refused',)>
Traceback (most recent call last):
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py", line 348, in connect
    ip, port, max_buffer_size=MAX_BUFFER_SIZE, **kwargs
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/tornado/tcpclient.py", line 280, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/tornado/tcpclient.py", line 143, in on_connect_done
    stream = future.result()
tornado.iostream.StreamClosedError: Stream is closed

During handling of the above exception, another exception occurred:

Tracebac

In [9]:
df.set_index(['domain'],drop=True,inplace=True)

In [10]:
fs=pd.concat([feature_matrix_sessions,pd.DataFrame(df_sel.Target)],axis=1)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [11]:
del feature_matrix_sessions

fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)
fs=fs.reindex(df.index.values)
X=fs.loc[:,fs.columns!='Target']

y=fs.Target.values
feature_name = X.columns.tolist()

In [16]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X, y)

X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()

rfe_selector = RFE(estimator=LogisticRegression(random_state=0), n_features_to_select=100, step=0.3, verbose=False)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0))
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()













In [17]:
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

In [20]:
X=fs[feature_selection_df[feature_selection_df.Total>=3]['Feature']]
X.fillna(value=0,inplace=True)
y=fs.Target.values
feature_name = X.columns.tolist()

scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

print("The train/test split ratio is 70:30")
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,y,random_state=0,test_size=0.3)
print('Features we will be using:\n')
display(feature_name)

The train/test split ratio is 70:30
Features we will be using:



['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']

In [None]:
fs=fs.reindex(df_sel.index.values)

In [24]:
train=fs.iloc[:31346]
train_Y=train.Target.values
train_X=train[feature_name].values


scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=fs.iloc[31346:]
test_Y=test.Target.values
test_X=test[feature_name]

scaled_X_test=scaler.transform(test_X)

table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,feature_name)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9776047980603586
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27629   176    27805
True         526  3015     3541
__all__    28155  3191    31346

Accuracy of the model for test data is: 0.9702692356769171
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6866    71     6937
True         162   738      900
__all__     7028   809     7837



[1mRandomForest Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 0.9973840362406686
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27800     5    27805
True          77  3464     3541
__all__    27877  3469    31346

Accuracy of the model for test data is: 0.976394028327166
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6893    44     6937
True         141   759      900
__all__     7034   803     7837



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9680980029349837
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27512   293    27805
True         707  2834     3541
__all__    28219  3127    31346

Accuracy of the model for test data is: 0.9622304453234656
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6850    87     6937
True         209   691      900
__all__     7059   778     7837



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9779876220251388
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27621   184    27805
True         506  3035     3541
__all__    28127  3219    31346

Accuracy of the model for test data is: 0.9723108332270002
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6875    62     6937
True         155   745      900
__all__     7030   807     7837



[1mExtraTree Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 0.9998723920117399
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27805     0    27805
True           4  3537     3541
__all__    27809  3537    31346

Accuracy of the model for test data is: 0.9734592318489218
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6881    56     6937
True         152   748      900
__all__     7033   804     7837



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.976520130160148
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27603   202    27805
True         534  3007     3541
__all__    28137  3209    31346

Accuracy of the model for test data is: 0.9725660329207605
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6875    62     6937
True         153   747      900
__all__     7028   809     7837



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9972245262553436
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27794    11    27805
True          76  3465     3541
__all__    27870  3476    31346

Accuracy of the model for test data is: 0.9746076304708434
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6881    56     6937
True         143   757      900
__all__     7024   813     7837



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9909079308364703


Predicted  False  True  __all__
Actual                         
False      27753    52    27805
True         233  3308     3541
__all__    27986  3360    31346

The accuracy of the model on test data is: 0.9710348347581983


Predicted  False  True  __all__
Actual                         
False       6846    91     6937
True         136   764      900
__all__     6982   855     7837





### <a id='4.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [25]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [26]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.977605,3015.0,27629.0,176.0,526.0,0.00633,0.148546
RandomForestClassifier,0.997384,3464.0,27800.0,5.0,77.0,0.00018,0.021745
AdaBoostClassifier,0.968098,2834.0,27512.0,293.0,707.0,0.010538,0.199661
GradientBoostingClassifier,0.977988,3035.0,27621.0,184.0,506.0,0.006618,0.142897
ExtraTreeClassifier,0.999872,3537.0,27805.0,0.0,4.0,0.0,0.00113
XGB,0.97652,3007.0,27603.0,202.0,534.0,0.007265,0.150805
BaggingClassifier,0.997225,3465.0,27794.0,11.0,76.0,0.000396,0.021463
Neural Network-1,0.990908,3308.0,27753.0,52.0,233.0,0.00187,0.065801


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.970269,738.0,6866.0,71.0,162.0,0.010235,0.18,0.904883,0.848547,0.912237,0.82
RandomForestClassifier,0.976394,759.0,6893.0,44.0,141.0,0.006343,0.156667,0.918495,0.879972,0.945205,0.843333
AdaBoostClassifier,0.96223,691.0,6850.0,87.0,209.0,0.012541,0.232222,0.877618,0.805246,0.888175,0.767778
GradientBoostingClassifier,0.972311,745.0,6875.0,62.0,155.0,0.008938,0.172222,0.90942,0.858997,0.923172,0.827778
ExtraTreeClassifier,0.973459,748.0,6881.0,56.0,152.0,0.008073,0.168889,0.911519,0.864826,0.930348,0.831111
XGB,0.972566,747.0,6875.0,62.0,153.0,0.008938,0.17,0.910531,0.860386,0.923362,0.83
BaggingClassifier,0.974608,757.0,6881.0,56.0,143.0,0.008073,0.158889,0.916519,0.871033,0.931119,0.841111
Neural Network-1,0.971035,764.0,6846.0,91.0,136.0,0.013118,0.151111,0.916519,0.854715,0.893567,0.848889


### <a id='4.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [27]:
feat_impt_plot(np.average(feat_imp,axis=0),feature_name)

### <a id='5' style="text-decoration : none; color : #000000;">5. Feature Transformation with PCA </a>

In [None]:
del df

In [42]:
%whos

Variable                     Type                     Data/Info
---------------------------------------------------------------
AdaBoost                     function                 <function AdaBoost at 0x7fdfb36ac1e0>
AdaBoostClassifier           ABCMeta                  <class 'sklearn.ensemble.<...>ting.AdaBoostClassifier'>
Bagging                      function                 <function Bagging at 0x7fdfb36acae8>
BaggingClassifier            ABCMeta                  <class 'sklearn.ensemble.<...>gging.BaggingClassifier'>
ConfusionMatrix              type                     <class 'pandas_ml.confusi<...>trix.cm.ConfusionMatrix'>
DeepLearning1                function                 <function DeepLearning1 at 0x7fdfb36aca60>
Dense                        type                     <class 'keras.layers.core.Dense'>
Dropout                      type                     <class 'keras.layers.core.Dropout'>
ExtraTree                    function                 <function ExtraTree at 0x7fdfb

In [35]:
del scaled_X,X

In [37]:
X=fs.loc[:,fs.columns!='Target']
y=fs.Target.values
feature_name = X.columns.tolist()
X=np.nan_to_num(X)
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

In [43]:
import gc
gc.collect()

9128

In [None]:
sns.set_style('whitegrid')
pca = PCA(random_state=0)
pca.fit(scaled_X)
plt.figure(1, figsize=(10, 8))
plt.plot(np.cumsum(pca.explained_variance_ratio_),'--o', linewidth=2)
plt.axis('tight')
plt.xlabel('Number of Components',size=14)
plt.xticks(size=12)
plt.xlim([0,200])
plt.ylabel('% Explained Variance',size=14)
plt.yticks(size=12)
plt.title('% Variance Explained by Principal Components',size=14,y=1.05)
plt.show()

In [52]:
pca=PCA(n_components=750,whiten=True,random_state=0)
X_pca_21=pca.fit_transform(scaled_X)
print("Variance explained using 750 components is:",sum(pca.explained_variance_ratio_))

Variance explained using 750 components is: 0.8021136908620577


In [54]:
X_train=X_pca_21[:31346]
Y_train=y[0:31346]

In [55]:
X_test=X_pca_21[31346:]
Y_test=y[31346:]

## Table to store training and test measures so we can compare later
table_80_70_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_70_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','mcc'])

In [57]:
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,X_test,Y_test,np.arange(0,750).astype(str))
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,X_test,Y_test)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9739998723920117
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27772    33    27805
True         782  2759     3541
__all__    28554  2792    31346

Accuracy of the model for test data is: 0.9656756411892305
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6926    11     6937
True         258   642      900
__all__     7184   653     7837



[1mRandomForest Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 0.9963950743316532
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27804     1    27805
True         112  3429     3541
__all__    27916  3430    31346

Accuracy of the model for test data is: 0.9626132448641062
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6897    40     6937
True         253   647      900
__all__     7150   687     7837



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9678427869584636
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27404   401    27805
True         607  2934     3541
__all__    28011  3335    31346

Accuracy of the model for test data is: 0.9642720428735485
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6822   115     6937
True         165   735      900
__all__     6987   850     7837



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9781471320104639
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27645   160    27805
True         525  3016     3541
__all__    28170  3176    31346

Accuracy of the model for test data is: 0.9691208370549955
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6861    76     6937
True         166   734      900
__all__     7027   810     7837



[1mExtraTree Classifier[0m



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27805     0    27805
True           0  3541     3541
__all__    27805  3541    31346

Accuracy of the model for test data is: 0.9321168814597423
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6929     8     6937
True         524   376      900
__all__     7453   384     7837



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9789765839341543
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27664   141    27805
True         518  3023     3541
__all__    28182  3164    31346

Accuracy of the model for test data is: 0.971545234145719
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6872    65     6937
True         158   742      900
__all__     7030   807     7837



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9970012122758884
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27801     4    27805
True          90  3451     3541
__all__    27891  3455    31346

Accuracy of the model for test data is: 0.9681000382799541
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6863    74     6937
True         176   724      900
__all__     7039   798     7837



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9998723920117399


Predicted  False  True  __all__
Actual                         
False      27804     1    27805
True           3  3538     3541
__all__    27807  3539    31346

The accuracy of the model on test data is: 0.9744800306239633


Predicted  False  True  __all__
Actual                         
False       6847    90     6937
True         110   790      900
__all__     6957   880     7837





In [58]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [59]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.974,2759.0,27772.0,33.0,782.0,0.001187,0.220842
RandomForestClassifier,0.996395,3429.0,27804.0,1.0,112.0,3.6e-05,0.031629
AdaBoostClassifier,0.967843,2934.0,27404.0,401.0,607.0,0.014422,0.171421
GradientBoostingClassifier,0.978147,3016.0,27645.0,160.0,525.0,0.005754,0.148263
ExtraTreeClassifier,1.0,3541.0,27805.0,0.0,0.0,0.0,0.0
XGB,0.978977,3023.0,27664.0,141.0,518.0,0.005071,0.146286
BaggingClassifier,0.997001,3451.0,27801.0,4.0,90.0,0.000144,0.025417
Neural Network-1,0.999872,3538.0,27804.0,1.0,3.0,3.6e-05,0.000847


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.965676,642.0,6926.0,11.0,258.0,0.001586,0.286667,0.855874,0.821095,0.983155,0.713333
RandomForestClassifier,0.962613,647.0,6897.0,40.0,253.0,0.005766,0.281111,0.856561,0.80397,0.941776,0.718889
AdaBoostClassifier,0.964272,735.0,6822.0,115.0,165.0,0.016578,0.183333,0.900044,0.820334,0.864706,0.816667
GradientBoostingClassifier,0.969121,734.0,6861.0,76.0,166.0,0.010956,0.184444,0.9023,0.842674,0.906173,0.815556
ExtraTreeClassifier,0.932117,376.0,6929.0,8.0,524.0,0.001153,0.582222,0.708312,0.615348,0.979167,0.417778
XGB,0.971545,742.0,6872.0,65.0,158.0,0.00937,0.175556,0.907537,0.855047,0.919455,0.824444
BaggingClassifier,0.9681,724.0,6863.0,74.0,176.0,0.010667,0.195556,0.896889,0.836852,0.907268,0.804444
Neural Network-1,0.97448,790.0,6847.0,90.0,110.0,0.012974,0.122222,0.896889,0.873318,0.897727,0.877778


In [60]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')