In [1]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display,clear_output,HTML
import warnings
warnings.filterwarnings("ignore")

import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential

import researchpy as rp

%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)

from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel,f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_val_score

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import metrics

from xgboost import XGBClassifier


from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef

from pandas_ml import ConfusionMatrix


from imblearn.over_sampling import SMOTE

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import featuretools as ft

from scipy import stats
import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Using TensorFlow backend.


In [2]:
### Function to plot Feature Importance
def feat_impt_plot(feat_impt,y_labels,width_s=1000,height_s=1000):
    
    m=pd.DataFrame(y_labels,feat_impt).reset_index()
    m.columns=['Feature_Importance','Features']
    m.sort_values(by='Feature_Importance',inplace=True,ascending=False)
    m['Features']=m['Features'].str.replace("dom_function_","")
    m['Features']=m['Features'].str.replace("js_function_","")
    m['Features']=m['Features'].apply(lambda x: str(x).lstrip('.'))
    m['Features']=m['Features'].str.replace("(","")
    
    data = [go.Bar(x=m.Feature_Importance.values,y=m.Features.values,text=np.round(m.Feature_Importance,4),
            textposition = 'outside',
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6,
            orientation='h'
        )]
    layout = go.Layout(autosize=False,
    width=width_s,
    height=height_s,
    xaxis=dict(title='Feature Importances',
        tickfont=dict(
            size=12,
            color='black'
        )),
    yaxis=dict(automargin=True))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [3]:
def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):
    
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    
    print("Accuracy of the model for training data is:",acc_train)
    print("Confusion Matrix for training data is:")
    cm_train=ConfusionMatrix(Y_train,y_pred_train)
    display(cm_train)
    
    
    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    cm_test=ConfusionMatrix(Y_test,y_pred_test)
    display(cm_test)

    fpr, tpr, threshold = roc_curve(Y_test, y_pred_test)
    roc_auc =auc(fpr, tpr)
    mcc=matthews_corrcoef(Y_test,y_pred_test)
    
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
    if tree==True:
        feat_impt_plot(model.feature_importances_,feat)
        return model,cm_train,cm_test,acc_train,acc_test,roc_auc,model.feature_importances_,mcc,precision,recall
    
    return model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

def KNN_model(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'K-Nearest Neighbors' + '\033[0m')
    knn=KNeighborsClassifier()
    knn.fit(X_train,Y_train)

    return model_build(knn,X_train,Y_train,X_test,Y_test)

# def Logistic_model(X_train,Y_train,X_test,Y_test):
#     print('\033[1m' + 'Logistic Regression' + '\033[0m')
#     log=LogisticRegression()
#     log.fit(X_train,Y_train)
    
#     return model_build(log,X_train,Y_train,X_test,Y_test)

def RandomForest(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'RandomForest Classifier' + '\033[0m')
    rf=RandomForestClassifier(random_state=0)
    rf.fit(X_train,Y_train)

    
    return model_build(rf,X_train,Y_train,X_test,Y_test,True,feat)

def AdaBoost(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'Adaoost Classifier' + '\033[0m')
    ada=AdaBoostClassifier(random_state=0)
    ada.fit(X_train,Y_train)
    
    return model_build(ada,X_train,Y_train,X_test,Y_test,True,feat)

def GradientBoosting(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'GradientBoosting Classifier' + '\033[0m')
    gbc=GradientBoostingClassifier(random_state=0)
    gbc.fit(X_train,Y_train)
    
    return model_build(gbc,X_train,Y_train,X_test,Y_test,True,feat)

def ExtraTree(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'ExtraTree Classifier' + '\033[0m')
    ext_tree=ExtraTreesClassifier(random_state=0)
    ext_tree.fit(X_train,Y_train)
    
    return model_build(ext_tree,X_train,Y_train,X_test,Y_test,True,feat)

def XGB(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'XGB Classifier' + '\033[0m')
    xgb=XGBClassifier(random_state=0)
    xgb.fit(X_train,Y_train)
    
    return model_build(xgb,X_train,Y_train,X_test,Y_test,True,feat)

def Bagging(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Bagging Classifier' + '\033[0m')
    bagging=BaggingClassifier(random_state=0)
    bagging.fit(X_train,Y_train)
    
    return model_build(bagging,X_train,Y_train,X_test,Y_test)

def XGB(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'XGB Classifier' + '\033[0m')
    xgb=XGBClassifier(random_state=0)
    xgb.fit(X_train,Y_train)
    
    return model_build(xgb,X_train,Y_train,X_test,Y_test,True,feat)

def DeepLearning1(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Neural Network-1' + '\033[0m')
    np.random.seed(0)
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=X_train.shape[1]))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    deep_learning.fit(X_train,Y_train,batch_size=20,epochs=80,verbose=False)
    acc_train=deep_learning.evaluate(X_train,Y_train,verbose=False)[1]
    print("The accuracy of the model on training data is:",acc_train)
    cm_train=ConfusionMatrix(Y_train,deep_learning.predict_classes(X_train,batch_size=1,verbose=False).reshape(1,len(X_train))[0])
    cm_test=ConfusionMatrix(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
    display(cm_train)
    acc_test=accuracy_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    print("The accuracy of the model on test data is:",acc_test)
    display(cm_test)
    mcc=matthews_corrcoef(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    precision=precision_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    recall=recall_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    return  deep_learning,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

In [5]:
### Read HTML 
df_html=pd.read_csv('../../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)
df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)
count_columns=[col for col in df_html.columns if col.endswith('count')]
df_html['total_count']=df_html[count_columns].sum(axis=1)

href_columns=[col for col in df_html.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_html['total_href']=df_html[href_columns].sum(axis=1)

total_img_src=[col for col in df_html.columns if 'img_src_' in col]
df_html['total_img_src']=df_html[total_img_src].sum(axis=1)


### Read JS 
df_js=pd.read_csv('../Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)
df_js.rename(columns={'Malicious':'Target'},inplace=True)
print("Shape of JS,DOM dataset:",df_js.shape)
df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)


### Read HTTP
df_http=pd.read_csv('../HTTP.csv')
df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)
cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)
df_http=df_http[df_http['domain']!=0]
df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)
df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)



### Read URL
df_url=pd.read_csv('../URL.csv')
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)
print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of HTML dataset: (43491, 1461)
Shape of JS,DOM dataset: (43294, 401)
Shape of HTTP dataset: (45856, 672)
Shape of URL dataset: (46771, 4194)
Shape of combined dataset: (39183, 6726)


0    34742
1     4441
Name: Target, dtype: int64

In [6]:
columns=['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_out_of_domain',
 'img_src_out_of_domain',
 'img_srcset_out_of_domain',
 'link_href_out_of_domain',
 'object_data_out_of_domain',
 'script_src_out_of_domain',
 'source_src_out_of_domain',
 'source_srcset_out_of_domain',
 'video_src_out_of_domain',
 'http_header_cache-control_set_max-age',
 'http_header_cache-control_set_must-revalidate',
 'http_header_cache-control_set_no-cache',
 'http_header_cache-control_set_no-store',
 'http_header_cache-control_set_private',
 'http_header_cache-control_set_public',
 'http_header_content-encoding_gzip',
 'http_header_content-language_text/html',
 'http_header_content-length',
 'http_header_server_apache',
 'http_header_server_nginx',
 'url_char_-',
 'url_char_.',
 'url_char_a',
 'url_char_b',
 'url_char_c',
 'url_char_d',
 'url_char_e',
 'url_char_f',
 'url_char_g',
 'url_char_h',
 'url_char_i',
 'url_char_j',
 'url_char_k',
 'url_char_l',
 'url_char_m',
 'url_char_n',
 'url_char_o',
 'url_char_p',
 'url_char_r',
 'url_char_s',
 'url_char_t',
 'url_char_u',
 'url_char_v',
 'url_char_w',
 'url_char_x',
 'url_char_y',
 'url_char_z',
 'url_extension_.com',
 'url_extension_.i',
 'url_extension_.net',
 'url_extensions',
 'url_length',
 'url_tlds',
 'url_words_with_length_4',
 'url_words_with_length_5',
 'url_words_with_length_6',
 'url_words_with_length_7',
 'url_words_with_length_8',
 'Target']

print("features being used")
display(columns)

features being used


['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_o

### <a id='1' style="text-decoration : none; color : #000000;"> 1. Initial Models</a>

In [7]:
df_sel=df[columns].copy(deep=True)

train=df_sel.iloc[:27428,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_sel.iloc[27428:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','XGB','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,XGB,Bagging,DeepLearning1]

## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9804943852996938
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24257    70    24327
True         465  2636     3101
__all__    24722  2706    27428

Accuracy of the model for test data is: 0.9736282433007231
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10369    46    10415
True         264  1076     1340
__all__    10633  1122    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9978489135190316
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24325     2    24327
True          57  3044     3101
__all__    24382  3046    27428

Accuracy of the model for test data is: 0.9767758400680562
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10360    55    10415
True         218  1122     1340
__all__    10578  1177    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.971306693889456
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24053   274    24327
True         513  2588     3101
__all__    24566  2862    27428

Accuracy of the model for test data is: 0.9676733304976606
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10281   134    10415
True         246  1094     1340
__all__    10527  1228    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9802756307423072
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24191   136    24327
True         405  2696     3101
__all__    24596  2832    27428

Accuracy of the model for test data is: 0.976095278604849
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10339    76    10415
True         205  1135     1340
__all__    10544  1211    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           0  3101     3101
__all__    24327  3101    27428

Accuracy of the model for test data is: 0.9752445767758401
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10371    44    10415
True         247  1093     1340
__all__    10618  1137    11755



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9795464488843517
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24186   141    24327
True         420  2681     3101
__all__    24606  2822    27428

Accuracy of the model for test data is: 0.9764355593364525
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10342    73    10415
True         204  1136     1340
__all__    10546  1209    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9976301589616451
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24325     2    24327
True          63  3038     3101
__all__    24388  3040    27428

Accuracy of the model for test data is: 0.9783071033602723
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10354    61    10415
True         194  1146     1340
__all__    10548  1207    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9978489135190316


Predicted  False  True  __all__
Actual                         
False      24307    20    24327
True          39  3062     3101
__all__    24346  3082    27428

The accuracy of the model on test data is: 0.968098681412165


Predicted  False  True  __all__
Actual                         
False      10266   149    10415
True         226  1114     1340
__all__    10492  1263    11755





### <a id='1.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [8]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [9]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.980494,2636.0,24257.0,70.0,465.0,0.002877,0.149952
RandomForestClassifier,0.997849,3044.0,24325.0,2.0,57.0,8.2e-05,0.018381
AdaBoostClassifier,0.971307,2588.0,24053.0,274.0,513.0,0.011263,0.165431
GradientBoostingClassifier,0.980276,2696.0,24191.0,136.0,405.0,0.00559,0.130603
ExtraTreeClassifier,1.0,3101.0,24327.0,0.0,0.0,0.0,0.0
XGB,0.979546,2681.0,24186.0,141.0,420.0,0.005796,0.13544
BaggingClassifier,0.99763,3038.0,24325.0,2.0,63.0,8.2e-05,0.020316
Neural Network-1,0.997849,3062.0,24307.0,20.0,39.0,0.000822,0.012577


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.973628,1076.0,10369.0,46.0,264.0,0.004417,0.197015,0.899284,0.863714,0.959002,0.802985
RandomForestClassifier,0.976776,1122.0,10360.0,55.0,218.0,0.005281,0.162687,0.916016,0.880912,0.953271,0.837313
AdaBoostClassifier,0.967673,1094.0,10281.0,134.0,246.0,0.012866,0.183582,0.901776,0.834919,0.890879,0.816418
GradientBoostingClassifier,0.976095,1135.0,10339.0,76.0,205.0,0.007297,0.152985,0.919859,0.877891,0.937242,0.847015
ExtraTreeClassifier,0.975245,1093.0,10371.0,44.0,247.0,0.004225,0.184328,0.905723,0.87245,0.961302,0.815672
XGB,0.976436,1136.0,10342.0,73.0,204.0,0.007009,0.152239,0.920376,0.879616,0.93962,0.847761
BaggingClassifier,0.978307,1146.0,10354.0,61.0,194.0,0.005857,0.144776,0.924683,0.88928,0.949461,0.855224
Neural Network-1,0.968099,1114.0,10266.0,149.0,226.0,0.014306,0.168657,0.924683,0.83848,0.882027,0.831343


### <a id='1.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [10]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='2' style="text-decoration : none; color : #000000;"> 2. OverSampling </a>

In [11]:
sm = SMOTE(random_state=12, ratio = {1:24327})
X_train, Y_train = sm.fit_sample(scaled_X_train,train_Y)


ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','XGB','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,XGB,Bagging,DeepLearning1]

## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.977720228552637
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      23245   1082    24327
True           2  24325    24327
__all__    23247  25407    48654

Accuracy of the model for test data is: 0.9365376435559336
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9772   643    10415
True         103  1237     1340
__all__     9875  1880    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9994450610432852
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      24322      5    24327
True          22  24305    24327
__all__    24344  24310    48654

Accuracy of the model for test data is: 0.9755848575074436
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10329    86    10415
True         201  1139     1340
__all__    10530  1225    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.973301270193612
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      23632    695    24327
True         604  23723    24327
__all__    24236  24418    48654

Accuracy of the model for test data is: 0.9586558911101659
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10088   327    10415
True         159  1181     1340
__all__    10247  1508    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9832490648250914
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      23955    372    24327
True         443  23884    24327
__all__    24398  24256    48654

Accuracy of the model for test data is: 0.9695448745214802
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10224   191    10415
True         167  1173     1340
__all__    10391  1364    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      24327      0    24327
True           0  24327    24327
__all__    24327  24327    48654

Accuracy of the model for test data is: 0.9753296469587409
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10332    83    10415
True         207  1133     1340
__all__    10539  1216    11755



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9832901714144777
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      23969    358    24327
True         455  23872    24327
__all__    24424  24230    48654

Accuracy of the model for test data is: 0.9698851552530838
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10226   189    10415
True         165  1175     1340
__all__    10391  1364    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9988901220865705
Confusion Matrix for training data is:


Predicted  False   True  __all__
Actual                          
False      24322      5    24327
True          49  24278    24327
__all__    24371  24283    48654

Accuracy of the model for test data is: 0.9746490854955339
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10309   106    10415
True         192  1148     1340
__all__    10501  1254    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9993628478645127


Predicted  False   True  __all__
Actual                          
False      24314     13    24327
True          18  24309    24327
__all__    24332  24322    48654

The accuracy of the model on test data is: 0.9709910676307955


Predicted  False  True  __all__
Actual                         
False      10252   163    10415
True         178  1162     1340
__all__    10430  1325    11755





### <a id='2.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [12]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [13]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.97772,24325.0,23245.0,1082.0,2.0,0.044477,8.2e-05
RandomForestClassifier,0.999445,24305.0,24322.0,5.0,22.0,0.000206,0.000904
AdaBoostClassifier,0.973301,23723.0,23632.0,695.0,604.0,0.028569,0.024828
GradientBoostingClassifier,0.983249,23884.0,23955.0,372.0,443.0,0.015292,0.01821
ExtraTreeClassifier,1.0,24327.0,24327.0,0.0,0.0,0.0,0.0
XGB,0.98329,23872.0,23969.0,358.0,455.0,0.014716,0.018703
BaggingClassifier,0.99889,24278.0,24322.0,5.0,49.0,0.000206,0.002014
Neural Network-1,0.999363,24309.0,24314.0,13.0,18.0,0.000534,0.00074


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.936538,1237.0,9772.0,643.0,103.0,0.061738,0.076866,0.930698,0.746857,0.657979,0.923134
RandomForestClassifier,0.975585,1139.0,10329.0,86.0,201.0,0.008257,0.15,0.920871,0.875546,0.929796,0.85
AdaBoostClassifier,0.958656,1181.0,10088.0,327.0,159.0,0.031397,0.118657,0.924973,0.807745,0.783156,0.881343
GradientBoostingClassifier,0.969545,1173.0,10224.0,191.0,167.0,0.018339,0.124627,0.928517,0.850441,0.859971,0.875373
ExtraTreeClassifier,0.97533,1133.0,10332.0,83.0,207.0,0.007969,0.154478,0.918777,0.874033,0.931743,0.845522
XGB,0.969885,1175.0,10226.0,189.0,165.0,0.018147,0.123134,0.929359,0.852113,0.861437,0.876866
BaggingClassifier,0.974649,1148.0,10309.0,106.0,192.0,0.010178,0.143284,0.923269,0.871495,0.91547,0.856716
Neural Network-1,0.970991,1162.0,10252.0,163.0,178.0,0.015651,0.132836,0.923269,0.855704,0.876981,0.867164


### <a id='2.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [14]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='3' style="text-decoration : none; color : #000000;">3. UnderSampling </a>

In [15]:
df_non=train[train.Target==0]
df_mal=train[train.Target==1]

df_non=df_non.sample(frac=len(df_mal)/len(df_non),random_state=0)
df_u=df_non.append(df_mal)
df_u.Target.value_counts() 

X_train=df_u[columns]
X_train=scaler.transform(X_train)
Y_train=df_u.Target.values

print("Shape after undersampling",X_train.shape)

## Table to store training and test measures so we can compare later
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y,columns)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

Shape after undersampling (6202, 99)
[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9506610770719123
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       2995   106     3101
True         200  2901     3101
__all__     3195  3007     6202

Accuracy of the model for test data is: 0.9445342407486176
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9881   534    10415
True         118  1222     1340
__all__     9999  1756    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9970977104159948
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3098     3     3101
True          15  3086     3101
__all__     3113  3089     6202

Accuracy of the model for test data is: 0.9515099957464909
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9958   457    10415
True         113  1227     1340
__all__    10071  1684    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9396968719767816
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       2908   193     3101
True         181  2920     3101
__all__     3089  3113     6202

Accuracy of the model for test data is: 0.9374734155678435
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9776   639    10415
True          96  1244     1340
__all__     9872  1883    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9577555627217027
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       2977   124     3101
True         138  2963     3101
__all__     3115  3087     6202

Accuracy of the model for test data is: 0.9512547851977882
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9933   482    10415
True          91  1249     1340
__all__    10024  1731    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3101     0     3101
True           0  3101     3101
__all__     3101  3101     6202

Accuracy of the model for test data is: 0.9524457677584007
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9965   450    10415
True         109  1231     1340
__all__    10074  1681    11755



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9545307965172525
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       2976   125     3101
True         157  2944     3101
__all__     3133  3069     6202

Accuracy of the model for test data is: 0.9523606975754998
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       9948   467    10415
True          93  1247     1340
__all__    10041  1714    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9966139954853274
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False       3096     5     3101
True          16  3085     3101
__all__     3112  3090     6202

Accuracy of the model for test data is: 0.9552530837941302
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10001   414    10415
True         112  1228     1340
__all__    10113  1642    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 1.0


Predicted  False  True  __all__
Actual                         
False       3101     0     3101
True           0  3101     3101
__all__     3101  3101     6202

The accuracy of the model on test data is: 0.9437686091025096


Predicted  False  True  __all__
Actual                         
False       9857   558    10415
True         103  1237     1340
__all__     9960  1795    11755





### <a id='3.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [16]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [17]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.950661,2901.0,2995.0,106.0,200.0,0.034183,0.064495
RandomForestClassifier,0.997098,3086.0,3098.0,3.0,15.0,0.000967,0.004837
AdaBoostClassifier,0.939697,2920.0,2908.0,193.0,181.0,0.062238,0.058368
GradientBoostingClassifier,0.957756,2963.0,2977.0,124.0,138.0,0.039987,0.044502
ExtraTreeClassifier,1.0,3101.0,3101.0,0.0,0.0,0.0,0.0
XGB,0.954531,2944.0,2976.0,125.0,157.0,0.04031,0.050629
BaggingClassifier,0.996614,3085.0,3096.0,5.0,16.0,0.001612,0.00516
Neural Network-1,1.0,3101.0,3101.0,0.0,0.0,0.0,0.0


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.944534,1222.0,9881.0,534.0,118.0,0.051272,0.08806,0.930334,0.767321,0.6959,0.91194
RandomForestClassifier,0.95151,1227.0,9958.0,457.0,113.0,0.043879,0.084328,0.935896,0.790839,0.728622,0.915672
AdaBoostClassifier,0.937473,1244.0,9776.0,639.0,96.0,0.061354,0.071642,0.933502,0.751235,0.660648,0.928358
GradientBoostingClassifier,0.951255,1249.0,9933.0,482.0,91.0,0.046279,0.06791,0.942905,0.794426,0.721548,0.93209
ExtraTreeClassifier,0.952446,1231.0,9965.0,450.0,109.0,0.043207,0.081343,0.937725,0.794746,0.732302,0.918657
XGB,0.952361,1247.0,9948.0,467.0,93.0,0.044839,0.069403,0.942879,0.797633,0.727538,0.930597
BaggingClassifier,0.955253,1228.0,10001.0,414.0,112.0,0.03975,0.083582,0.938334,0.803693,0.747868,0.916418
Neural Network-1,0.943769,1237.0,9857.0,558.0,103.0,0.053577,0.076866,0.938334,0.768278,0.689136,0.923134


### <a id='3.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [18]:
feat_impt_plot(np.average(feat_imp,axis=0),columns)

### <a id='4' style="text-decoration : none; color : #000000;">4. Feature Transformation with feature selection </a>

In [19]:
df.reset_index(inplace=True)
columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

Built 19503 features
EntitySet scattered to workers in 8.145 seconds
Elapsed: 04:32 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


Task exception was never retrieved
future: <Task finished coro=<BaseTCPConnector.connect() done, defined at /nethomes/darshan/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py:341> exception=CommClosedError('in <distributed.comm.tcp.TCPConnector object at 0x7fc988ace7b8>: ConnectionRefusedError: [Errno 111] Connection refused',)>
Traceback (most recent call last):
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py", line 348, in connect
    ip, port, max_buffer_size=MAX_BUFFER_SIZE, **kwargs
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/tornado/tcpclient.py", line 280, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
  File "/nethomes/darshan/anaconda3/lib/python3.6/site-packages/tornado/tcpclient.py", line 143, in on_connect_done
    stream = future.result()
tornado.iostream.StreamClosedError: Stream is closed

During handling of the above exception, another exception occurred:

Tracebac

In [20]:
df.set_index(['domain'],drop=True,inplace=True)

In [21]:
fs=pd.merge(feature_matrix_sessions,pd.DataFrame(df_sel.Target),left_index=True, right_index=True)

In [22]:
del feature_matrix_sessions

In [23]:
fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)
fs=fs.reindex(df.index.values)
X=fs.loc[:,fs.columns!='Target']
y=fs.Target
feature_name = X.columns.tolist()

In [24]:
feature_name=['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']
print('Features we will be using:\n')
display(feature_name)

Features we will be using:



['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']

In [25]:
fs=fs.reindex(df_sel.index.values)

train=fs.iloc[:27428]
train_Y=train.Target.values
train_X=train[feature_name].values


scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=fs.iloc[27428:]
test_Y=test.Target.values
test_X=test[feature_name]

scaled_X_test=scaler.transform(test_X)

table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,feature_name)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9782339215400321
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24181   146    24327
True         451  2650     3101
__all__    24632  2796    27428

Accuracy of the model for test data is: 0.970565716716291
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10315   100    10415
True         246  1094     1340
__all__    10561  1194    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9971926498468718
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24322     5    24327
True          72  3029     3101
__all__    24394  3034    27428

Accuracy of the model for test data is: 0.9752445767758401
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10343    72    10415
True         219  1121     1340
__all__    10562  1193    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9682441300860435
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24068   259    24327
True         612  2489     3101
__all__    24680  2748    27428

Accuracy of the model for test data is: 0.9635048915355168
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10295   120    10415
True         309  1031     1340
__all__    10604  1151    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9786714306548053
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24180   147    24327
True         438  2663     3101
__all__    24618  2810    27428

Accuracy of the model for test data is: 0.9725223309230114
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10323    92    10415
True         231  1109     1340
__all__    10554  1201    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 0.999854163628409
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           4  3097     3101
__all__    24331  3097    27428

Accuracy of the model for test data is: 0.9741386643981285
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10338    77    10415
True         227  1113     1340
__all__    10565  1190    11755



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9768849351028146
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24153   174    24327
True         460  2641     3101
__all__    24613  2815    27428

Accuracy of the model for test data is: 0.9721820501914079
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10321    94    10415
True         233  1107     1340
__all__    10554  1201    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.997156190753974
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24316    11    24327
True          67  3034     3101
__all__    24383  3045    27428

Accuracy of the model for test data is: 0.9764355593364525
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10340    75    10415
True         202  1138     1340
__all__    10542  1213    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9907393904039667


Predicted  False  True  __all__
Actual                         
False      24293    34    24327
True         220  2881     3101
__all__    24513  2915    27428

The accuracy of the model on test data is: 0.9724372607401106


Predicted  False  True  __all__
Actual                         
False      10294   121    10415
True         203  1137     1340
__all__    10497  1258    11755





### <a id='4.A' style="text-decoration : none; color : #000000;"> A. Results</a>

In [26]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [27]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.978234,2650.0,24181.0,146.0,451.0,0.006002,0.145437
RandomForestClassifier,0.997193,3029.0,24322.0,5.0,72.0,0.000206,0.023218
AdaBoostClassifier,0.968244,2489.0,24068.0,259.0,612.0,0.010647,0.197356
GradientBoostingClassifier,0.978671,2663.0,24180.0,147.0,438.0,0.006043,0.141245
ExtraTreeClassifier,0.999854,3097.0,24327.0,0.0,4.0,0.0,0.00129
XGB,0.976885,2641.0,24153.0,174.0,460.0,0.007153,0.148339
BaggingClassifier,0.997156,3034.0,24316.0,11.0,67.0,0.000452,0.021606
Neural Network-1,0.990739,2881.0,24293.0,34.0,220.0,0.001398,0.070945


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.970566,1094.0,10315.0,100.0,246.0,0.009602,0.183582,0.903408,0.848794,0.916248,0.816418
RandomForestClassifier,0.975245,1121.0,10343.0,72.0,219.0,0.006913,0.163433,0.914827,0.873144,0.939648,0.836567
AdaBoostClassifier,0.963505,1031.0,10295.0,120.0,309.0,0.011522,0.230597,0.878941,0.810421,0.895743,0.769403
GradientBoostingClassifier,0.972522,1109.0,10323.0,92.0,231.0,0.008833,0.172388,0.909389,0.859149,0.923397,0.827612
ExtraTreeClassifier,0.974139,1113.0,10338.0,77.0,227.0,0.007393,0.169403,0.911602,0.867324,0.935294,0.830597
XGB,0.972182,1107.0,10321.0,94.0,233.0,0.009025,0.173881,0.908547,0.857381,0.921732,0.826119
BaggingClassifier,0.976436,1138.0,10340.0,75.0,202.0,0.007201,0.150746,0.921026,0.879689,0.93817,0.849254
Neural Network-1,0.972437,1137.0,10294.0,121.0,203.0,0.011618,0.151493,0.921026,0.860354,0.903816,0.848507


### <a id='4.B' style="text-decoration : none; color : #000000;"> B. Average Feature Importances</a>

In [28]:
feat_impt_plot(np.average(feat_imp,axis=0),feature_name)

### <a id='5' style="text-decoration : none; color : #000000;">5. Feature Transformation with PCA </a>

In [39]:
X=fs.loc[:,fs.columns!='Target']
y=fs.Target.values
feature_name = X.columns.tolist()
X=np.nan_to_num(X)
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

In [40]:
del fs

In [41]:
pca=PCA(n_components=750,whiten=True,random_state=0)
X_pca_21=pca.fit_transform(scaled_X)
print("Variance explained using 750 components is:",sum(pca.explained_variance_ratio_))

Variance explained using 750 components is: 0.8021089097851566


In [43]:
del scaled_X  

In [44]:
X_train=X_pca_21[:27428]
Y_train=y[0:27428]
X_test=X_pca_21[27428:]
Y_test=y[27428:]

## Table to store training and test measures so we can compare later
table_80_70_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_70_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','mcc'])

In [46]:
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','MCC','Precision','Recall'])

feat_imp=[None]*5
j=0
for i in range(0,8):
    if i>=1 and i<=5:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,prec,rec=f[i](X_train,Y_train,X_test,Y_test,np.arange(0,750).astype(str))
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=f[i](X_train,Y_train,X_test,Y_test)
    table_80_50_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_50_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,prec,rec])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.9729838121627534
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24307    20    24327
True         721  2380     3101
__all__    25028  2400    27428

Accuracy of the model for test data is: 0.9642705231816249
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10397    18    10415
True         402   938     1340
__all__    10799   956    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9963176316173253
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True         101  3000     3101
__all__    24428  3000    27428

Accuracy of the model for test data is: 0.9646108039132284
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10362    53    10415
True         363   977     1340
__all__    10725  1030    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.9689003937582032
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24001   326    24327
True         527  2574     3101
__all__    24528  2900    27428

Accuracy of the model for test data is: 0.9632496809868141
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10236   179    10415
True         253  1087     1340
__all__    10489  1266    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.9791089397695786
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24200   127    24327
True         446  2655     3101
__all__    24646  2782    27428

Accuracy of the model for test data is: 0.9692896639727775
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10306   109    10415
True         252  1088     1340
__all__    10558  1197    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           0  3101     3101
__all__    24327  3101    27428

Accuracy of the model for test data is: 0.9343258188005105
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10399    16    10415
True         756   584     1340
__all__    11155   600    11755



[1mXGB Classifier[0m
Accuracy of the model for training data is: 0.9798016625346362
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24213   114    24327
True         440  2661     3101
__all__    24653  2775    27428

Accuracy of the model for test data is: 0.970565716716291
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10321    94    10415
True         252  1088     1340
__all__    10573  1182    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.9969374361965875
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24323     4    24327
True          80  3021     3101
__all__    24403  3025    27428

Accuracy of the model for test data is: 0.9671629094002552
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10307   108    10415
True         278  1062     1340
__all__    10585  1170    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.9996718681639201


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           9  3092     3101
__all__    24336  3092    27428

The accuracy of the model on test data is: 0.9729476818375159


Predicted  False  True  __all__
Actual                         
False      10277   138    10415
True         180  1160     1340
__all__    10457  1298    11755





In [47]:
trace0 = go.Scatter(x=table_80_50_test.iloc[0].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[0].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'KNN_model')


trace2 = go.Scatter(x=table_80_50_test.iloc[1].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[1].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Random Forest')


trace3 = go.Scatter(x=table_80_50_test.iloc[2].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[2].values[[0,7,8,9,10]],
                    mode = 'lines',
                    name = 'Ada Boost')

trace4 = go.Scatter(x=table_80_50_test.iloc[3].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[3].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'GradientBoosting')

trace5 = go.Scatter(x=table_80_50_test.iloc[4].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[4].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'ExtraTree Classifier')


trace1 = go.Scatter(x=table_80_50_test.iloc[5].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[5].values[[0,7,8,9,10]],
                    mode = 'lines+markers',
                    name = 'XGB')


trace6 = go.Scatter(x=table_80_50_test.iloc[6].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[6].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Bagging')

trace7 = go.Scatter(x=table_80_50_test.iloc[7].index.values[[0,7,8,9,10]],
                    y=table_80_50_test.iloc[7].values[[0,7,8,9,10]],
                    mode = 'lines+markers',marker=dict(symbol='x'),
                    name = 'Neural Network')



data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(yaxis=dict(tick0=0,dtick=0.05),title='Result comparision for Features selected in Section 1')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Result')

In [48]:
table_80_50_train['index']=ind_columns
table_80_50_test['index']=ind_columns


table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_50_train)
print("Test Results")
display(table_80_50_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.972984,2380.0,24307.0,20.0,721.0,0.000822,0.232506
RandomForestClassifier,0.996318,3000.0,24327.0,0.0,101.0,0.0,0.03257
AdaBoostClassifier,0.9689,2574.0,24001.0,326.0,527.0,0.013401,0.169945
GradientBoostingClassifier,0.979109,2655.0,24200.0,127.0,446.0,0.005221,0.143825
ExtraTreeClassifier,1.0,3101.0,24327.0,0.0,0.0,0.0,0.0
XGB,0.979802,2661.0,24213.0,114.0,440.0,0.004686,0.14189
BaggingClassifier,0.996937,3021.0,24323.0,4.0,80.0,0.000164,0.025798
Neural Network-1,0.999672,3092.0,24327.0,0.0,9.0,0.0,0.002902


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,MCC,Precision,Recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.964271,938.0,10397.0,18.0,402.0,0.001728,0.3,0.849136,0.811869,0.981172,0.7
RandomForestClassifier,0.964611,977.0,10362.0,53.0,363.0,0.005089,0.270896,0.862008,0.813791,0.948544,0.729104
AdaBoostClassifier,0.96325,1087.0,10236.0,179.0,253.0,0.017187,0.188806,0.897004,0.813997,0.85861,0.811194
GradientBoostingClassifier,0.96929,1088.0,10306.0,109.0,252.0,0.010466,0.18806,0.900737,0.842236,0.908939,0.81194
ExtraTreeClassifier,0.934326,584.0,10399.0,16.0,756.0,0.001536,0.564179,0.717142,0.627113,0.973333,0.435821
XGB,0.970566,1088.0,10321.0,94.0,252.0,0.009025,0.18806,0.901457,0.848484,0.920474,0.81194
BaggingClassifier,0.967163,1062.0,10307.0,108.0,278.0,0.01037,0.207463,0.891084,0.830316,0.907692,0.792537
Neural Network-1,0.972948,1160.0,10277.0,138.0,180.0,0.01325,0.134328,0.891084,0.864362,0.893683,0.865672


In [49]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')