In [4]:
import pandas as pd
import numpy as np
import os
import json
from pandas.io.json import json_normalize
from IPython.display import display,clear_output,HTML
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from tabulate import tabulate

import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier

%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)

from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel
from sklearn.linear_model import LogisticRegression,RandomizedLasso,LinearRegression, Ridge,Lasso
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split,cross_val_score

from sklearn.preprocessing import MinMaxScaler,StandardScaler,binarize
from sklearn.ensemble import RandomForestRegressor
from minepy import MINE
from sklearn import metrics

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix,accuracy_score,auc,roc_curve,recall_score
from sklearn import metrics
from pandas_ml import ConfusionMatrix


from imblearn.over_sampling import SMOTE

In [5]:
### Function get_params takes as input a grid search model and prints out the best parameters for the model and 
### prints the best accuracy of the model
def get_params(grid,model_name=''):
    print("Best Parameters are :")
    display(pd.DataFrame.from_dict(grid.best_params_, orient='index').reset_index().transpose())

### The function takes as input the model, X and Y data to generate the confusion matrix  
def get_cm(model,a,b):
    y_pred=model.predict(a)
    cm=ConfusionMatrix(b,y_pred)
    return cm
 
### The get accuracy plot takes as input a dataframe and a string for the title of the plot and displays a plot
### The function is primarily for plotting of accuracies for various classifiers for the same split.
def get_accuracy_plot(data,title=''):
    plt.figure(figsize=(12,8))
    plt.plot(data['accuracy'].values, '--o')
    plt.ylabel('Accuracy', fontsize=20)
    plt.xlabel('Classifier', fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.tick_params(axis='both', which='minor', labelsize=12)
    plt.xlim([0, 9])
    plt.title(title,size=18,y=1.05)
    plt.xticks(np.arange(10),data.index.get_values(),rotation=90)
    plt.legend(loc='upper right', fontsize=18)
    plt.show()

### Function grid takes as input an estimator, X_train,Y_Train,X_test and Y_test and a string represetning the name of the model
### We perform StratifiedKFold cross validation and grid search to hypertune the parameters of the model     
def grid(kernel,params,x,y,x_test,y_test,model_name=''):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    grid_search=GridSearchCV(estimator=kernel,param_grid=parameters,scoring='accuracy',cv=kfold,n_jobs=3,verbose=True)
    grid_search.fit(x,y)
    get_params(grid_search,model_name)
    acc_train=accuracy_score(y,grid_search.predict(x))
    print("Accuracy of the ",model_name," model for the training data is:",acc_train)
    print("\nConfusion Matrix for Training data is:")
    cm_train=get_cm(grid_search,x,y)
    display(cm_train)
    acc_test=accuracy_score(y_test,grid_search.predict(x_test))
    print("Accuracy of the ",model_name," model for test data is:",acc_test)
    print("\nConfusion Matrix for test data is:")
    cm_test=get_cm(grid_search,x_test,y_test)
    display(cm_test)
    fpr, tpr, threshold = roc_curve(y_test, grid_search.predict(x_test))
    roc_auc =auc(fpr, tpr)
    return grid_search,cm_train,cm_test,acc_train,acc_test,roc_auc  

### Plot roc curve
def get_roc_curve(model,x_test,y_test):
    prob=model.predict_proba(x_test)
    preds=prob[:,1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc =auc(fpr, tpr)
    plt.figure(figsize=(10,8))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(threshold, index = i)})
    threshold=roc.ix[(roc.tf-0).abs().argsort()[:1]]['thresholds'].values[0]
    print("Optimal Threshold is",threshold)
    y_pred_class=binarize(preds.reshape(1,-1),threshold)[0]
    accuracy=accuracy_score(y_test,y_pred_class)
    print("Accuracy on test data is:",accuracy)
    cm=ConfusionMatrix(y_test,y_pred_class)
    display(cm)
    return cm,accuracy,threshold,roc_auc

def get_roc_curve_deep_learning(model,x_test,y_test):
    prob=model.predict_proba(x_test)
    preds=prob
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc =auc(fpr, tpr)
    plt.figure(figsize=(10,8))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(threshold, index = i)})
    threshold=roc.ix[(roc.tf-0).abs().argsort()[:1]]['thresholds'].values[0]
    print("Optimal Threshold is",threshold)
    y_pred_class=binarize(preds.reshape(1,-1),threshold)[0]
    accuracy=accuracy_score(y_test,y_pred_class)
    print("Accuracy of Logistic Model on training data is:",accuracy)
    cm=ConfusionMatrix(y_test,y_pred_class)
    display(cm)
    return cm,accuracy,threshold,roc_auc

In [6]:
df=pd.read_csv('Html.csv',index_col='domain')

df.fillna(value=0,inplace=True)
df.drop(['Target'],axis=1,inplace=True)

In [7]:
X=df.loc[:,df.columns!='Malicious']
X.drop(['number_of_unsecure_cookies','number_of_secure_cookies','number_of_non_http_only_cookies','number_of_http_only_cookies',
 'number_of_cookies'],axis=1,inplace=True)
y=df.Malicious.values
feature_name = X.columns.tolist()

In [8]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y)


X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()


rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=False)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()


embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='23*median')
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()


feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

In [9]:
X=df[feature_selection_df[feature_selection_df.Total==4]['Feature']]
y=df.Malicious.values
feature_name = X.columns.tolist()

scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

print("The train/test split ratio is 80:20")
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,y,random_state=0,test_size=0.2)

X.shape

The train/test split ratio is 80:20


(44665, 44)

In [10]:
## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc'])

#### 1. Deep Learning

In [11]:
np.random.seed(0)
deep_learning=Sequential()
deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=X.shape[1]))
deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
deep_learning.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
deep_learning.fit(X_train,Y_train,batch_size=20,epochs=20,verbose=False)

acc_train=deep_learning.evaluate(X_train,Y_train,verbose=False)[1]
print("The accuracy of the model on training data is:",acc_train)
cm_train=ConfusionMatrix(Y_train,deep_learning.predict_classes(X_train,batch_size=1,verbose=False).reshape(1,len(X_train))[0])
cm_test=ConfusionMatrix(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
display(cm_train)
acc_test=accuracy_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
print("The accuracy of the model on test data is:",acc_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test, deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[0]=([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[0]=([acc_test,cm_test.TP,cm_test.TN,cm_test.FP,cm_test.FN,cm_test.FPR,cm_test.FNR,auc])

The accuracy of the model on training data is: 0.8801354528154035


Predicted  False  True  __all__
Actual                         
False      26754  3889    30643
True         394  4695     5089
__all__    27148  8584    35732

The accuracy of the model on test data is: 0.8722713534087093


Predicted  False  True  __all__
Actual                         
False       6638  1004     7642
True         137  1154     1291
__all__     6775  2158     8933

#### 2. KNN

In [12]:
knn=KNeighborsClassifier()
knn.fit(X_train,Y_train)
acc_train=knn.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",knn.score(X_train,Y_train))
cm_train=get_cm(knn,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,knn.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(knn,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,knn.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[1]= ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[1] = ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8849770513825143


Predicted  False  True  __all__
Actual                         
False      26929  3714    30643
True         396  4693     5089
__all__    27325  8407    35732

The accuracy of the model on test data is: 0.8793238553677376


Predicted  False  True  __all__
Actual                         
False       6681   961     7642
True         117  1174     1291
__all__     6798  2135     8933

#### 3. Logistic Regression

In [13]:
logistic=LogisticRegression()
logistic.fit(X_train,Y_train)
acc_train=logistic.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",logistic.score(X_train,Y_train))
cm_train=get_cm(logistic,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,logistic.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(logistic,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,logistic.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[2]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[2] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8568789880219411


Predicted  False  True  __all__
Actual                         
False      30565    78    30643
True        5036    53     5089
__all__    35601   131    35732

The accuracy of the model on test data is: 0.8553677376021493


Predicted  False  True  __all__
Actual                         
False       7624    18     7642
True        1274    17     1291
__all__     8898    35     8933

#### 4. Random Forest

In [14]:
rf=RandomForestClassifier()
rf.fit(X_train,Y_train)
acc_train=rf.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",rf.score(X_train,Y_train))
cm_train=get_cm(rf,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,rf.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(rf,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,rf.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[3]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[3] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8952479570133214


Predicted  False  True  __all__
Actual                         
False      27036  3607    30643
True         136  4953     5089
__all__    27172  8560    35732

The accuracy of the model on test data is: 0.8835777454382626


Predicted  False  True  __all__
Actual                         
False       6709   933     7642
True         107  1184     1291
__all__     6816  2117     8933

In [15]:
f_r=pd.DataFrame(rf.feature_importances_,index=feature_name)
f_r.columns=['Feature Importance']
f_r.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_r

Unnamed: 0,Feature Importance
a_href_relative,0.201828
script_count,0.146124
script_src_relative,0.131929
a_count,0.078246
script_src_absolute,0.078077
a_href_absolute,0.075551
div_count,0.029226
a_href_out_of_domain,0.02327
script_src_https,0.019883
link_type_text/css,0.019035


##### 5. AdaBoost Classifier

In [16]:
ada=AdaBoostClassifier()
ada.fit(X_train,Y_train)
acc_train=ada.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",ada.score(X_train,Y_train))
cm_train=get_cm(ada,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,ada.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(ada,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,ada.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[4]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[4] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8660304488973469


Predicted  False  True  __all__
Actual                         
False      26517  4126    30643
True         661  4428     5089
__all__    27178  8554    35732

The accuracy of the model on test data is: 0.8619724616590171


Predicted  False  True  __all__
Actual                         
False       6565  1077     7642
True         156  1135     1291
__all__     6721  2212     8933

In [17]:
f_a=pd.DataFrame(ada.feature_importances_,index=feature_name)
f_a.columns=['Feature Importance']
f_a.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_a

Unnamed: 0,Feature Importance
a_href_out_of_domain,0.16
script_count,0.14
link_href_out_of_domain,0.08
script_type_text/javascript,0.06
script_src_https,0.06
script_async_true,0.06
link_type_text/css,0.04
div_count,0.04
iframe_src_.html,0.04
img_src_.jpg,0.04


#### 6. Gradient Boosting Classifier

In [18]:
gbc=GradientBoostingClassifier()
gbc.fit(X_train,Y_train)
acc_train=gbc.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",gbc.score(X_train,Y_train))
cm_train=get_cm(gbc,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,gbc.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(gbc,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,gbc.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[5]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[5] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8828780924661368


Predicted  False  True  __all__
Actual                         
False      26995  3648    30643
True         537  4552     5089
__all__    27532  8200    35732

The accuracy of the model on test data is: 0.8785402440389567


Predicted  False  True  __all__
Actual                         
False       6702   940     7642
True         145  1146     1291
__all__     6847  2086     8933

In [19]:
f_g=pd.DataFrame(gbc.feature_importances_,index=feature_name)
f_g.columns=['Feature Importance']
f_g.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_g

Unnamed: 0,Feature Importance
script_count,0.157672
a_count,0.092821
link_href_out_of_domain,0.081341
a_href_out_of_domain,0.079311
div_count,0.064806
link_type_text/css,0.056499
a_href_absolute,0.052835
a_href_relative,0.040932
script_async_true,0.033906
meta_http-equiv_Content-Type,0.027518


#### 7. Extra Tree Classifier

In [20]:
ext_tree=ExtraTreesClassifier()
ext_tree.fit(X_train,Y_train)
acc_train=ext_tree.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",ext_tree.score(X_train,Y_train))
cm_train=get_cm(ext_tree,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,ext_tree.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(ext_tree,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,ext_tree.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[6]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[6] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8970670547408486


Predicted  False  True  __all__
Actual                         
False      27039  3604    30643
True          74  5015     5089
__all__    27113  8619    35732

The accuracy of the model on test data is: 0.8834658009627225


Predicted  False  True  __all__
Actual                         
False       6711   931     7642
True         110  1181     1291
__all__     6821  2112     8933

In [21]:
f_e=pd.DataFrame(ext_tree.feature_importances_,index=feature_name)
f_e.columns=['Feature Importance']
f_e.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_e

Unnamed: 0,Feature Importance
script_src_absolute,0.097082
script_src_.js,0.059284
div_count,0.052883
meta_charset_utf-8,0.049664
script_async_true,0.048705
link_rel_shortcut icon,0.044377
script_src_out_of_domain,0.03405
iframe_src_relative,0.033504
script_src_https,0.033325
link_rel_canonical,0.033317


#### 8. Bagging

In [22]:
bagging=BaggingClassifier()
bagging.fit(X_train,Y_train)
acc_train=bagging.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",bagging.score(X_train,Y_train))
cm_train=get_cm(bagging,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,bagging.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(bagging,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,bagging.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[7]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[7] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8954718459644017


Predicted  False  True  __all__
Actual                         
False      27030  3613    30643
True         122  4967     5089
__all__    27152  8580    35732

The accuracy of the model on test data is: 0.8820105227807008


Predicted  False  True  __all__
Actual                         
False       6695   947     7642
True         107  1184     1291
__all__     6802  2131     8933

#### 9. Voting Classifier

In [23]:
eclf = VotingClassifier(estimators=[('knn', knn), ('logistic', logistic),('rf',rf),('ada',ada),('gbc',gbc),
                                    ('ext_tree',ext_tree),('bagging',bagging)], voting='soft')

eclf.fit(X_train,Y_train.reshape(1,len(Y_train))[0])
acc_train=eclf.score(X_train,Y_train)
pred=eclf.predict(X_test)
cm_train=get_cm(eclf,X_train,Y_train.reshape(1,len(Y_train))[0])
print("The accuracy of the model on training data is :",acc_train)
display(cm_train)
acc_test=accuracy_score(Y_test.reshape(1,len(Y_test))[0],pred)
print("The accuracy of the model on test data is:",acc_test)
cm_test=ConfusionMatrix(Y_test.reshape(1,len(Y_test))[0],pred)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,eclf.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[8]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[8] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is : 0.8920575394604277


Predicted  False  True  __all__
Actual                         
False      27038  3605    30643
True         252  4837     5089
__all__    27290  8442    35732

The accuracy of the model on test data is: 0.8826821896339415


Predicted  False  True  __all__
Actual                         
False       6714   928     7642
True         120  1171     1291
__all__     6834  2099     8933

In [24]:
table_80_50_train['index']=['Neural Network-1','KNeighborsClassifier','LogisticRegression','RandomForestClassifier','AdaBoostClassifier',
                           'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','VotingClassifier']
table_80_50_test['index']=['Neural Network-1','KNeighborsClassifier','LogisticRegression','RandomForestClassifier','AdaBoostClassifier',
                           'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','VotingClassifier']

table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Comparision Table for Training Accuracies:")
display(table_80_50_train)

print("Comparision Table for Test Accuracies:")
display(table_80_50_test)

Comparision Table for Training Accuracies:


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neural Network-1,0.880135,4695.0,26754.0,3889.0,394.0,0.126913,0.077422
KNeighborsClassifier,0.884977,4693.0,26929.0,3714.0,396.0,0.121202,0.077815
LogisticRegression,0.856879,53.0,30565.0,78.0,5036.0,0.002545,0.989585
RandomForestClassifier,0.895248,4953.0,27036.0,3607.0,136.0,0.11771,0.026724
AdaBoostClassifier,0.86603,4428.0,26517.0,4126.0,661.0,0.134647,0.129888
GradientBoostingClassifier,0.882878,4552.0,26995.0,3648.0,537.0,0.119048,0.105522
ExtraTreeClassifier,0.897067,5015.0,27039.0,3604.0,74.0,0.117613,0.014541
BaggingClassifier,0.895472,4967.0,27030.0,3613.0,122.0,0.117906,0.023973
VotingClassifier,0.892058,4837.0,27038.0,3605.0,252.0,0.117645,0.049519


Comparision Table for Test Accuracies:


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Neural Network-1,0.872271,1154.0,6638.0,1004.0,137.0,0.131379,0.106119,0.881251
KNeighborsClassifier,0.879324,1174.0,6681.0,961.0,117.0,0.125752,0.090627,0.89181
LogisticRegression,0.855368,17.0,7624.0,18.0,1274.0,0.002355,0.986832,0.505406
RandomForestClassifier,0.883578,1184.0,6709.0,933.0,107.0,0.122088,0.082881,0.897515
AdaBoostClassifier,0.861972,1135.0,6565.0,1077.0,156.0,0.140932,0.120837,0.869116
GradientBoostingClassifier,0.87854,1146.0,6702.0,940.0,145.0,0.123004,0.112316,0.88234
ExtraTreeClassifier,0.883466,1181.0,6711.0,931.0,110.0,0.121827,0.085205,0.896484
BaggingClassifier,0.882011,1184.0,6695.0,947.0,107.0,0.12392,0.082881,0.896599
VotingClassifier,0.882682,1171.0,6714.0,928.0,120.0,0.121434,0.092951,0.892807


#### Average Feature Importance

In [25]:
(pd.concat((f_r,f_a,f_g,f_e), axis=1).mean(axis=1).sort_values(ascending=False))

script_count                             0.116640
a_href_out_of_domain                     0.073736
a_href_relative                          0.069198
a_count                                  0.052903
script_src_absolute                      0.048346
link_href_out_of_domain                  0.048142
div_count                                0.046729
script_src_relative                      0.042989
script_async_true                        0.037855
a_href_absolute                          0.037802
link_type_text/css                       0.035288
script_src_https                         0.033874
script_type_text/javascript              0.026735
script_src_.js                           0.023567
img_src_.jpg                             0.022227
link_href_.css                           0.020357
meta_charset_utf-8                       0.020258
script_src_out_of_domain                 0.017638
link_href_https                          0.017447
link_rel_shortcut icon                   0.016781


## With transformed variables

In [26]:
df=pd.read_csv('Html.csv',index_col='domain')

df.fillna(value=0,inplace=True)
df.drop(['Target'],axis=1,inplace=True)
df.drop(['number_of_unsecure_cookies','number_of_secure_cookies','number_of_non_http_only_cookies','number_of_http_only_cookies',
 'number_of_cookies'],axis=1,inplace=True)

In [27]:
X=df.loc[:,df.columns!='Malicious']
y=df.Malicious.values

In [28]:
def transformed(col):
    new_col=col+'_^2'
    df[new_col]=df[col].apply(lambda x: np.square(x))
    new_col=col+'_sqr_rt'
    df[new_col]=df[col].apply(lambda x: np.sqrt(x))
    new_col=col+'_^3'
    df[new_col]=df[col].apply(lambda x: np.power(x,3))
    new_col=col+'_log'
    df[new_col]=df[col].apply(lambda x: np.log(x))

In [29]:
for col in X.columns:
    transformed(col)

In [30]:
df.replace(-np.Inf,0,inplace=True)

In [31]:
X=df.loc[:,df.columns!='Malicious']
feature_name = X.columns.tolist()

In [32]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y)


X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()


rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=False)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()


embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='23*median')
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()


feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

In [33]:
X=df[feature_selection_df[feature_selection_df.Total==4]['Feature']]
y=df.Malicious.values
feature_name = X.columns.tolist()

scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

print("The train/test split ratio is 80:20")
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,y,random_state=0,test_size=0.2)

X.shape

The train/test split ratio is 80:20


(44665, 22)

In [34]:
feature_name

['script_type_text/javascript_log',
 'script_src_out_of_domain_sqr_rt',
 'script_src_https_log',
 'script_src_.js_sqr_rt',
 'script_count_sqr_rt',
 'script_count_log',
 'script_async_true_sqr_rt',
 'script_async_true_log',
 'p_count_log',
 'meta_count_log',
 'link_rel_stylesheet_log',
 'link_href_out_of_domain_sqr_rt',
 'link_href_out_of_domain_log',
 'link_href_https_log',
 'img_src_.jpg_sqr_rt',
 'iframe_src_https_sqr_rt',
 'div_count_sqr_rt',
 'a_href_relative_sqr_rt',
 'a_href_out_of_domain_log',
 'a_href_https_log',
 'a_href_absolute_sqr_rt',
 'a_count_sqr_rt']

In [35]:
## Table to store training and test measures
table_80_50_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc'])

#### 1. Deep Learning

In [36]:
np.random.seed(0)
deep_learning=Sequential()
deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=X.shape[1]))
deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
deep_learning.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
deep_learning.fit(X_train,Y_train,batch_size=20,epochs=20,verbose=False)

acc_train=deep_learning.evaluate(X_train,Y_train,verbose=False)[1]
print("The accuracy of the model on training data is:",acc_train)
cm_train=ConfusionMatrix(Y_train,deep_learning.predict_classes(X_train,batch_size=1,verbose=False).reshape(1,len(X_train))[0])
cm_test=ConfusionMatrix(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
display(cm_train)
acc_test=accuracy_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
print("The accuracy of the model on test data is:",acc_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test, deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[0]=([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[0]=([acc_test,cm_test.TP,cm_test.TN,cm_test.FP,cm_test.FN,cm_test.FPR,cm_test.FNR,auc])

The accuracy of the model on training data is: 0.8752378820105228


Predicted  False  True  __all__
Actual                         
False      26619  4024    30643
True         434  4655     5089
__all__    27053  8679    35732

The accuracy of the model on test data is: 0.8680174633381843


Predicted  False  True  __all__
Actual                         
False       6588  1054     7642
True         125  1166     1291
__all__     6713  2220     8933

#### 2. KNN

In [37]:
knn=KNeighborsClassifier()
knn.fit(X_train,Y_train)
acc_train=knn.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",knn.score(X_train,Y_train))
cm_train=get_cm(knn,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,knn.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(knn,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,knn.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[1]= ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[1] = ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8747341318705922


Predicted  False  True  __all__
Actual                         
False      30474   169    30643
True        4307   782     5089
__all__    34781   951    35732

The accuracy of the model on test data is: 0.8700324638979067


Predicted  False  True  __all__
Actual                         
False       7589    53     7642
True        1108   183     1291
__all__     8697   236     8933

#### 3. Logistic Regression

In [38]:
logistic=LogisticRegression()
logistic.fit(X_train,Y_train)
acc_train=logistic.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",logistic.score(X_train,Y_train))
cm_train=get_cm(logistic,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,logistic.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(logistic,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,logistic.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[2]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[2] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8719355199820888


Predicted  False  True  __all__
Actual                         
False      26794  3849    30643
True         727  4362     5089
__all__    27521  8211    35732

The accuracy of the model on test data is: 0.8688010746669652


Predicted  False  True  __all__
Actual                         
False       6645   997     7642
True         175  1116     1291
__all__     6820  2113     8933

#### 4. Random Forest

In [39]:
rf=RandomForestClassifier()
rf.fit(X_train,Y_train)
acc_train=rf.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",rf.score(X_train,Y_train))
cm_train=get_cm(rf,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,rf.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(rf,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,rf.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[3]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[3] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8935408037613344


Predicted  False  True  __all__
Actual                         
False      26986  3657    30643
True         147  4942     5089
__all__    27133  8599    35732

The accuracy of the model on test data is: 0.8807791335497593


Predicted  False  True  __all__
Actual                         
False       6691   951     7642
True         114  1177     1291
__all__     6805  2128     8933

In [40]:
f_r=pd.DataFrame(rf.feature_importances_,index=feature_name)
f_r.columns=['Feature Importance']
f_r.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_r

Unnamed: 0,Feature Importance
a_count_sqr_rt,0.203748
a_href_relative_sqr_rt,0.1406
script_count_sqr_rt,0.094196
div_count_sqr_rt,0.088936
a_href_out_of_domain_log,0.082592
script_count_log,0.076158
script_async_true_sqr_rt,0.075043
meta_count_log,0.038358
p_count_log,0.030496
a_href_absolute_sqr_rt,0.024977


##### 5. AdaBoost Classifier

In [41]:
ada=AdaBoostClassifier()
ada.fit(X_train,Y_train)
acc_train=ada.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",ada.score(X_train,Y_train))
cm_train=get_cm(ada,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,ada.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(ada,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,ada.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[4]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[4] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8627560729877981


Predicted  False  True  __all__
Actual                         
False      30493   150    30643
True        4754   335     5089
__all__    35247   485    35732

The accuracy of the model on test data is: 0.8607410724280756


Predicted  False  True  __all__
Actual                         
False       7596    46     7642
True        1198    93     1291
__all__     8794   139     8933

In [42]:
f_a=pd.DataFrame(ada.feature_importances_,index=feature_name)
f_a.columns=['Feature Importance']
f_a.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_a

Unnamed: 0,Feature Importance
a_href_https_log,0.12
script_count_sqr_rt,0.1
a_href_out_of_domain_log,0.1
script_type_text/javascript_log,0.08
p_count_log,0.08
link_rel_stylesheet_log,0.08
img_src_.jpg_sqr_rt,0.06
meta_count_log,0.06
div_count_sqr_rt,0.06
script_src_https_log,0.04


#### 6. Gradient Boosting Classifier

In [43]:
gbc=GradientBoostingClassifier()
gbc.fit(X_train,Y_train)
acc_train=gbc.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",gbc.score(X_train,Y_train))
cm_train=get_cm(gbc,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,gbc.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(gbc,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,gbc.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[5]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[5] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8818146199485055


Predicted  False  True  __all__
Actual                         
False      26967  3676    30643
True         547  4542     5089
__all__    27514  8218    35732

The accuracy of the model on test data is: 0.8765252434792343


Predicted  False  True  __all__
Actual                         
False       6693   949     7642
True         154  1137     1291
__all__     6847  2086     8933

In [44]:
f_g=pd.DataFrame(gbc.feature_importances_,index=feature_name)
f_g.columns=['Feature Importance']
f_g.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_g

Unnamed: 0,Feature Importance
script_count_sqr_rt,0.179214
a_href_out_of_domain_log,0.084726
a_count_sqr_rt,0.077384
a_href_relative_sqr_rt,0.070282
div_count_sqr_rt,0.068041
meta_count_log,0.061823
link_rel_stylesheet_log,0.055933
p_count_log,0.051868
link_href_out_of_domain_sqr_rt,0.049093
a_href_absolute_sqr_rt,0.048364


#### 7. Extra Tree Classifier

In [45]:
ext_tree=ExtraTreesClassifier()
ext_tree.fit(X_train,Y_train)
acc_train=ext_tree.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",ext_tree.score(X_train,Y_train))
cm_train=get_cm(ext_tree,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,ext_tree.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(ext_tree,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,ext_tree.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[6]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[6] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8952199708944364


Predicted  False  True  __all__
Actual                         
False      27008  3635    30643
True         109  4980     5089
__all__    27117  8615    35732

The accuracy of the model on test data is: 0.8808910780252994


Predicted  False  True  __all__
Actual                         
False       6699   943     7642
True         121  1170     1291
__all__     6820  2113     8933

In [46]:
f_e=pd.DataFrame(ext_tree.feature_importances_,index=feature_name)
f_e.columns=['Feature Importance']
f_e.sort_values(by='Feature Importance',ascending=False,inplace=True)
f_e

Unnamed: 0,Feature Importance
script_count_log,0.150078
a_href_out_of_domain_log,0.110076
meta_count_log,0.090695
div_count_sqr_rt,0.089312
script_type_text/javascript_log,0.064058
a_href_absolute_sqr_rt,0.063073
a_href_https_log,0.06092
script_async_true_log,0.051934
script_count_sqr_rt,0.050682
link_href_out_of_domain_sqr_rt,0.039942


#### 8. Bagging

In [47]:
bagging=BaggingClassifier()
bagging.fit(X_train,Y_train)
acc_train=bagging.score(X_train,Y_train)
print("The accuracy of the model on training data is: ",bagging.score(X_train,Y_train))
cm_train=get_cm(bagging,X_train,Y_train)
display(cm_train)
acc_test=accuracy_score(Y_test,bagging.predict(X_test))
print("The accuracy of the model on test data is:",acc_test)
cm_test=get_cm(bagging,X_test,Y_test)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,bagging.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[7]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[7] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is:  0.8935687898802194


Predicted  False  True  __all__
Actual                         
False      26977  3666    30643
True         137  4952     5089
__all__    27114  8618    35732

The accuracy of the model on test data is: 0.8805552445986791


Predicted  False  True  __all__
Actual                         
False       6684   958     7642
True         109  1182     1291
__all__     6793  2140     8933

#### 9. Voting Classifier

In [48]:
eclf = VotingClassifier(estimators=[('knn', knn), ('logistic', logistic),('rf',rf),('ada',ada),('gbc',gbc),
                                    ('ext_tree',ext_tree),('bagging',bagging)], voting='soft')

eclf.fit(X_train,Y_train.reshape(1,len(Y_train))[0])
acc_train=eclf.score(X_train,Y_train)
pred=eclf.predict(X_test)
cm_train=get_cm(eclf,X_train,Y_train.reshape(1,len(Y_train))[0])
print("The accuracy of the model on training data is :",acc_train)
display(cm_train)
acc_test=accuracy_score(Y_test.reshape(1,len(Y_test))[0],pred)
print("The accuracy of the model on test data is:",acc_test)
cm_test=ConfusionMatrix(Y_test.reshape(1,len(Y_test))[0],pred)
display(cm_test)
fpr, tpr, threshold = roc_curve(Y_test,eclf.predict(X_test))
auc=metrics.auc(fpr, tpr)

table_80_50_train.loc[8]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
table_80_50_test.loc[8] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,auc])

The accuracy of the model on training data is : 0.8806392029553342


Predicted  False  True  __all__
Actual                         
False      30567    76    30643
True        4189   900     5089
__all__    34756   976    35732

The accuracy of the model on test data is: 0.8726071868353297


Predicted  False  True  __all__
Actual                         
False       7614    28     7642
True        1110   181     1291
__all__     8724   209     8933

In [49]:
table_80_50_train['index']=['Neural Network-1','KNeighborsClassifier','LogisticRegression','RandomForestClassifier','AdaBoostClassifier',
                           'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','VotingClassifier']
table_80_50_test['index']=['Neural Network-1','KNeighborsClassifier','LogisticRegression','RandomForestClassifier','AdaBoostClassifier',
                           'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','VotingClassifier']

table_80_50_test.set_index(['index'],drop=True,inplace=True)
table_80_50_train.set_index(['index'],drop=True,inplace=True)

print("Comparision Table for Training Accuracies:")
display(table_80_50_train)

print("Comparision Table for Test Accuracies:")
display(table_80_50_test)

Comparision Table for Training Accuracies:


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neural Network-1,0.875238,4655.0,26619.0,4024.0,434.0,0.131319,0.085282
KNeighborsClassifier,0.874734,782.0,30474.0,169.0,4307.0,0.005515,0.846335
LogisticRegression,0.871936,4362.0,26794.0,3849.0,727.0,0.125608,0.142857
RandomForestClassifier,0.893541,4942.0,26986.0,3657.0,147.0,0.119342,0.028886
AdaBoostClassifier,0.862756,335.0,30493.0,150.0,4754.0,0.004895,0.934172
GradientBoostingClassifier,0.881815,4542.0,26967.0,3676.0,547.0,0.119962,0.107487
ExtraTreeClassifier,0.89522,4980.0,27008.0,3635.0,109.0,0.118624,0.021419
BaggingClassifier,0.893569,4952.0,26977.0,3666.0,137.0,0.119636,0.026921
VotingClassifier,0.880639,900.0,30567.0,76.0,4189.0,0.00248,0.823148


Comparision Table for Test Accuracies:


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Neural Network-1,0.868017,1166.0,6588.0,1054.0,125.0,0.137922,0.096824,0.882627
KNeighborsClassifier,0.870032,183.0,7589.0,53.0,1108.0,0.006935,0.858249,0.567408
LogisticRegression,0.868801,1116.0,6645.0,997.0,175.0,0.130463,0.135554,0.866991
RandomForestClassifier,0.880779,1177.0,6691.0,951.0,114.0,0.124444,0.088304,0.893626
AdaBoostClassifier,0.860741,93.0,7596.0,46.0,1198.0,0.006019,0.927963,0.533009
GradientBoostingClassifier,0.876525,1137.0,6693.0,949.0,154.0,0.124182,0.119287,0.878265
ExtraTreeClassifier,0.880891,1170.0,6699.0,943.0,121.0,0.123397,0.093726,0.891439
BaggingClassifier,0.880555,1182.0,6684.0,958.0,109.0,0.12536,0.084431,0.895105
VotingClassifier,0.872607,181.0,7614.0,28.0,1110.0,0.003664,0.859799,0.568269


#### Average Feature Importance

In [50]:
(pd.concat((f_r,f_a,f_g,f_e), axis=1).mean(axis=1).sort_values(ascending=False))

script_count_sqr_rt                0.106023
a_href_out_of_domain_log           0.094348
a_count_sqr_rt                     0.080315
div_count_sqr_rt                   0.076572
script_count_log                   0.066654
meta_count_log                     0.062719
a_href_relative_sqr_rt             0.057170
a_href_https_log                   0.054526
p_count_log                        0.046808
script_type_text/javascript_log    0.045779
link_rel_stylesheet_log            0.044360
a_href_absolute_sqr_rt             0.039104
script_async_true_sqr_rt           0.033725
link_href_out_of_domain_sqr_rt     0.028297
script_src_https_log               0.026803
script_async_true_log              0.026695
img_src_.jpg_sqr_rt                0.025142
script_src_.js_sqr_rt              0.025038
link_href_out_of_domain_log        0.023991
script_src_out_of_domain_sqr_rt    0.019603
link_href_https_log                0.009731
iframe_src_https_sqr_rt            0.006596
dtype: float64

In [51]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')