In [1]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display,clear_output,HTML
import warnings
warnings.filterwarnings("ignore")

import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
warnings.simplefilter("ignore", DeprecationWarning)
import researchpy as rp

%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',305)

from sklearn.feature_selection import SelectKBest,chi2,RFE,RFECV,f_regression,SelectFromModel,f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_val_score

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import metrics



from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef

from pandas_ml import ConfusionMatrix


from imblearn.over_sampling import SMOTE

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import featuretools as ft

from scipy import stats
import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Using TensorFlow backend.


In [4]:
### Function to plot Feature Importance
def feat_impt_plot(feat_impt,y_labels,width_s=1000,height_s=1000):
    
    m=pd.DataFrame(y_labels,feat_impt).reset_index()
    m.columns=['Feature_Importance','Features']
    m.sort_values(by='Feature_Importance',inplace=True,ascending=False)
    m['Features']=m['Features'].str.replace("dom_function_","")
    m['Features']=m['Features'].str.replace("js_function_","")
    m['Features']=m['Features'].apply(lambda x: str(x).lstrip('.'))
    m['Features']=m['Features'].str.replace("(","")
    
    data = [go.Bar(x=m.Feature_Importance.values,y=m.Features.values,text=np.round(m.Feature_Importance,4),
            textposition = 'outside',
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6,
            orientation='h'
        )]
    layout = go.Layout(autosize=False,
    width=width_s,
    height=height_s,
    xaxis=dict(title='Feature Importances',
        tickfont=dict(
            size=12,
            color='black'
        )),
    yaxis=dict(automargin=True))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [38]:
def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):
    
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    
    print("Accuracy of the model for training data is:",acc_train)
    print("Confusion Matrix for training data is:")
    cm_train=ConfusionMatrix(Y_train,y_pred_train)
    display(cm_train)
    
    
    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    cm_test=ConfusionMatrix(Y_test,y_pred_test)
    display(cm_test)

    fpr, tpr, threshold = roc_curve(Y_test, y_pred_test)
    roc_auc =auc(fpr, tpr)
    mcc=matthews_corrcoef(Y_test,y_pred_test)
    
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
    if tree==True:
        feat_impt_plot(model.feature_importances_,feat)
        return model,cm_train,cm_test,acc_train,acc_test,roc_auc,model.feature_importances_,mcc,precision,recall
    
    return model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

def KNN_model(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'K-Nearest Neighbors' + '\033[0m')
    knn=KNeighborsClassifier()
    knn.fit(X_train,Y_train)

    return model_build(knn,X_train,Y_train,X_test,Y_test)

# def Logistic_model(X_train,Y_train,X_test,Y_test):
#     print('\033[1m' + 'Logistic Regression' + '\033[0m')
#     log=LogisticRegression()
#     log.fit(X_train,Y_train)
    
#     return model_build(log,X_train,Y_train,X_test,Y_test)

def RandomForest(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'RandomForest Classifier' + '\033[0m')
    rf=RandomForestClassifier(random_state=0)
    rf.fit(X_train,Y_train)

    
    return model_build(rf,X_train,Y_train,X_test,Y_test,True,feat)

def AdaBoost(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'Adaoost Classifier' + '\033[0m')
    ada=AdaBoostClassifier(random_state=0)
    ada.fit(X_train,Y_train)
    
    return model_build(ada,X_train,Y_train,X_test,Y_test,True,feat)

def GradientBoosting(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'GradientBoosting Classifier' + '\033[0m')
    gbc=GradientBoostingClassifier(random_state=0)
    gbc.fit(X_train,Y_train)
    
    return model_build(gbc,X_train,Y_train,X_test,Y_test,True,feat)

def ExtraTree(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'ExtraTree Classifier' + '\033[0m')
    ext_tree=ExtraTreesClassifier(random_state=0)
    ext_tree.fit(X_train,Y_train)
    
    return model_build(ext_tree,X_train,Y_train,X_test,Y_test,True,feat)

def Bagging(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Bagging Classifier' + '\033[0m')
    bagging=BaggingClassifier(random_state=0)
    bagging.fit(X_train,Y_train)
    
    return model_build(bagging,X_train,Y_train,X_test,Y_test)

def DeepLearning1(X_train,Y_train,X_test,Y_test):
    print('\033[1m' + 'Neural Network-1' + '\033[0m')
    np.random.seed(0)
    deep_learning=Sequential()
    deep_learning.add(Dense(units=64,activation='relu',use_bias=True,kernel_initializer='uniform',input_dim=X_train.shape[1]))
    deep_learning.add(Dense(units=128,activation='relu',use_bias=True,kernel_initializer='uniform'))
    deep_learning.add(Dense(units=1,activation='sigmoid',use_bias=True,kernel_initializer='uniform'))
    deep_learning.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    deep_learning.fit(X_train,Y_train,batch_size=20,epochs=80,verbose=False)
    acc_train=deep_learning.evaluate(X_train,Y_train,verbose=False)[1]
    print("The accuracy of the model on training data is:",acc_train)
    cm_train=ConfusionMatrix(Y_train,deep_learning.predict_classes(X_train,batch_size=1,verbose=False).reshape(1,len(X_train))[0])
    cm_test=ConfusionMatrix(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False).reshape(1,len(X_test))[0])
    display(cm_train)
    acc_test=accuracy_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    print("The accuracy of the model on test data is:",acc_test)
    display(cm_test)
    mcc=matthews_corrcoef(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    precision=precision_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    recall=recall_score(Y_test,deep_learning.predict_classes(X_test,batch_size=1,verbose=False))
    return  deep_learning,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

#### Read HTML

In [39]:
df_html=pd.read_csv('../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)

df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)

Shape of HTML dataset: (43491, 1461)


#### Read JS, DOM

In [40]:
df_js=pd.read_csv('Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)

df_js.rename(columns={'Malicious':'Target'},inplace=True)

print("Shape of JS,DOM dataset:",df_js.shape)

df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)

Shape of JS,DOM dataset: (43294, 401)


#### Read HTTP

In [41]:
df_http=pd.read_csv('HTTP.csv')

df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)

cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)

df_http=df_http[df_http['domain']!=0]

df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)

df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)

Shape of HTTP dataset: (45856, 672)


#### Read URL

In [42]:
df_url=pd.read_csv('URL.csv')
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

Shape of URL dataset: (46771, 4194)


#### Merging experiments

In [43]:
df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')

df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)


print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of combined dataset: (39183, 6724)


0    34742
1     4441
Name: Target, dtype: int64

In [44]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

### 1. Feature selection


####  1.a. Dropping constants 

In [45]:
col_no_unique=[]
for col in df.columns[:]:
    if len(df[col].value_counts())<=1:
        col_no_unique.append(col)
len(col_no_unique)

print("{} columns have only one unique value in them rendering them meaningless for classification".format(len(col_no_unique)))
### Drop those columns
df.drop(col_no_unique,axis=1,inplace=True)

pd.Series(col_no_unique).to_csv('Columns_constant.csv',index=False,header=False)

881 columns have only one unique value in them rendering them meaningless for classification


In [46]:
conv_type=df.dtypes[df.dtypes=='object'].index.values
for col in conv_type:
    try:
        df[col]=df[col].astype(int)
    except:
        print("error during object to int conversion",col)

#### 1.b. Drop features for which 99% of domains have the same values

In [47]:
lst=[]
for col in df.columns[:]:
    lst.append([col,dict(df[col].value_counts()),df[col].value_counts().values[0]>=len(df)*0.99])

col_values=pd.DataFrame(lst,columns=['Feature','Value_Counts','99_%_same_values'])
col_values.set_index('Feature',drop=True,inplace=True)

print("{} columns have more than 99% same values".format(len(col_values[col_values['99_%_same_values']==True].index.values)))

columns=col_values[col_values['99_%_same_values']==False].index.values
columns=columns.tolist()
display(col_values['99_%_same_values'].value_counts())

5283 columns have more than 99% same values


True     5283
False     559
Name: 99_%_same_values, dtype: int64

In [48]:
# rem=['http_header_vary_accept-encoding','http_header_vary_user-agent','js_function_.apply(','js_function_.log(',
#     'js_function_.defineProperty(','js_function_.keys(','js_function_.substr','js_function_.test(',
#      'js_function_unescape(','a_href_relative','link_type_application/wlwmanifest+xml','http_header_content-language_text/html',
#      'http_header_expect-ctreport-uri','tr_count','tbody_count','script_src_relative','script_src_https','link_href_https',
#      'script_src_absolute','img_src_relative','link_count','img_srcset_absolute','img_src_https',
#      'base_href_absolute','iframe_src_https','iframe_src_absolute','base_count','a_href_absolute','link_href_absolute',
#      'form_action_https','form_action_absolute','link_rel_EditURI','form_count','link_type_application/rsd+xml',
#      'nofollow_count','a_count','a_href_https','iframe_sandbox_allow_same_origin_count','img_src_absolute',
#      'base_href_out_of_domain','form_action_relative','link_href_relative','iframe_src_relative','img_srcset_relative',
#      'link_rel_stylesheet','form_enctype_application/x-www-form-urlencoded','link_href_.png','script_type_text/javascript']

# df_sel.drop(rem,axis=1,inplace=True)

In [49]:
df_sel=df[columns].copy(deep=True)
X=df_sel.iloc[:,df_sel.columns!='Target']
y=df_sel.Target
feature_name = X.columns.tolist()

In [50]:
rem=['js_function_Object.defineProperty(','js_function_Object.keys(','js_function_Math.min(','a_href_absolute','a_href_relative',
    'area_href_absolute','area_href_relative','base_href_out_of_domain','base_href_relative','base_href_absolute',
    'form_action_absolute','form_enctype_application/x-www-form-urlencoded','form_action_relative',
    'iframe_sandbox_allow_scripts_count','iframe_src_relative','link_href_absolute','link_href_relative','link_rel_wlwmanifest',
     'link_type_application/opensearchdescription+xml','link_type_application/rsd+xml','link_type_text/xml+oembed',
     'link_type_application/wlwmanifest+xml','source_src_relative','tbody_count','video_small',
     'http_header_content-language_text/html','http_header_expect-ctreport-uri','http_header_vary_accept-encoding',
     'http_header_vary_user-agent','url_word_count_bike',
    'url_extension_endswith_.br','url_extension_endswith_.de','url_extension_endswith_.net','url_extension_endswith_.org',    
    'url_extension_endswith_.ru','url_extension_.xy','url_extension_.x','url_tld_NE']

df_sel.drop(rem,axis=1,inplace=True)

In [51]:
m=df_sel.corr()

In [52]:
for col in m.columns:
    try:
        print(m[(m[col]>=0.95) & (m[col]<1)].index.values[0],"  ",col)
    except:
        continue

In [53]:
df_sel.shape

(39183, 521)

In [54]:
# X = add_constant(df_sel)
# vif_df=pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)
# vif_df[vif_df>5]

In [55]:
X=df_sel.iloc[:,df_sel.columns!='Target']
y=df_sel.Target
feature_name = X.columns.tolist()

In [56]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y)

In [57]:
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()

In [58]:
rfe_selector = RFE(estimator=ExtraTreesClassifier(random_state=0), n_features_to_select=100, step=0.3, verbose=False)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()

In [59]:
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0), threshold='23*median')
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()

In [60]:
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

In [61]:
X=df_sel[feature_selection_df[feature_selection_df.Total>=4]['Feature']]
X.fillna(value=0,inplace=True)
y=df_sel.Target.values
feature_name = X.columns.tolist()

scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

print("The train/test split ratio is 80:20")
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,y,random_state=0,test_size=0.2)

X.shape

The train/test split ratio is 80:20


(39183, 38)

### With features selected by all the methods

In [62]:
feature_name

['url_words_with_length_7',
 'url_words_with_length_5',
 'url_words_with_length_4',
 'url_word_count_cool',
 'url_tlds',
 'url_tld_XYZ',
 'url_tld_BIKE',
 'url_tld_BI',
 'url_length',
 'url_extensions',
 'url_extension_endswith_.xyz',
 'url_extension_endswith_.com',
 'url_extension_.xyz',
 'url_extension_.w',
 'url_extension_.bik',
 'url_extension_.b',
 'url_contains_www',
 'url_char_z',
 'url_char_y',
 'url_char_x',
 'url_char_w',
 'url_char_u',
 'url_char_t',
 'url_char_s',
 'url_char_p',
 'url_char_l',
 'url_char_i',
 'url_char_e',
 'url_char_b',
 'url_char_.',
 'script_src_out_of_domain',
 'script_src_.js',
 'script_count',
 'link_rel_stylesheet',
 'http_header_transfer-encoding_chunked',
 'http_header_server_centos',
 'http_header_marco_contenttext',
 'http_header_content-encoding_gzip']

In [67]:
## Table to store training and test measures so we can compare later
table_80_60_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_60_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','mcc','precision','recall'])

ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,Bagging,DeepLearning1]

In [68]:
pd.Series(feature_name).to_csv('Features_Selected.csv',index=False,header=False)
feature_name.append('Target')

In [69]:
df_min=df_sel[feature_name]

train=df_min.iloc[:27428,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_min.iloc[27428:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)

feature_name.remove('Target')

feat_imp=[None]*4
j=0
for i in range(0,7):
    if i>=1 and i<=4:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,precision,recall=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,feature_name)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_60_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_60_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,precision,recall])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.982207962666
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24263    64    24327
True         424  2677     3101
__all__    24687  2741    27428

Accuracy of the model for test data is: 0.976010208422
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10370    45    10415
True         237  1103     1340
__all__    10607  1148    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.997666618055
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24324     3    24327
True          61  3040     3101
__all__    24385  3043    27428

Accuracy of the model for test data is: 0.97771161208
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10367    48    10415
True         214  1126     1340
__all__    10581  1174    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.973749453114
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24072   255    24327
True         465  2636     3101
__all__    24537  2891    27428

Accuracy of the model for test data is: 0.969885155253
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10300   115    10415
True         239  1101     1340
__all__    10539  1216    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.97910893977
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24203   124    24327
True         449  2652     3101
__all__    24652  2776    27428

Accuracy of the model for test data is: 0.975414717142
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10347    68    10415
True         221  1119     1340
__all__    10568  1187    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 0.999963540907
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           1  3100     3101
__all__    24328  3100    27428

Accuracy of the model for test data is: 0.97771161208
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10376    39    10415
True         223  1117     1340
__all__    10599  1156    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.997520781683
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24320     7    24327
True          61  3040     3101
__all__    24381  3047    27428

Accuracy of the model for test data is: 0.977201190983
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10351    64    10415
True         204  1136     1340
__all__    10555  1200    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.997447863497


Predicted  False  True  __all__
Actual                         
False      24286    41    24327
True          29  3072     3101
__all__    24315  3113    27428

The accuracy of the model on test data is: 0.973373032752


Predicted  False  True  __all__
Actual                         
False      10284   131    10415
True         182  1158     1340
__all__    10466  1289    11755





In [70]:
table_80_60_train['index']=ind_columns
table_80_60_test['index']=ind_columns


table_80_60_test.set_index(['index'],drop=True,inplace=True)
table_80_60_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_60_train)
print("Test Results")
display(table_80_60_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.982208,2677.0,24263.0,64.0,424.0,0.002631,0.13673
RandomForestClassifier,0.997667,3040.0,24324.0,3.0,61.0,0.000123,0.019671
AdaBoostClassifier,0.973749,2636.0,24072.0,255.0,465.0,0.010482,0.149952
GradientBoostingClassifier,0.979109,2652.0,24203.0,124.0,449.0,0.005097,0.144792
ExtraTreeClassifier,0.999964,3100.0,24327.0,0.0,1.0,0.0,0.000322
BaggingClassifier,0.997521,3040.0,24320.0,7.0,61.0,0.000288,0.019671
Neural Network-1,0.997448,3072.0,24286.0,41.0,29.0,0.001685,0.009352


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,mcc,precision,recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.97601,1103.0,10370.0,45.0,237.0,0.004321,0.176866,0.909407,0.876597,0.960801,0.823134
RandomForestClassifier,0.977712,1126.0,10367.0,48.0,214.0,0.004609,0.159701,0.917845,0.885788,0.959114,0.840299
AdaBoostClassifier,0.969885,1101.0,10300.0,115.0,239.0,0.011042,0.178358,0.9053,0.845906,0.905428,0.821642
GradientBoostingClassifier,0.975415,1119.0,10347.0,68.0,221.0,0.006529,0.164925,0.914273,0.87393,0.942713,0.835075
ExtraTreeClassifier,0.977712,1117.0,10376.0,39.0,223.0,0.003745,0.166418,0.914919,0.885653,0.966263,0.833582
BaggingClassifier,0.977201,1136.0,10351.0,64.0,204.0,0.006145,0.152239,0.920808,0.883439,0.946667,0.847761
Neural Network-1,0.973373,1158.0,10284.0,131.0,182.0,0.012578,0.135821,0.920808,0.866167,0.898371,0.864179


In [71]:
## Table to store training and test measures so we can compare later
table_80_60_train=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR'])
table_80_60_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','mcc','precision','recall'])

ind_columns=['KNeighborsClassifier','RandomForestClassifier','AdaBoostClassifier',
             'GradientBoostingClassifier','ExtraTreeClassifier','BaggingClassifier','Neural Network-1']

f=[KNN_model,RandomForest,AdaBoost,GradientBoosting, ExtraTree,Bagging,DeepLearning1]

In [72]:
X=df_sel[feature_selection_df[feature_selection_df.Total>=3]['Feature']]
X.fillna(value=0,inplace=True)
y=df_sel.Target.values
feature_name = X.columns.tolist()

scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

print("The train/test split ratio is 80:20")
X_train, X_test, Y_train, Y_test = train_test_split(scaled_X,y,random_state=0,test_size=0.2)

X.shape

The train/test split ratio is 80:20


(39183, 66)

### With features selected by three methods

In [73]:
feature_name

['url_words_with_length_7',
 'url_words_with_length_5',
 'url_words_with_length_4',
 'url_word_count_cool',
 'url_tlds',
 'url_tld_XYZ',
 'url_tld_BIKE',
 'url_tld_BI',
 'url_length',
 'url_extensions',
 'url_extension_endswith_.xyz',
 'url_extension_endswith_.com',
 'url_extension_.xyz',
 'url_extension_.w',
 'url_extension_.bik',
 'url_extension_.b',
 'url_contains_www',
 'url_char_z',
 'url_char_y',
 'url_char_x',
 'url_char_w',
 'url_char_u',
 'url_char_t',
 'url_char_s',
 'url_char_p',
 'url_char_l',
 'url_char_i',
 'url_char_e',
 'url_char_b',
 'url_char_.',
 'script_src_out_of_domain',
 'script_src_.js',
 'script_count',
 'link_rel_stylesheet',
 'http_header_transfer-encoding_chunked',
 'http_header_server_centos',
 'http_header_marco_contenttext',
 'http_header_content-encoding_gzip',
 'url_words_with_length_6',
 'url_extension_.h',
 'url_extension_.com',
 'url_char_n',
 'url_char_f',
 'url_char_a',
 'url_char_6',
 'url_char_-',
 'script_type_text/javascript',
 'script_src_rela

In [74]:
feature_name.append('Target')
df_min=df_sel[feature_name]

train=df_min.iloc[:27428,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_min.iloc[27428:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)

feature_name.remove('Target')

feat_imp=[None]*4
j=0
for i in range(0,7):
    if i>=1 and i<=4:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,feat_imp[j],mcc,precision,recall=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y,feature_name)
        j=j+1
    else:
        model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall=f[i](scaled_X_train,train_Y,scaled_X_test,test_Y)
    table_80_60_train.loc[i]=   ([acc_train,cm_train.TP,cm_train.TN,cm_train.FP,cm_train.FN,cm_train.FPR,cm_train.FNR])
    table_80_60_test.loc[i] =   ([acc_test ,cm_test.TP, cm_test.TN, cm_test.FP, cm_test.FN, cm_test.FPR, cm_test.FNR,roc_auc,mcc,precision,recall])
    print("\n")

[1mK-Nearest Neighbors[0m
Accuracy of the model for training data is: 0.984213212775
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24249    78    24327
True         355  2746     3101
__all__    24604  2824    27428

Accuracy of the model for test data is: 0.977371331348
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10365    50    10415
True         216  1124     1340
__all__    10581  1174    11755



[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.997994749891
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True          55  3046     3101
__all__    24382  3046    27428

Accuracy of the model for test data is: 0.978477243726
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10366    49    10415
True         204  1136     1340
__all__    10570  1185    11755



[1mAdaoost Classifier[0m
Accuracy of the model for training data is: 0.975390112294
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24093   234    24327
True         441  2660     3101
__all__    24534  2894    27428

Accuracy of the model for test data is: 0.972352190557
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10299   116    10415
True         209  1131     1340
__all__    10508  1247    11755



[1mGradientBoosting Classifier[0m
Accuracy of the model for training data is: 0.98074959895
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24216   111    24327
True         417  2684     3101
__all__    24633  2795    27428

Accuracy of the model for test data is: 0.976945980434
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10351    64    10415
True         207  1133     1340
__all__    10558  1197    11755



[1mExtraTree Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24327     0    24327
True           0  3101     3101
__all__    24327  3101    27428

Accuracy of the model for test data is: 0.980859208847
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10386    29    10415
True         196  1144     1340
__all__    10582  1173    11755



[1mBagging Classifier[0m
Accuracy of the model for training data is: 0.997666618055
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      24326     1    24327
True          63  3038     3101
__all__    24389  3039    27428

Accuracy of the model for test data is: 0.979327945555
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False      10365    50    10415
True         193  1147     1340
__all__    10558  1197    11755



[1mNeural Network-1[0m
The accuracy of the model on training data is: 0.998249963541


Predicted  False  True  __all__
Actual                         
False      24296    31    24327
True          17  3084     3101
__all__    24313  3115    27428

The accuracy of the model on test data is: 0.973968524032


Predicted  False  True  __all__
Actual                         
False      10270   145    10415
True         161  1179     1340
__all__    10431  1324    11755





In [75]:
table_80_60_train['index']=ind_columns
table_80_60_test['index']=ind_columns


table_80_60_test.set_index(['index'],drop=True,inplace=True)
table_80_60_train.set_index(['index'],drop=True,inplace=True)

print("Training Results")
display(table_80_60_train)
print("Test Results")
display(table_80_60_test)

Training Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNeighborsClassifier,0.984213,2746.0,24249.0,78.0,355.0,0.003206,0.114479
RandomForestClassifier,0.997995,3046.0,24327.0,0.0,55.0,0.0,0.017736
AdaBoostClassifier,0.97539,2660.0,24093.0,234.0,441.0,0.009619,0.142212
GradientBoostingClassifier,0.98075,2684.0,24216.0,111.0,417.0,0.004563,0.134473
ExtraTreeClassifier,1.0,3101.0,24327.0,0.0,0.0,0.0,0.0
BaggingClassifier,0.997667,3038.0,24326.0,1.0,63.0,4.1e-05,0.020316
Neural Network-1,0.99825,3084.0,24296.0,31.0,17.0,0.001274,0.005482


Test Results


Unnamed: 0_level_0,accuracy,TP,TN,FP,FN,FPR,FNR,auc,mcc,precision,recall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
KNeighborsClassifier,0.977371,1124.0,10365.0,50.0,216.0,0.004801,0.161194,0.917003,0.884002,0.957411,0.838806
RandomForestClassifier,0.978477,1136.0,10366.0,49.0,204.0,0.004705,0.152239,0.921528,0.889902,0.95865,0.847761
AdaBoostClassifier,0.972352,1131.0,10299.0,116.0,209.0,0.011138,0.15597,0.916446,0.859563,0.906977,0.84403
GradientBoostingClassifier,0.976946,1133.0,10351.0,64.0,207.0,0.006145,0.154478,0.919689,0.882067,0.946533,0.845522
ExtraTreeClassifier,0.980859,1144.0,10386.0,29.0,196.0,0.002784,0.146269,0.925473,0.902301,0.975277,0.853731
BaggingClassifier,0.979328,1147.0,10365.0,50.0,193.0,0.004801,0.14403,0.925585,0.894458,0.958229,0.85597
Neural Network-1,0.973969,1179.0,10270.0,145.0,161.0,0.013922,0.120149,0.925585,0.870477,0.890483,0.879851
