In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt


import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve,auc,confusion_matrix, accuracy_score, precision_score, \
                            recall_score, matthews_corrcoef

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,\
                             VotingClassifier
from sklearn.svm import OneClassSVM
import featuretools as ft
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from IPython.display import display,clear_output,HTML



In [2]:
### Read HTML 
df_html=pd.read_csv('../../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)
df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)
count_columns=[col for col in df_html.columns if col.endswith('count')]
df_html['total_count']=df_html[count_columns].sum(axis=1)

href_columns=[col for col in df_html.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_html['total_href']=df_html[href_columns].sum(axis=1)

total_img_src=[col for col in df_html.columns if 'img_src_' in col]
df_html['total_img_src']=df_html[total_img_src].sum(axis=1)


### Read JS 
df_js=pd.read_csv('../Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)
df_js.rename(columns={'Malicious':'Target'},inplace=True)
print("Shape of JS,DOM dataset:",df_js.shape)
df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)


### Read HTTP
df_http=pd.read_csv('../HTTP.csv')
df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)
cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)
df_http=df_http[df_http['domain']!=0]
df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)
df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)



### Read URL
df_url=pd.read_csv('../URL.csv')
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)
print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of HTML dataset: (43491, 1461)
Shape of JS,DOM dataset: (43294, 401)
Shape of HTTP dataset: (45856, 672)


  interactivity=interactivity, compiler=compiler, result=result)


Shape of URL dataset: (46771, 4194)
Shape of combined dataset: (39183, 6726)


0    34742
1     4441
Name: Target, dtype: int64

In [3]:
columns=['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_out_of_domain',
 'img_src_out_of_domain',
 'img_srcset_out_of_domain',
 'link_href_out_of_domain',
 'object_data_out_of_domain',
 'script_src_out_of_domain',
 'source_src_out_of_domain',
 'source_srcset_out_of_domain',
 'video_src_out_of_domain',
 'http_header_cache-control_set_max-age',
 'http_header_cache-control_set_must-revalidate',
 'http_header_cache-control_set_no-cache',
 'http_header_cache-control_set_no-store',
 'http_header_cache-control_set_private',
 'http_header_cache-control_set_public',
 'http_header_content-encoding_gzip',
 'http_header_content-language_text/html',
 'http_header_content-length',
 'http_header_server_apache',
 'http_header_server_nginx',
 'url_char_-',
 'url_char_.',
 'url_char_a',
 'url_char_b',
 'url_char_c',
 'url_char_d',
 'url_char_e',
 'url_char_f',
 'url_char_g',
 'url_char_h',
 'url_char_i',
 'url_char_j',
 'url_char_k',
 'url_char_l',
 'url_char_m',
 'url_char_n',
 'url_char_o',
 'url_char_p',
 'url_char_r',
 'url_char_s',
 'url_char_t',
 'url_char_u',
 'url_char_v',
 'url_char_w',
 'url_char_x',
 'url_char_y',
 'url_char_z',
 'url_extension_.com',
 'url_extension_.i',
 'url_extension_.net',
 'url_extensions',
 'url_length',
 'url_tlds',
 'url_words_with_length_4',
 'url_words_with_length_5',
 'url_words_with_length_6',
 'url_words_with_length_7',
 'url_words_with_length_8',
 'Target']

print("features being used")
display(columns)
df_sel=df[columns].copy(deep=True)

features being used


['dom_function_document.createElement(',
 'dom_function_document.write(',
 'dom_function_window.addEventListener(',
 'dom_function_window.setInterval(',
 'dom_function_window.setTimeout(',
 'js_function_.charCodeAt(',
 'js_function_.concat(',
 'js_function_.exec(',
 'js_function_.link(',
 'js_function_.search(',
 'js_function_.substring(',
 'js_function_escape(',
 'js_function_eval(',
 'js_function_parseInt(',
 'js_function_String.fromCharCode(',
 'js_function_unescape(',
 'js_function_.replace(',
 'a_count',
 'div_count',
 'embed_count',
 'frame_count',
 'iframe_count',
 'img_count',
 'input_count',
 'link_count',
 'meta_count',
 'object_count',
 'style_count',
 'title_count',
 'total_count',
 'total_href',
 'total_img_src',
 'iframe_hidden_true',
 'script_type_text/javascript',
 'a_href_out_of_domain',
 'area_href_out_of_domain',
 'audio_src_out_of_domain',
 'base_href_out_of_domain',
 'embed_src_out_of_domain',
 'form_action_out_of_domain',
 'frame_src_out_of_domain',
 'iframe_src_o

In [4]:
def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):
    
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    print("Accuracy of the model for training data is:",acc_train)
    print("Confusion Matrix for training data is:")
    display(pd.crosstab(Y_train,y_pred_train))
    
    
    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    display(pd.crosstab(Y_test,y_pred_test))
    CM = confusion_matrix(Y_test, y_pred_test)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    FPR = FP/(FP+TN)
    FNR = FN/(TP+FN)
    fpr, tpr, threshold = roc_curve(Y_test, y_pred_test)
    roc_auc =auc(fpr, tpr)
    mcc=matthews_corrcoef(Y_test,y_pred_test)
    
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
    if tree==True:
#         feat_impt_plot(model.feature_importances_,feat)
        return model,acc_train,acc_test,roc_auc,model.feature_importances_,mcc,precision,recall
    
    return model,acc_train,acc_test,roc_auc,mcc,precision,recall, FPR, FNR

### 1. No Sampling

In [5]:
X_train, X_test = train_test_split(df_sel, test_size=0.3, random_state=42)

Y_train = X_train['Target']
X_train = X_train.drop(['Target'], axis=1) 

sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)

Y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
scaled_X_test = sc.transform(X_test)

In [6]:
rf_model = RandomForestClassifier(random_state=0)

et_model = ExtraTreesClassifier(random_state=0)

grad_model = GradientBoostingClassifier(random_state=0)


base_est =[ ( 'rf' , rf_model ) , ( 'gradient' , grad_model ) , ( 'extratree' , et_model ) ]


print('\033[1m' + 'Voting Classifier' + '\033[0m')
vc_model=VotingClassifier(estimators = base_est, voting = 'hard', weights = [1,1,1] )
vc_model.fit(scaled_X_train,Y_train)
vc_model,acc_train,acc_test,roc_auc,mcc,precision,recall,fpr,fnr=model_build(vc_model,scaled_X_train,Y_train,scaled_X_test,Y_test,tree=False,feat=None)
print("MCC : ", mcc)
print("Precison : ",precision)
print("Recall : ",recall)
print("AUC : " , roc_auc)
print("FPR:" ,fpr)
print("FNR:" ,fnr)

[1mVoting Classifier[0m
Accuracy of the model for training data is: 0.9999635409071023
Confusion Matrix for training data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24344,0
1,1,3083


Accuracy of the model for test data is: 0.9823054019566142
Confusion Matrix for test data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10364,34
1,174,1183


MCC :  0.910989404120097
Precison :  0.9720624486442071
Recall :  0.8717759764185704
AUC :  0.9342530584150941
FPR: 0.0032698595883823814
FNR: 0.12822402358142962


### 2. Over-Sampling

In [7]:
sm = SMOTE(random_state=12, ratio = {1:24344})
X_train, Y_train = sm.fit_sample(scaled_X_train,Y_train)
rf_model = RandomForestClassifier(random_state=0)

et_model = ExtraTreesClassifier(random_state=0)

grad_model = GradientBoostingClassifier(random_state=0)


base_est =[ ( 'rf' , rf_model ) , ( 'gradient' , grad_model ) , ( 'extratree' , et_model ) ]


print('\033[1m' + 'Voting Classifier' + '\033[0m')
vc_model=VotingClassifier(estimators = base_est, voting = 'soft', weights = [1,1,2] )
vc_model.fit(X_train,Y_train)
vc_model,acc_train,acc_test,roc_auc,mcc,precision,recall,fpr,fnr=model_build(vc_model,X_train,Y_train,scaled_X_test,Y_test,tree=False,feat=None)
print("MCC : ", mcc)
print("Precison : ",precision)
print("Recall : ",recall)
print("AUC : " , roc_auc)
print("FPR:" ,fpr)
print("FNR:" ,fnr)



[1mVoting Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24344,0
1,0,24344


Accuracy of the model for test data is: 0.9821352615908124
Confusion Matrix for test data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10323,75
1,135,1222


MCC :  0.9111067755251203
Precison :  0.9421742482652274
Recall :  0.9005158437730287
AUC :  0.9466514591052102
FPR: 0.007212925562608194
FNR: 0.09948415622697127


### 3. Under-Sampling

In [8]:
X_train, X_test = train_test_split(df_sel, test_size=0.3, random_state=42)


df_non = X_train[X_train.Target==0]
df_mal = X_train[X_train.Target==1]

df_non=df_non.sample(frac=len(df_mal)/len(df_non),random_state=0)
df_u=df_non.append(df_mal)

Y_train = df_u['Target']
X_train = df_u.drop(['Target'], axis=1) 

sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)

Y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
scaled_X_test = sc.transform(X_test)

rf_model = RandomForestClassifier(random_state=0)

et_model = ExtraTreesClassifier(random_state=0)

grad_model = GradientBoostingClassifier(random_state=0)


base_est =[ ( 'rf' , rf_model ) , ( 'gradient' , grad_model ) , ( 'extratree' , et_model ) ]


print('\033[1m' + 'Voting Classifier' + '\033[0m')
vc_model=VotingClassifier(estimators = base_est, voting = 'hard', weights = [1,1,1] )
vc_model.fit(scaled_X_train,Y_train)
vc_model,acc_train,acc_test,roc_auc,mcc,precision,recall,fpr,fnr=model_build(vc_model,scaled_X_train,Y_train,scaled_X_test,Y_test,tree=False,feat=None)
print("MCC : ", mcc)
print("Precison : ",precision)
print("Recall : ",recall)
print("AUC : " , roc_auc)
print("FPR:" ,fpr)
print("FNR:" ,fnr)

[1mVoting Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3084,0
1,0,3084


Accuracy of the model for test data is: 0.9599319438536793
Confusion Matrix for test data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9991,407
1,64,1293


MCC :  0.8301385452458547
Precison :  0.7605882352941177
Recall :  0.952837140751658
AUC :  0.9568474990159521
FPR: 0.039142142719753796
FNR: 0.04716285924834193


### 4. Feature Transformation

In [9]:
df.reset_index(inplace=True)
columns.remove('Target')
columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

Built 19503 features
EntitySet scattered to workers in 4.766 seconds
Elapsed: 01:49 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [14]:
df.set_index(['domain'],drop=True,inplace=True)

In [15]:
feature_name= ['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']
display(feature_name)
feature_matrix_sessions = feature_matrix_sessions[feature_name]

['url_tlds / total_count',
 'url_length / total_count',
 'url_extensions / total_count',
 'url_char_z / total_count',
 'url_words_with_length_5 / total_count',
 'url_words_with_length_4 / total_count',
 'url_extensions + url_length',
 'url_char_w + url_length',
 'url_char_u / total_count',
 'url_char_t / total_count',
 'url_char_s / total_count',
 'url_char_l / total_count',
 'url_char_i / total_count',
 'url_char_e / total_count',
 'url_char_. / total_count',
 'url_char_. + url_char_z',
 'url_char_. + url_char_y',
 'url_char_. + url_char_x',
 'url_char_. + url_char_w',
 'url_char_.',
 'js_function_String.fromCharCode( + url_length',
 'http_header_content-language_text/html / url_char_.',
 'http_header_content-language_text/html / title_count',
 'http_header_content-language_text/html + url_length',
 'http_header_content-encoding_gzip + http_header_content-language_text/html',
 'frame_src_out_of_domain + url_char_.',
 'frame_count + url_char_.']

In [16]:
fs=pd.merge(feature_matrix_sessions,pd.DataFrame(df_sel.Target),left_index=True, right_index=True)
fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)
fs=fs.reindex(df.index.values)
X=fs.loc[:,fs.columns!='Target']
y=fs.Target
feature_name = X.columns.tolist()

In [17]:
fs=fs.reindex(df_sel.index.values)
X_train, X_test = train_test_split(fs, test_size=0.3, random_state=0)

# X_train.replace([np.inf],0,inplace=True)
# X_train.fillna(value=0,inplace=True)
Y_train = X_train['Target']
X_train = X_train.drop(['Target'], axis=1) 

sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)

Y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
scaled_X_test = sc.transform(X_test)

In [18]:
rf_model = RandomForestClassifier(random_state=0)

et_model = ExtraTreesClassifier(random_state=0)

grad_model = GradientBoostingClassifier(random_state=0)


base_est =[ ( 'rf' , rf_model ) , ( 'gradient' , grad_model ) , ( 'extratree' , et_model ) ]


print('\033[1m' + 'Voting Classifier' + '\033[0m')
vc_model=VotingClassifier(estimators = base_est, voting = 'hard', weights = [1,1,1] )
vc_model.fit(scaled_X_train,Y_train)
vc_model,acc_train,acc_test,roc_auc,mcc,precision,recall,fpr,fnr=model_build(vc_model,scaled_X_train,Y_train,scaled_X_test,Y_test,tree=False,feat=None)
print("MCC : ", mcc)
print("Precison : ",precision)
print("Recall : ",recall)
print("AUC : " , roc_auc)
print("FPR:" ,fpr)
print("FNR:" ,fnr)

[1mVoting Classifier[0m
Accuracy of the model for training data is: 0.9997812454426134
Confusion Matrix for training data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24325,0
1,6,3097


Accuracy of the model for test data is: 0.9785623139089749
Confusion Matrix for test data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10369,48
1,204,1134


MCC :  0.8901862273915041
Precison :  0.9593908629441624
Recall :  0.8475336322869955
AUC :  0.9214628898691386
FPR: 0.004607852548718441
FNR: 0.15246636771300448


### 5. PCA

In [19]:
df.reset_index(inplace=True)
# columns.remove('Target')
# columns.append('domain')
df_min=df[columns]

es = ft.EntitySet(id = 'malicious')
es.entity_from_dataframe(entity_id = 'data', dataframe = df_min, index = 'domain')

feature_matrix_sessions, features_defs = ft.dfs(entityset=es,target_entity="data",
                                                trans_primitives = ['add','divide','multiply'],n_jobs=-1,
                                                verbose=1,max_depth=1)

df.set_index(['domain'],drop=True,inplace=True)
fs=pd.merge(feature_matrix_sessions,pd.DataFrame(df_sel.Target),left_index=True, right_index=True)
fs.replace([np.inf],0,inplace=True)
fs.fillna(value=0,inplace=True)
fs=fs.reindex(df.index.values)

Built 19503 features
EntitySet scattered to workers in 4.768 seconds
Elapsed: 01:47 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [20]:
X_train, X_test = train_test_split(fs, test_size=0.3, random_state=0)
X=X_train.drop(['Target'] , axis=1)
y=X_train.Target.values
feature_name = X.columns.tolist()
X=np.nan_to_num(X)
scaler=StandardScaler() 
scaled_X=scaler.fit_transform(X)

In [21]:
pca=PCA(n_components=750,whiten=True,random_state=0)
X_pca_21=pca.fit_transform(scaled_X)
print("Variance explained using 750 components is:",sum(pca.explained_variance_ratio_))
y_test = X_test['Target']
X_test = X_test.drop(['Target'], axis=1)
X_test_pca = pca.transform(X_test)

Variance explained using 750 components is: 0.8142755676566173


In [22]:
rf_model = RandomForestClassifier(random_state=0)

et_model = ExtraTreesClassifier(random_state=0)

grad_model = GradientBoostingClassifier(random_state=0)


base_est =[ ( 'rf' , rf_model ) , ( 'gradient' , grad_model ) , ( 'extratree' , et_model ) ]


print('\033[1m' + 'Voting Classifier' + '\033[0m')
vc_model=VotingClassifier(estimators = base_est, voting = 'soft', weights = [1,1,1] )
vc_model.fit(X_pca_21,y)
vc_model,acc_train,acc_test,roc_auc,mcc,precision,recall,fpr,fnr=model_build(vc_model,X_pca_21,y,X_test_pca,y_test,tree=False,feat=None)
print("MCC : ", mcc)
print("Precison : ",precision)
print("Recall : ",recall)
print("AUC : " , roc_auc)
print("FPR:" ,fpr)
print("FNR:" ,fnr)

[1mVoting Classifier[0m
Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24325,0
1,0,3103


Accuracy of the model for test data is: 0.5482773287962569
Confusion Matrix for test data is:


col_0,0,1
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6286,4131
1,1179,159


MCC :  -0.18322191820810466
Precison :  0.03706293706293706
Recall :  0.11883408071748879
AUC :  0.361135385371704
FPR: 0.39656330997408085
FNR: 0.8811659192825112


In [23]:
### Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')