In [2]:
import pandas as pd
import numpy as np


from IPython.display import display,clear_output,HTML
import warnings
warnings.filterwarnings("ignore")


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,StratifiedKFold

from sklearn.preprocessing import StandardScaler,MinMaxScaler



from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef,confusion_matrix

In [2]:
### Read HTML 
df_html=pd.read_csv('../../html/HTML.csv',index_col='domain')
df_html.drop(['Unnamed: 0'],axis=1,inplace=True)
cookie=[col for col in df_html.columns if 'number' in col]
df_html.drop(cookie,axis=1,inplace=True)
print("Shape of HTML dataset:",df_html.shape)
df_html.Target=df_html.Target.apply(lambda x: 1 if x=='Malicious' else 0)
count_columns=[col for col in df_html.columns if col.endswith('count')]
df_html['total_count']=df_html[count_columns].sum(axis=1)

href_columns=[col for col in df_html.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_html['total_href']=df_html[href_columns].sum(axis=1)


### Read JS 
df_js=pd.read_csv('../Javascript.csv',low_memory=False,index_col='domain')
cookie=[col for col in df_js.columns if 'number' in col]
df_js.drop(cookie,axis=1,inplace=True)
df_js.rename(columns={'Malicious':'Target'},inplace=True)
print("Shape of JS,DOM dataset:",df_js.shape)
df_js.Target=df_js.Target.apply(lambda x: 1 if x=='Malicious' else 0)


### Read HTTP
df_http=pd.read_csv('../HTTP.csv')
df_http.rename(columns={'Malicious':'Target'},inplace=True)
df_http.fillna(value=0,inplace=True)
cookies=[columns for columns in df_http.columns if 'number' in columns]
df_http.drop(cookies,axis=1,inplace=True)
df_http=df_http[df_http['domain']!=0]
df_http.sort_values(by='Target',inplace=True,ascending=False)
df_http.drop_duplicates(['domain'], keep='first',inplace=True)
df_http=df_http.sample(frac=1,random_state=0)
df_http.set_index(['domain'],drop=True,inplace=True)
print("Shape of HTTP dataset:",df_http.shape)



### Read URL
df_url=pd.read_csv('../URL.csv')
df_url.fillna(value=0,inplace=True)
cookies=[columns for columns in df_url.columns if 'number' in columns]
df_url.drop(cookies,axis=1,inplace=True)
df_url.drop(['url_host','url_ip'],axis=1,inplace=True)
df_url=df_url[df_url['domain']!=0]
df_url.sort_values(by='Target',inplace=True,ascending=False)
df_url.drop_duplicates(['domain'], keep='first',inplace=True)
df_url=df_url.sample(frac=1,random_state=0)
print("Shape of URL dataset:",df_url.shape)
df_url.set_index('domain',inplace=True,drop=True)

Shape of HTML dataset: (43491, 1461)
Shape of JS,DOM dataset: (43294, 401)
Shape of HTTP dataset: (45856, 672)
Shape of URL dataset: (46771, 4194)


In [3]:
df=pd.concat([df_js,df_html,df_http,df_url],axis=1,join='inner')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df['Target_z']=df[['Target','Target','Target','Target']].apply(max,axis=1)
df.drop(['Target','Target','Target','Target'],axis=1,inplace=True)
print("Shape of combined dataset:",df.shape)
df.rename(columns={'Target_z':'Target'},inplace=True)
df.Target.value_counts()

Shape of combined dataset: (39183, 6725)


0    34742
1     4441
Name: Target, dtype: int64

In [4]:
columns=['js_function_.push(',
 'a_count',
 'a_href_http',
 'a_href_https',
 'a_href_out_of_domain',
 'a_href_relative',
 'center_count',
 'form_action_http',
 'iframe_src_.html',
 'img_src_http',
 'link_href_out_of_domain',
 'link_type_text/css',
 'meta_count',
 'p_count',
 'script_async_true',
 'total_count',
 'total_href',
 'http_header_cache-control_set_max-age',
 'http_header_content-encoding_gzip',
 'http_header_server_apache',
 'http_header_transfer-encoding_chunked',
 'http_header_vary_user-agent',
 'http_header_via_1.1',
 'url_char_.',
 'url_char_f',
 'url_char_i',
 'url_char_l',
 'url_char_p',
 'url_char_w',
 'url_char_y',
 'url_char_z',
 'url_extension_.com',
 'url_extensions',
 'url_length',
 'Target']

In [5]:
df_sel=df[columns].copy(deep=True)
train=df_sel.iloc[:27428,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_sel.iloc[27428:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

In [6]:
table_80_50_test=pd.DataFrame(columns=['accuracy','TP','TN','FP','FN','FPR','FNR','auc','mcc'])

### Inital Case - RFC

In [14]:
n_estimators=[1000,500,2000,50]
criterion=['gini','entropy']
max_depth = [25,10,50]
max_depth.append(None)
min_samples_split=[4,100,50]
min_samples_leaf=[2,100,50]
max_leaf_nodes=[2000,1500]
class_weight=[{0:0.5,1:0.5},{0:0.9,1:0.1},{0:0.1,1:0.9}]


random_grid={'n_estimators':n_estimators,
            'criterion':criterion,
            'max_depth':max_depth,
            'max_leaf_nodes':max_leaf_nodes,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf,
            'class_weight':class_weight}

rf = RandomForestClassifier(random_state=0)
rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = StratifiedKFold(10), verbose=1, n_jobs = -1,scoring='precision_weighted')
rf_random.fit(scaled_X_train,train_Y)

Fitting 10 folds for each of 1728 candidates, totalling 17280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed: 63.9min
[Parallel(n_jobs=-1)]: Done 1752 tasks      | elapsed: 101.9min
[Parallel(n_jobs=-1)]: Done 2402 tasks      | elapsed: 145.6min
[Parallel(n_jobs=-1)]: Done 3152 tasks      | elapsed: 198.6min
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed: 257.5min
[Parallel(n_jobs=-1)]: Done 4952 tasks      | elapsed: 323.7min
[Parallel(n_jobs=-1)]: Done 6002 tasks      | elapsed: 408.1min
[Parallel(n_jobs=-1)]: Done 7152 tasks      | elapsed: 479.8min
[Parallel(n_jobs=-1)]: Done 8402 tasks      | elapsed: 560.1min
[Parallel(n_jobs=-1)]: Done 9752 tasks      | elapsed: 648.1min
[Parallel(n_jobs=-1)]: Done 11202 ta

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2...
             param_grid={'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.9, 1: 0.1},
                                          {0: 0.1, 1: 0.9}],
                         'criterion': ['gini', 'entr

In [15]:
rf_random.best_params_

{'class_weight': {0: 0.5, 1: 0.5},
 'criterion': 'gini',
 'max_depth': 25,
 'max_leaf_nodes': 2000,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 500}

In [16]:
print("Accuracy of the model for training data is:",accuracy_score(train_Y,rf_random.predict(scaled_X_train)))
print("Confusion Matrix for training data is:")
y_pred_train=rf_random.predict(scaled_X_train)
display(pd.crosstab(train_Y,y_pred_train))
    
y_pred_test=rf_random.predict(scaled_X_test)
acc_test=accuracy_score(test_Y,y_pred_test)
print("Accuracy of the model for test data is:",acc_test)
print("Confusion Matrix for test data is:")
display(pd.crosstab(test_Y,y_pred_test))
fpr, tpr, threshold = roc_curve(test_Y, y_pred_test)
roc_auc =auc(fpr, tpr)
print("ROC_AUC:",roc_auc)
mcc=matthews_corrcoef(test_Y,y_pred_test)
print("MCC:",mcc)

Accuracy of the model for training data is: 0.9876039084147586
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24258,69
1,271,2830


Accuracy of the model for test data is: 0.9789025946405785
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10350,65
1,183,1157


ROC_AUC: 0.928595918630563
MCC: 0.8925826032476923


In [None]:
Accuracy of the model for training data is: 0.9896456176170337
Confusion Matrix for training data is:
col_0	0	1
row_0		
0	24204	123
1	161	2940
Accuracy of the model for test data is: 0.9782220331773713
Confusion Matrix for test data is:
col_0	0	1
row_0		
0	10319	96
1	160	1180
ROC_AUC: 0.935689770064703
MCC: 0.8902339191768122
    
{'class_weight': {0: 0.3, 1: 0.7},
 'criterion': 'gini',
 'max_depth': None,
 'max_leaf_nodes': 2000,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1000}

ROC_AUC: 0.935689770064703
MCC: 0.8902339191768122
    
{'class_weight': {0: 0.3, 1: 0.7},
 'criterion': 'gini',
 'max_depth': None,
 'max_leaf_nodes': 2000,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1000}

In [None]:
Accuracy of the model for training data is: 0.9876039084147586
Confusion Matrix for training data is:
col_0	0	1
row_0		
0	24255	72
1	268	2833
Accuracy of the model for test data is: 0.979327945555083
Confusion Matrix for test data is:
col_0	0	1
row_0		
0	10351	64
1	179	1161
ROC_AUC: 0.9301364636252247
MCC: 0.8948207827041592
    
{'class_weight': {0: 0.5, 1: 0.5},
 'criterion': 'gini',
 'max_depth': 25,
 'max_leaf_nodes': 2000,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1000}

In [3]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')