In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import researchpy as rp


from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef
# from pandas_ml import ConfusionMatrix
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE,ADASYN
import imblearn.over_sampling as over
from IPython.display import display,clear_output,HTML

In [2]:
def RandomForest(X_train,Y_train,X_test,Y_test,feat):
    print('\033[1m' + 'RandomForest Classifier' + '\033[0m')
    rf=RandomForestClassifier(n_estimators=100,random_state=10)
    rf.fit(X_train,Y_train)
    return model_build(rf,X_train,Y_train,X_test,Y_test,True,feat)

def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):    
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    print("Accuracy of the model for training data is:",accuracy_score(Y_train,model.predict(X_train)))
    print("Confusion Matrix for training data is:")
    y_pred_train=model.predict(X_train)
    display(pd.crosstab(Y_train,y_pred_train))
    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    display(pd.crosstab(Y_test,y_pred_test))
    fpr, tpr, threshold = roc_curve(Y_test, y_pred_test)
    roc_auc =auc(fpr, tpr)
    mcc=matthews_corrcoef(Y_test,y_pred_test)
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
#     if tree==True:
#         feat_impt_plot(model.feature_importances_,feat)
#         return model,cm_train,cm_test,acc_train,acc_test,roc_auc,model.feature_importances_,mcc,precision,recall
    return model,acc_train,acc_test,roc_auc,mcc,precision,recall

### Function to plot Feature Importance
def feat_impt_plot(feat_impt,y_labels,width_s=1000,height_s=1000):
    
    m=pd.DataFrame(y_labels,feat_impt).reset_index()
    m.columns=['Feature_Importance','Features']
    m.sort_values(by='Feature_Importance',inplace=True,ascending=False)
    m['Features']=m['Features'].str.replace("dom_function_","")
    m['Features']=m['Features'].str.replace("js_function_","")
    m['Features']=m['Features'].apply(lambda x: str(x).lstrip('.'))
    m['Features']=m['Features'].str.replace("(","")
    
    data = [go.Bar(x=m.Feature_Importance.values,y=m.Features.values,text=np.round(m.Feature_Importance,4),
            textposition = 'outside',
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6,
            orientation='h'
        )]
    layout = go.Layout(autosize=False,
    width=width_s,
    height=height_s,
    xaxis=dict(title='Feature Importances',
        tickfont=dict(
            size=12,
            color='black'
        )),
    yaxis=dict(automargin=True))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [3]:
df_train=pd.read_csv('../../Data/Training_Data/df_training_prev.csv',index_col='domain')

# df_mal=pd.read_csv('../../Data/Malicious_domains.csv')
# mal_domains=df_mal.queried_domain.values.tolist()

df_test=pd.read_csv('../../Data/Test_Data/df_fin_prev.csv',index_col='domain')
# df_test.reset_index(inplace=True)
# df_test['Target']=df_test['domain'].apply(lambda x: 1 if x in mal_domains else 0)
# df_test.set_index(['domain'],inplace=True)

In [4]:
df_prev=pd.concat([df_test,df_train],axis=0,sort=False)
df_prev=df_prev.sample(frac=1,random_state=10)

In [5]:
corr_df=[]
for col in df_prev.columns:
    if col!='Target':
        corr_df.append((df_prev[col].corr(df_prev['Target']),col))
        
corr_df=pd.DataFrame(corr_df)
corr_df.columns=['Correlation_Value','Columns']
corr_df['Correlation_Value']=corr_df['Correlation_Value'].apply(lambda x: np.absolute(x))
corr_df.sort_values(by='Correlation_Value',inplace=True,ascending=False)

In [6]:
corr_df.head(20)

Unnamed: 0,Correlation_Value,Columns
61,0.345751,url_char_-
57,0.05161,http_header_content-language_text/html
92,0.047706,url_length
84,0.040622,url_char_w
35,0.038754,link_count
91,0.035611,url_extensions
56,0.032799,http_header_content-encoding_gzip
41,0.03245,script_type_text/javascript
36,0.031223,link_href_out_of_domain
88,0.028537,url_extension_.com


#### Training - 20% of Combined Dataset1 and Dataset2 used. 

In [13]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.2),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.2):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.999920589543231
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,180330,1
1,14,8547


Accuracy of the model for test data is: 0.9578556586418201
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,720976,623
1,31220,2751


MCC: 0.24899540812403015
AUC: 0.5400587380988467
Precision: 0.8153526970954357
Recall: 0.08098083659592005


#### Training - 30% of Combined Dataset1 and Dataset2 used. 

In [14]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.3),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.3):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998729432691696
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,270540,3
1,33,12762


Accuracy of the model for test data is: 0.9579503996224611
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,630829,558
1,27242,2495


MCC: 0.25378806130001613
AUC: 0.5415092203883977
Precision: 0.8172289551261055
Recall: 0.08390220936879982


#### Training - 40% of Combined Dataset1 and Dataset2 used. 

In [7]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.4),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.4):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998517671473647
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,360800,8
1,48,16928


Accuracy of the model for test data is: 0.9580908381832364
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,540583,539
1,23210,2346


MCC: 0.2647635865835293
AUC: 0.5454011624938441
Precision: 0.8131715771230502
Recall: 0.09179840350602599


#### Training - 50% of Combined Dataset1 and Dataset2 used. 

In [8]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.5),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.5):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998221209535164
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,451060,10
1,74,21087


Accuracy of the model for test data is: 0.9579231350758421
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,450379,481
1,19389,1982


MCC: 0.26455340746779377
AUC: 0.5458378257503953
Precision: 0.8047097036134795
Recall: 0.09274250152075242


#### Training - 60% of Combined Dataset1 and Dataset2 used. 

In [10]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.6),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.6):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.999798827197857
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,541155,16
1,98,25408


Accuracy of the model for test data is: 0.9582381513294599
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,360310,449
1,15328,1698


MCC: 0.2717910860314553
AUC: 0.5492426133896037
Precision: 0.7908709827666511
Recall: 0.09972982497356983


#### Training - 70% of Combined Dataset1 and Dataset2 used. 

In [11]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.7),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.7):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9997746259016854
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,631322,20
1,129,29652


Accuracy of the model for test data is: 0.958279657936253
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,270256,332
1,11489,1262


MCC: 0.27092660737062685
AUC: 0.54887283600451
Precision: 0.7917189460476788
Recall: 0.09897262959767861


### OverSampling

#### Training - 20% of Combined Dataset1 and Dataset2 used. 
#### Oversampling ratio 1:1. Technique - SMOTE

In [9]:
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.3),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.3):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

train.Target.value_counts()

0    270543
1     12795
Name: Target, dtype: int64

In [13]:
sm = SMOTE(random_state=12, ratio = {1:270543})
X_train, Y_train = sm.fit_sample(scaled_X_train,train_Y)

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(X_train,Y_train,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9999408596784984
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,270532,11
1,21,270522


Accuracy of the model for test data is: 0.956658660100072
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,627205,4182
1,24472,5265


MCC: 0.29762940016295025
MCC: 0.29762940016295025
AUC: 0.5852143221245958
Precision: 0.5573197840584313
Recall: 0.17705215724518278


#### Training - 20% of Combined Dataset1 and Dataset2 used. 
#### Oversampling ratio 1:1. Technique - ADASYN

In [10]:
sm = ADASYN(random_state=12, ratio = {1:270543},n_neighbors=10,n_jobs=-1)
X_train, Y_train = sm.fit_sample(scaled_X_train,train_Y)

model,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(X_train,Y_train,scaled_X_test,test_Y,columns)
print("MCC:",mcc)
print("AUC:",roc_auc)
print("Precision:",prec)
print("Recall:",rec)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9999406024765055
Confusion Matrix for training data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,270533,10
1,22,268178


Accuracy of the model for test data is: 0.9561716107719581
Confusion Matrix for test data is:


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,626241,5146
1,23830,5907


MCC: 0.3079290285674133
AUC: 0.5952455563971996
Precision: 0.534425042974758
Recall: 0.19864142314288596


In [3]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells 
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();g
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')