In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import researchpy as rp


from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef
# from pandas_ml import ConfusionMatrix
import warnings
warnings.filterwarnings('once')
from imblearn.over_sampling import SMOTE,ADASYN
import imblearn.over_sampling as over
from IPython.display import display,clear_output,HTML



In [2]:
def RandomForest(X_train,Y_train,X_test,Y_test,feat,n_estimators=100,random_state=10):
    """Fit a random-forest classifier and evaluate it with ``model_build``.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels passed to ``RandomForestClassifier.fit``.
    X_test, Y_test
        Held-out features and labels, forwarded to ``model_build``.
    feat
        Feature names, forwarded unchanged to ``model_build``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 10
        Seed handed to the classifier so repeated runs build the same forest.

    Returns
    -------
    tuple
        Whatever ``model_build`` returns for the fitted forest.
    """
    # ANSI escape codes: print the banner in bold.
    print('\033[1m' + 'RandomForest Classifier' + '\033[0m')
    rf=RandomForestClassifier(n_estimators=n_estimators,random_state=random_state)
    rf.fit(X_train,Y_train)
    return model_build(rf,X_train,Y_train,X_test,Y_test,True,feat)

def _confusion_table(y_true,y_pred):
    """Cross-tabulate actual vs. predicted labels, including row/column totals."""
    return pd.crosstab(np.asarray(y_true).ravel(),np.asarray(y_pred).ravel(),
                       rownames=['Actual'],colnames=['Predicted'],
                       margins=True,margins_name='__all__')

def model_build(model,X_train,Y_train,X_test,Y_test,tree=False,feat=None):
    """Evaluate an already-fitted classifier on the training and test sets.

    Prints and displays accuracy and a confusion table for both sets, then
    computes ROC AUC, Matthews correlation, precision and recall on the test
    set. Precision and recall use scikit-learn's default (binary) averaging,
    so a two-class target is expected.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` (and optionally ``predict_proba``).
    X_train, Y_train
        Training features and labels.
    X_test, Y_test
        Test features and labels.
    tree, feat
        Currently unused; kept so existing callers keep working. They belonged
        to a feature-importance plot that is disabled.

    Returns
    -------
    tuple
        ``(model, cm_train, cm_test, acc_train, acc_test, roc_auc, mcc,
        precision, recall)`` where the two ``cm_*`` entries are pandas
        DataFrames (actual labels as rows, predicted labels as columns,
        totals in ``__all__``).
    """
    y_pred_train=model.predict(X_train)
    acc_train=accuracy_score(Y_train,y_pred_train)
    print("Accuracy of the model for training data is:",acc_train)
    print("Confusion Matrix for training data is:")
    # pd.crosstab replaces pandas_ml.ConfusionMatrix, whose import is commented
    # out in this notebook (the name would be undefined on a fresh kernel).
    cm_train=_confusion_table(Y_train,y_pred_train)
    display(cm_train)

    y_pred_test=model.predict(X_test)
    acc_test=accuracy_score(Y_test,y_pred_test)
    print("Accuracy of the model for test data is:",acc_test)
    print("Confusion Matrix for test data is:")
    cm_test=_confusion_table(Y_test,y_pred_test)
    display(cm_test)

    # A ROC curve needs a continuous score; hard 0/1 predictions collapse it to
    # a single operating point. Use the probability of the second class when
    # the model provides it, otherwise fall back to the predicted labels.
    test_scores=y_pred_test
    if hasattr(model,'predict_proba'):
        proba=np.asarray(model.predict_proba(X_test))
        if proba.ndim==2 and proba.shape[1]==2:
            test_scores=proba[:,1]
    fpr, tpr, threshold = roc_curve(Y_test, test_scores)
    roc_auc =auc(fpr, tpr)

    mcc=matthews_corrcoef(Y_test,y_pred_test)
    precision=precision_score(Y_test,y_pred_test)
    recall=recall_score(Y_test,y_pred_test)
    return model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,precision,recall

### Function to plot Feature Importance
def feat_impt_plot(feat_impt,y_labels,width_s=1000,height_s=1000):
    """Plot feature importances as a horizontal bar chart.

    Parameters
    ----------
    feat_impt
        Importance value for each feature (same length as ``y_labels``).
    y_labels
        Feature names; the ``dom_function_`` / ``js_function_`` prefixes,
        leading dots and ``(`` characters are stripped for display.
    width_s, height_s : int, default 1000
        Figure width and height passed to the plotly layout.

    NOTE(review): this function uses ``go`` and ``iplot``, which are not
    imported anywhere in the visible notebook; presumably
    ``plotly.graph_objs`` and ``plotly.offline.iplot`` -- confirm and import
    them before calling.
    """
    m=pd.DataFrame({'Feature_Importance':feat_impt,'Features':y_labels})
    m=m.sort_values(by='Feature_Importance',ascending=False)

    # regex=False: these are literal substrings. The default of `regex` differs
    # between pandas versions, and "(" on its own is not a valid regex.
    for prefix in ("dom_function_","js_function_"):
        m['Features']=m['Features'].str.replace(prefix,"",regex=False)
    m['Features']=m['Features'].apply(lambda x: str(x).lstrip('.'))
    m['Features']=m['Features'].str.replace("(","",regex=False)

    data = [go.Bar(x=m.Feature_Importance.values,y=m.Features.values,text=np.round(m.Feature_Importance,4),
            textposition = 'outside',
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6,
            orientation='h'
        )]
    layout = go.Layout(autosize=False,
    width=width_s,
    height=height_s,
    xaxis=dict(title='Feature Importances',
        tickfont=dict(
            size=12,
            color='black'
        )),
    yaxis=dict(automargin=True))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [3]:
# First dataset, indexed by domain name.
df_train=pd.read_csv('../../Data/df_training_prev.csv',index_col='domain')

# List of known-malicious domains, used to label the second dataset below.
df_mal=pd.read_csv('../../Data/Malicious_domains.csv')
mal_domains=df_mal.queried_domain.values.tolist()

# Second dataset, also indexed by domain name.
df_test=pd.read_csv('../../Data/df_fin_prev.csv',index_col='domain')
# Target = 1 when the domain is in the malicious list, otherwise 0.
# Index.isin does a hashed membership test in one vectorised pass, instead of
# scanning the Python list once per row, and leaves the 'domain' index in place.
df_test['Target']=df_test.index.isin(mal_domains).astype('int64')

In [4]:
# Stack the two datasets row-wise and shuffle all rows in one step; the fixed
# seed keeps the row order (and every positional split made later) repeatable.
df_prev=pd.concat([df_test,df_train],axis=0,sort=False).sample(frac=1,random_state=10)

In [6]:
# One (correlation, column name) pair for every column except the label.
corr_df=pd.DataFrame([
    (df_prev[name].corr(df_prev['Target']),name)
    for name in df_prev.columns
    if name!='Target'
])
corr_df.columns=['Correlation_Value','Columns']
# Rank by strength of the relationship, regardless of sign.
corr_df['Correlation_Value']=corr_df['Correlation_Value'].abs()
corr_df=corr_df.sort_values(by='Correlation_Value',ascending=False)

In [7]:
# Show the 20 columns with the largest absolute correlation to Target
# (corr_df is already sorted in descending order).
corr_df.head(20)

Unnamed: 0,Correlation_Value,Columns
61,0.345751,url_char_-
57,0.05161,http_header_content-language_text/html
92,0.047706,url_length
84,0.040622,url_char_w
35,0.038754,link_count
91,0.035611,url_extensions
56,0.032799,http_header_content-encoding_gzip
41,0.03245,script_type_text/javascript
36,0.031223,link_href_out_of_domain
88,0.028537,url_extension_.com


#### Training - 20% of Combined Dataset1 and Dataset2 used. 

In [7]:
# First 20% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.2)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.999920589543231
Confusion Matrix for training data is:


Predicted   False  True  __all__
Actual                          
False      180330     1   180331
True           14  8547     8561
__all__    180344  8548   188892

Accuracy of the model for test data is: 0.9578556586418201
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      720976   623   721599
True        31220  2751    33971
__all__    752196  3374   755570

MCC: 0.24899540812403015


#### Training - 30% of Combined Dataset1 and Dataset2 used. 

In [9]:
# First 30% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.3)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998729432691696
Confusion Matrix for training data is:


Predicted   False   True  __all__
Actual                           
False      270540      3   270543
True           33  12762    12795
__all__    270573  12765   283338

Accuracy of the model for test data is: 0.9579503996224611
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      630829   558   631387
True        27242  2495    29737
__all__    658071  3053   661124

MCC: 0.25378806130001613


#### Training - 40% of Combined Dataset1 and Dataset2 used. 

In [10]:
# First 40% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.4)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998517671473647
Confusion Matrix for training data is:


Predicted   False   True  __all__
Actual                           
False      360800      8   360808
True           48  16928    16976
__all__    360848  16936   377784

Accuracy of the model for test data is: 0.9580908381832364
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      540583   539   541122
True        23210  2346    25556
__all__    563793  2885   566678

MCC: 0.2647635865835293


#### Training - 50% of Combined Dataset1 and Dataset2 used. 

In [11]:
# First 50% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.5)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9998221209535164
Confusion Matrix for training data is:


Predicted   False   True  __all__
Actual                           
False      451060     10   451070
True           74  21087    21161
__all__    451134  21097   472231

Accuracy of the model for test data is: 0.9579231350758421
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      450379   481   450860
True        19389  1982    21371
__all__    469768  2463   472231

MCC: 0.26455340746779377


#### Training - 60% of Combined Dataset1 and Dataset2 used. 

In [12]:
# First 60% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.6)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.999798827197857
Confusion Matrix for training data is:


Predicted   False   True  __all__
Actual                           
False      541155     16   541171
True           98  25408    25506
__all__    541253  25424   566677

Accuracy of the model for test data is: 0.9582381513294599
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      360310   449   360759
True        15328  1698    17026
__all__    375638  2147   377785

MCC: 0.2717910860314553


#### Training - 70% of Combined Dataset1 and Dataset2 used. 

In [13]:
# First 70% of the shuffled rows train the model; everything after is the test set.
train=df_prev.iloc[:int(len(df_prev)*0.7)]
test=df_prev.iloc[len(train):]

# Split each part into a feature matrix and a label vector.
train_X=train.drop(columns='Target').values
train_Y=train['Target'].values
test_X=test.drop(columns='Target').values
test_Y=test['Target'].values

# Standardise using statistics learned from the training rows only.
scaler=StandardScaler().fit(train_X)
scaled_X_train=scaler.transform(train_X)
scaled_X_test=scaler.transform(test_X)

# Feature names, in the same order as the columns of the matrices above.
columns=[name for name in df_prev.columns if name!='Target']

(model,cm_train,cm_test,acc_train,acc_test,
 roc_auc,mcc,prec,rec)=RandomForest(scaled_X_train,train_Y,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9997746259016854
Confusion Matrix for training data is:


Predicted   False   True  __all__
Actual                           
False      631322     20   631342
True          129  29652    29781
__all__    631451  29672   661123

Accuracy of the model for test data is: 0.958279657936253
Confusion Matrix for test data is:


Predicted   False  True  __all__
Actual                          
False      270256   332   270588
True        11489  1262    12751
__all__    281745  1594   283339

MCC: 0.27092660737062685


### OverSampling

#### Training - 20% of Combined Dataset1 and Dataset2 used. 
#### Oversampling ratio 1:1. Technique - SMOTE

In [17]:
# Same 20% / 80% positional split as the un-resampled baseline.
columns=df_prev.columns.tolist()
train=df_prev.iloc[:int(df_prev.shape[0]*0.2),:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

# Fit the scaler on the training rows only, then apply it to both splits.
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df_prev.iloc[int(df_prev.shape[0]*0.2):,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)
columns.remove('Target')

# Size of the larger class in the training split. Class 1 is oversampled up to
# this count (1:1); deriving it from the data replaces a hard-coded 180331 that
# was only valid for this exact split.
n_majority=int(train.Target.value_counts().max())

# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for the former `ratio` / `fit_sample`. Only the training data is resampled;
# the test set keeps its original class balance.
sm = SMOTE(random_state=12, sampling_strategy = {1:n_majority})
X_train, Y_train = sm.fit_resample(scaled_X_train,train_Y)

model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(X_train,Y_train,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9999695005295817
Confusion Matrix for training data is:


Predicted   False    True  __all__
Actual                            
False      180328       3   180331
True            8  180323   180331
__all__    180336  180326   360662

Accuracy of the model for test data is: 0.9563415699405746
Confusion Matrix for test data is:


Predicted   False   True  __all__
Actual                           
False      716912   4687   721599
True        28300   5671    33971
__all__    745212  10358   755570

MCC: 0.28591712980552103


#### Training - 20% of Combined Dataset1 and Dataset2 used. 
#### Oversampling ratio 1:1. Technique - ADASYN

In [18]:
# Reuses scaled_X_train / train_Y / scaled_X_test / test_Y / columns left in the
# kernel by the SMOTE cell above (same 20% training split).

# Size of the larger class in the training labels; class 1 is oversampled
# towards this count. Derived from the data instead of a hard-coded 180331.
n_majority=int(pd.Series(train_Y).value_counts().max())

# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for the former `ratio` / `fit_sample`. The `n_jobs` argument is omitted
# because recent imbalanced-learn releases deprecated and then removed it;
# it only affected speed, not the generated samples.
sm = ADASYN(random_state=12, sampling_strategy = {1:n_majority},n_neighbors=10)
X_train, Y_train = sm.fit_resample(scaled_X_train,train_Y)

model,cm_train,cm_test,acc_train,acc_test,roc_auc,mcc,prec,rec=RandomForest(X_train,Y_train,scaled_X_test,test_Y,columns)
print("MCC:",mcc)

[1mRandomForest Classifier[0m
Accuracy of the model for training data is: 0.9999695241894819
Confusion Matrix for training data is:


Predicted   False    True  __all__
Actual                            
False      180327       4   180331
True            7  180604   180611
__all__    180334  180608   360942

Accuracy of the model for test data is: 0.9556348187461122
Confusion Matrix for test data is:


Predicted   False   True  __all__
Actual                           
False      715758   5841   721599
True        27680   6291    33971
__all__    743438  12132   755570

MCC: 0.2919540852150978


In [3]:
### Original Creator : Darshan Bhansali
### HTML code to hide the input cells
# Renders a small script plus a link in this cell's output:
#  - when the document is ready, code_toggle() runs once and hides every
#    `div.input` element;
#  - each click on the "here" link calls code_toggle() again, flipping between
#    hidden and shown.
# NOTE(review): the script calls `$`, so it presumably relies on jQuery being
# provided by the notebook front end -- confirm for the viewer you target.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.''')