In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,auc,roc_curve,recall_score,precision_score,matthews_corrcoef
from pandas_ml import ConfusionMatrix
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display,clear_output,HTML

### Features we Identified

#### On Training data using the best model

In [2]:
# Evaluate the previously saved model on the "features we identified" data set.
# The CSV is indexed by domain name; `Target` is the label column.
df=pd.read_csv('../Data/df_training.csv',index_col='domain')
columns=df.columns.tolist()
columns.remove('Target')  # keep only the feature names

# Positional split: the first TRAIN_ROWS rows are the training part, the rest is the test part.
# NOTE(review): presumably this matches the split the pickled model was trained with
# (file name says 80/20) - confirm.
TRAIN_ROWS=31346

train=df.iloc[:TRAIN_ROWS,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

# Fit the scaler on the training rows only and reuse it for every other split,
# so no statistics from held-out data leak into the scaling.
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df.iloc[TRAIN_ROWS:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)

# SECURITY: pickle.load can execute arbitrary code; only load model files from a trusted source.
# The context manager guarantees the file handle is closed once the model is loaded.
with open("RF_80_20.sav",'rb') as model_file:
    rf = pickle.load(model_file)

# --- training split ---
# Predict once and reuse the result for both the accuracy and the confusion matrix.
y_pred_train=rf.predict(scaled_X_train)
print("Accuracy of the model for training data is:",accuracy_score(train_Y,y_pred_train))
print("Confusion Matrix for training data is:")
cm_train=ConfusionMatrix(train_Y,y_pred_train)
display(cm_train)

# --- test split ---
y_pred_test=rf.predict(scaled_X_test)
acc_test=accuracy_score(test_Y,y_pred_test)
print("Accuracy of the model for test data is:",acc_test)
print("Confusion Matrix for test data is:")
cm_test=ConfusionMatrix(test_Y,y_pred_test)
display(cm_test)
# NOTE(review): the ROC curve is built from hard 0/1 predictions, so it has a single
# operating point; passing rf.predict_proba(...)[:, 1] would give a threshold-free AUC.
# Left unchanged so the figure stays comparable with the other sections of this notebook.
fpr, tpr, threshold = roc_curve(test_Y, y_pred_test)
roc_auc =auc(fpr, tpr)
print("ROC_AUC:",roc_auc)
mcc=matthews_corrcoef(test_Y,y_pred_test)
print("MCC:",mcc)

Accuracy of the model for training data is: 0.9873987111593185
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27731    74    27805
True         321  3220     3541
__all__    28052  3294    31346

Accuracy of the model for test data is: 0.9790736251116499
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6893    44     6937
True         120   780      900
__all__     7013   824     7837

ROC_AUC: 0.9301619335928115
MCC: 0.89423887926436


#### On Validation Data

In [3]:
# Validation feature table, indexed by domain name.
df_val=pd.read_csv('../Data/df_fin.csv',index_col='domain')

# Domains known to be malicious; these define the positive class.
df_mal=pd.read_csv('../Data/Malicious_domains.csv')
mal_domains=df_mal.queried_domain.values.tolist()

# Label every validation domain: 1 if it appears in the malicious list, else 0.
# Index.isin hashes the lookup values once, instead of scanning the Python list
# linearly for every row as a per-row `x in mal_domains` would.
df_val['Target']=df_val.index.isin(mal_domains).astype('int64')

In [5]:
# Split the validation frame into labels and features, then scale the features
# with the scaler that was already fitted on the training rows (no re-fitting here).
val_Y=df_val['Target'].values
val_X=df_val.loc[:,df_val.columns!='Target'].values
scaled_X_val=scaler.transform(val_X)

In [15]:
# Per-class probabilities for the validation set (one column per class);
# column 1 is the one written to the predictions CSV further down.
y_pred_val_prob=rf.predict_proba(scaled_X_val)

In [6]:
# --- compute every validation metric from the hard predictions ---
y_pred_val=rf.predict(scaled_X_val)
acc_val=accuracy_score(val_Y,y_pred_val)
cm_val=ConfusionMatrix(val_Y,y_pred_val)
# NOTE(review): roc_curve receives 0/1 labels rather than scores, so the curve has a
# single operating point - worth confirming this is intended.
fpr, tpr, threshold = roc_curve(val_Y, y_pred_val)
roc_auc=auc(fpr,tpr)
mcc=matthews_corrcoef(val_Y,y_pred_val)

# --- report, in the same order as the other evaluation cells ---
print("Accuracy of the model for val data is:",acc_val)
print("Confusion Matrix for val data is:")
display(cm_val)
print("ROC_AUC:",roc_auc)
print("MCC:",mcc)

Accuracy of the model for val data is: 0.4405540639628102
Confusion Matrix for val data is:


Predicted   False    True  __all__
Actual                            
False      376379  484533   860912
True        18264   19565    37829
__all__    394643  504098   898741

ROC_AUC: 0.47719109062900944
MCC: -0.01845723481003823


In [20]:
# Export one row per validation domain: domain, predicted label, probability from
# column 1 of predict_proba, and the true label. The pieces are aligned by position.
prediction_frames=[
    pd.DataFrame(df_val.index),
    pd.DataFrame(y_pred_val),
    pd.DataFrame(y_pred_val_prob[:,1]),
    pd.DataFrame(df_val.Target.values),
]
pd.concat(prediction_frames,axis=1).to_csv('Predictions_with_training_features_we_identified.csv')

### Features previously used
#### On Training data using the best model

In [23]:
# Training table built from the previously used feature set, indexed by domain name.
# This rebinds `df`, replacing the frame loaded in the first section.
df=pd.read_csv('../Data/df_training_prev.csv',index_col='domain')
# All column names, label included.
columns=list(df.columns)

In [24]:
# Positional split: the first TRAIN_ROWS rows are the training part, the rest is the test part.
# Same row count as the split used for the other feature set in this notebook.
TRAIN_ROWS=31346

train=df.iloc[:TRAIN_ROWS,:]
train_X=train.iloc[:,train.columns!='Target'].values
train_Y=train.Target.values

# A fresh scaler for this feature set (rebinds `scaler`); fitted on the training rows only
# and reused for the test and validation data.
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(train_X)

test=df.iloc[TRAIN_ROWS:,:]
test_X=test.iloc[:,test.columns!='Target'].values
test_Y=test.Target.values
scaled_X_test=scaler.transform(test_X)

# Keep only the feature names. Filtering instead of list.remove() keeps this cell
# re-runnable: `columns` is created in an earlier cell, and a second remove() would raise
# ValueError because 'Target' is already gone.
columns=[name for name in columns if name!='Target']

In [40]:
# Train a bagging ensemble on the previously used features.
# Despite the name, `rf_random` is a BaggingClassifier (base estimator left at the
# scikit-learn default): 100 estimators, each seeing 70% of the features and a bootstrap
# sample the size of the training set; random_state is fixed for reproducibility.
rf_random=BaggingClassifier(bootstrap=True,max_features=0.7,max_samples=1.0,n_estimators=100,random_state=0)
rf_random.fit(scaled_X_train,train_Y)

# --- training split ---
# Predict once and reuse the result for both the accuracy and the confusion matrix.
y_pred_train=rf_random.predict(scaled_X_train)
print("Accuracy of the model for training data is:",accuracy_score(train_Y,y_pred_train))
print("Confusion Matrix for training data is:")
cm_train=ConfusionMatrix(train_Y,y_pred_train)
display(cm_train)

# --- test split ---
y_pred_test=rf_random.predict(scaled_X_test)
acc_test=accuracy_score(test_Y,y_pred_test)
print("Accuracy of the model for test data is:",acc_test)
print("Confusion Matrix for test data is:")
cm_test=ConfusionMatrix(test_Y,y_pred_test)
display(cm_test)
# NOTE(review): the ROC curve is built from hard 0/1 predictions, so it has a single
# operating point; rf_random.predict_proba(...)[:, 1] would give a threshold-free AUC.
# Left unchanged so the figure stays comparable with the other sections of this notebook.
fpr, tpr, threshold = roc_curve(test_Y, y_pred_test)
roc_auc =auc(fpr, tpr)
print("ROC_AUC:",roc_auc)
mcc=matthews_corrcoef(test_Y,y_pred_test)
print("MCC:",mcc)

Accuracy of the model for training data is: 1.0
Confusion Matrix for training data is:


Predicted  False  True  __all__
Actual                         
False      27805     0    27805
True           0  3541     3541
__all__    27805  3541    31346

Accuracy of the model for test data is: 0.9809876228148526
Confusion Matrix for test data is:


Predicted  False  True  __all__
Actual                         
False       6903    34     6937
True         115   785      900
__all__     7018   819     7837

ROC_AUC: 0.9336604840388897
MCC: 0.9039373157680968


#### On Validation Data

In [41]:
# Validation table for the previously used feature set; the label column `Target`
# is read straight from the file here.
df_val=pd.read_csv('../Data/df_fin_prev.csv',index_col='domain')

# Labels, then features (everything except the label) scaled with the scaler that
# was fitted on this feature set's training rows.
val_Y=df_val['Target'].values
val_X=df_val.loc[:,df_val.columns!='Target'].values
scaled_X_val=scaler.transform(val_X)

In [42]:
# --- predictions: per-class probabilities (exported below) and hard labels ---
y_pred_val_prob=rf_random.predict_proba(scaled_X_val)
y_pred_val=rf_random.predict(scaled_X_val)

# --- metrics, all derived from the hard labels ---
acc_val=accuracy_score(val_Y,y_pred_val)
cm_val=ConfusionMatrix(val_Y,y_pred_val)
# NOTE(review): roc_curve receives 0/1 labels rather than y_pred_val_prob, so the curve
# has a single operating point - worth confirming this is intended.
fpr, tpr, threshold = roc_curve(val_Y, y_pred_val)
roc_auc=auc(fpr,tpr)
mcc=matthews_corrcoef(val_Y,y_pred_val)

# --- report ---
print("Accuracy of the model for val data is:",acc_val)
print("Confusion Matrix for val data is:")
display(cm_val)
print("ROC_AUC:",roc_auc)
print("MCC:",mcc)

Accuracy of the model for val data is: 0.920307441131408
Confusion Matrix for val data is:


Predicted   False   True  __all__
Actual                           
False      831579  35609   867188
True        36535   1556    38091
__all__    868114  37165   905279

ROC_AUC: 0.4998934687784668
MCC: -0.00021558536269955576


In [43]:
# Export one row per validation domain: domain, predicted label, probability from
# column 1 of predict_proba, and the true label. The pieces are aligned by position.
prediction_frames=[
    pd.DataFrame(df_val.index),
    pd.DataFrame(y_pred_val),
    pd.DataFrame(y_pred_val_prob[:,1]),
    pd.DataFrame(df_val.Target.values),
]
pd.concat(prediction_frames,axis=1).to_csv('Predictions_with_training_prev_used_features.csv')

In [45]:
### Original Creator : Darshan Bhansali
# Markup injected into the notebook page: the script hides every `div.input` element
# once the document is ready, and the link underneath flips them between hidden and shown.
# NOTE(review): relies on `$` being available on the page (presumably jQuery from the
# classic notebook front end) - confirm for other front ends.
toggle_markup='''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To toggle code, click <a href="javascript:code_toggle()">here</a>.'''
# Last expression of the cell, so the HTML object is rendered as the cell output.
HTML(toggle_markup)