# DoH Deception - Build Models
## Author: [Emanuel Valente](https://www.linkedin.com/in/emanuelvalente/) - emanuel.valente@ifood.com.br

This notebook builds the main (target) models used in this research. Each model is stored as a joblib file named according to the following pattern:

```shell
model_name:
<model_algorithm>-<training-dataset>.joblib
```

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from joblib import dump

# Loading benign dataset

In [None]:
# Identifier of the training dataset; it is used as the <training-dataset>
# part of the saved model's file name.
dataset_name = "e-valente-customized"

# Input CSV files -- adjust both to match the dataset named above.
# NOTE(review): the features file is prefixed 'x-e-valente' while the labels
# file is prefixed 'y-evalente' (no hyphen after 'e') -- confirm both names
# match the files on disk.
X_features_filename = "x-e-valente-custom-normalized.csv"
y_labels_filename = "y-evalente-custom-normalized.csv"

# Loading DoH tunnel tool (dnstt) dataset

In [None]:
# Read the feature matrix from CSV, discard the 'Unnamed: 0' column
# (presumably the index written by an earlier DataFrame.to_csv -- confirm),
# and hand the result over as a NumPy array.
X = np.array(
    pd.read_csv(X_features_filename, sep=',').drop(columns=['Unnamed: 0'])
)

In [None]:
# Read the labels from CSV, discard the 'Unnamed: 0' column
# (presumably the index written by an earlier DataFrame.to_csv -- confirm),
# and hand the result over as a NumPy array.
y = np.array(
    pd.read_csv(y_labels_filename, sep=',').drop(columns=['Unnamed: 0'])
)

# Creating Train and Test Data

In [None]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

# Defining Model

In [None]:
def build_model(models):
    """Fit a classifier on the training split and evaluate it on the test split.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, so the train/test split cell must be run first.

    Besides returning its results, it prints the test accuracy and a
    classification report, and draws the confusion matrix (each cell divided
    by the total number of test samples) as a seaborn heatmap.

    Parameters
    ----------
    models : estimator
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place.

    Returns
    -------
    model
        The fitted estimator (the same object that was passed in).
    acc : float
        Accuracy on the test split, as a fraction (not a percentage).
    fpr, tpr, thresh
        The three arrays returned by ``roc_curve`` for the test split.
    """
    model = models
    model.fit(X_train, y_train.ravel())

    # Flatten the test labels once, mirroring what is done for y_train, so
    # every metric below receives a 1-D target vector.
    y_true = y_test.ravel()

    # Column 1 of predict_proba is used as the score of the positive class
    # (assumes the labels are 0/1 so that column 1 is class 1 -- TODO confirm).
    pred_prob = model.predict_proba(X_test)
    fpr, tpr, thresh = roc_curve(y_true, pred_prob[:, 1], pos_label=1)

    pred = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    acc = accuracy_score(y_true, pred)
    print('Test Accuracy : \033[32m \033[01m {:.5f}% \033[30m \033[0m'.format(acc*100))
    print(classification_report(y_true, pred, digits=4))

    # confusion_matrix puts true labels on rows and predictions on columns.
    cf_matrix = confusion_matrix(y_true, pred)
    ax = sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='0.2%')
    ax.set(
        xlabel='Predicted label',
        ylabel='True label',
        title='Confusion matrix (share of test samples)',
    )
    return model, acc, fpr, tpr, thresh

# Train model

In [None]:
# Directory that receives the serialized models ('./' is the current working directory).
model_output_dir = "./"

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Label used as the <model_algorithm> part of the saved model's file name.
model_name = 'GradientBoosting'

# Fit and evaluate the classifier; build_model prints the metrics and draws
# the confusion matrix as a side effect.
model_1, acc_DTC_1, fpr_1, tpr_1, thresh_1 = build_model(
    GradientBoostingClassifier(
        max_depth=12,
        random_state=0,
        verbose=True,
    )
)

In [None]:
# Save the fitted model as <model_algorithm>-<training-dataset>.joblib.
# os.path.join is used instead of '{}/{}' string formatting, which produced a
# doubled separator ('.//...') whenever model_output_dir already ends in '/'.
model_filename = '{}-{}.joblib'.format(model_name, dataset_name)
dump(model_1, os.path.join(model_output_dir, model_filename))