In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# from sklearn.metrics import classification_report
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2, f_classif

from pickle import dump
from pickle import load

<h4>VI. Test and train data</h4>

In [6]:
# Read the processed feature CSV into a DataFrame.
features_csv_path = 'C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/data/processed/dga_features_final.csv'
df_final = pd.read_csv(features_csv_path)

In [7]:
# Columns used as model inputs; the label column is 'isDGA'.
key_features = [
    'long_consonant_str',
    'unique_char_count',
    'entropy',
    'vowel_ratio',
    'unique_letter_count',
    'd_length',
    'consonant_ratio',
    'unique_digit_count',
    'ngrams',
]

X = df_final.loc[:, key_features]
y = df_final.loc[:, 'isDGA']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity check: report the size of the full frame and of each partition.
total_rows, total_cols = df_final.shape
train_rows, train_cols = X_train.shape
test_rows, test_cols = X_test.shape

print(f"The entire dataset has {total_rows} rows and {total_cols} columns.")
print(f"The training set has {train_rows} rows and {train_cols} columns.")
print(f"The testing set has {test_rows} rows and {test_cols} columns.")
print(f"The target training set has {len(y_train)} rows.")
print(f"The target testing set has {len(y_test)} rows.")

The entire dataset has 159995 rows and 19 columns.
The training set has 127996 rows and 9 columns.
The testing set has 31999 rows and 9 columns.
The target training set has 127996 rows.
The target testing set has 31999 rows.


<h4>A. Test with Decision Tree Classifier</h4>

In [13]:
%%time

# Baseline decision tree: default hyperparameters with a fixed seed,
# trained on the training split. %%time reports the cost of the fit.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

CPU times: total: 469 ms
Wall time: 640 ms


In [14]:
# Predict labels for the held-out test features with the baseline tree.
y_pred = dt_clf.predict(X_test)

In [12]:
%%time

# 5-fold cross-validated accuracy of the baseline tree, computed on the
# training split only; the per-fold scores are the cell's displayed value.
cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='accuracy')

CPU times: total: 1.86 s
Wall time: 2.97 s


array([0.86148437, 0.86108832, 0.86077581, 0.86257276, 0.86128364])

In [9]:
# Accuracy of the baseline decision tree on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
# Use an f-string format spec instead of mixing an (empty) f-prefix with
# %-formatting; the printed text is unchanged: a percentage with two decimals.
print(f"Accuracy of the model is: {accuracy * 100:.2f}%")

Accuracy of the model is: 85.83%


The cross-validation scores are computed on the training data and estimate how the model will perform on unseen data, while the accuracy score is the model's actual performance on the held-out test data.

In [15]:
%%time

hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [None, 5, 10, 15, 20],
               'min_samples_split': [2, 5, 10, 15, 20],
               'min_samples_leaf': [1, 2, 5, 10, 15]}

df_clf_gs = GridSearchCV(dt_clf, hyperparams, cv=5)
df_clf_gs.fit(X_train, y_train)

CPU times: total: 7min 33s
Wall time: 9min 9s


In [16]:
# Report the winning parameter combination and its mean cross-validated score.
best_dt_params = df_clf_gs.best_params_
best_dt_score = df_clf_gs.best_score_

print(f"Best hyperparameters: {best_dt_params}")
print(f"Best score: {best_dt_score}")

Best hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 15}
Best score: 0.8979733775611353


In [17]:
%%time

dt_clf_opt = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=5, min_samples_split=15, random_state=42)
dt_clf_opt.fit(X_train, y_train)

CPU times: total: 438 ms
Wall time: 577 ms


In [18]:
# Predict labels for the held-out test features with the tuned tree.
y_pred_opt = dt_clf_opt.predict(X_test)

In [19]:
# Test-set metrics for the tuned decision tree.
accuracy_opt = accuracy_score(y_test, y_pred_opt)
precision_opt = precision_score(y_test, y_pred_opt)
recall_opt = recall_score(y_test, y_pred_opt)
f1_opt = f1_score(y_test, y_pred_opt)

# One print call, one metric per line.
print(
    f"Accuracy: {accuracy_opt}",
    f"Precision: {precision_opt}",
    f"Recall: {recall_opt}",
    f"F1 Score: {f1_opt}",
    sep="\n",
)

Accuracy: 0.8994968592768524
Precision: 0.9606184364060677
Recall: 0.831145885916204
F1 Score: 0.8912043301759134


In [20]:
# Persist the tuned decision tree to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises; a bare open()
# passed straight to dump() is never explicitly closed.
with open('C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/models/dt_clf_opt.pkl', 'wb') as model_file:
    dump(dt_clf_opt, model_file)

<h4>B. Test with Random Forest Classifier</h4>

In [21]:
%%time

# Baseline random forest: default hyperparameters with a fixed seed,
# trained on the training split. %%time reports the cost of the fit.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

CPU times: total: 14.6 s
Wall time: 19 s


In [22]:
# Predict labels for the held-out test features with the baseline random forest.
y_pred_rf = rf.predict(X_test)

In [23]:
%%time

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_rf}")

Accuracy: 0.8706834588580893
Precision: 0.8714630123080828
Recall: 0.8667339727410399
F1 Score: 0.8690920594748497
CPU times: total: 15.6 ms
Wall time: 48.7 ms


In [None]:
%%time

hyperparams_rf = {'n_estimators': [50, 100, 150, 200],
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [None, 5, 10, 15, 20],
                    'min_samples_split': [2, 5, 10, 15, 20],
                    'min_samples_leaf': [1, 2, 5, 10, 15]}

rf_gs = GridSearchCV(rf, hyperparams_rf, cv=5)
rf_gs.fit(X_train, y_train)

<h4>C. Logistic Regression</h4>

<h4>D. K-Nearest Neighbor</h4>

<h4>E. Naïve Bayes</h4>

<h4>F. Support Vector Machine</h4>

<h4>G. AdaBoost Classifier</h4>

<h4>H. XGBoost Classifier</h4>