In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import f1_score, roc_curve, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import random

In [2]:
# Split configuration, kept together so a reader can tune it in one place.
seed = 10
test_size = 0.33

# NOTE(review): unpickling executes arbitrary code; only load a data.pkl that
# comes from a trusted source.
data = pd.read_pickle("./data.pkl")

# Separate the 'bot' label column from every other (feature) column.
is_label_column = data.columns == 'bot'
X = data.loc[:, ~is_label_column]
# Selecting with a mask keeps Y two-dimensional, which is what allows it to be
# stacked next to the feature matrix below.
Y = data.loc[:, is_label_column].to_numpy()

# Hold out `test_size` of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Training features and label side by side in one array (label is the last
# column) so that rows can be subsampled together later on.
dataset_train = np.hstack((X_train.to_numpy(), y_train))

In [21]:
# Number of resampling rounds. NOTE: the fitted models are never combined into
# an ensemble; each round is scored on its own and the following cells report
# the mean of the per-round metrics.
num_classifiers = 20

# Size of each per-round training subset relative to the full training set.
# Rows are drawn WITHOUT replacement (random.sample), i.e. this is a plain
# subsample rather than a bootstrap.
ratio = 0.1

N = np.shape(dataset_train)[0]
subsample_size = int(ratio * N)

# Dedicated generator seeded with the same `seed` used for the train/test
# split, so the drawn subsets are repeatable after a kernel restart.
subsample_rng = random.Random(seed)

# The models are fitted on plain arrays, so predict on a plain array as well.
# Converting once up front avoids scikit-learn's feature-name mismatch warning
# and repeated conversions inside the loop.
X_test_values = X_test.to_numpy()

# Per-round metrics, one entry appended per resampling round.
decision_tree_accuracy = []
KNN_accuracy = []
SVM_accuracy = []
GNB_accuracy = []

decision_tree_f1_score = []
KNN_f1_score = []
SVM_f1_score = []
GNB_f1_score = []


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on (X_fit, y_fit) and score its predictions on X_eval.

    Returns a tuple (fitted_model, predictions, accuracy, weighted_f1), where
    accuracy and weighted_f1 are computed against y_eval.
    """
    fitted_model = model.fit(X_fit, y_fit)
    predictions = fitted_model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    weighted_f1 = f1_score(y_eval, predictions, average='weighted')
    return fitted_model, predictions, accuracy, weighted_f1


for classifier_iteration in range(num_classifiers):

    # range(N), not range(1, N): the first training row must be eligible too.
    row_indices = subsample_rng.sample(range(N), subsample_size)
    dataset_train_undersampled = dataset_train[row_indices, :]

    # The label is the last column of dataset_train; slicing relative to the
    # end keeps this correct whatever the number of feature columns is.
    X_train_undersampled = dataset_train_undersampled[:, :-1]
    y_train_undersampled = dataset_train_undersampled[:, -1].astype(int)

    ### Train different algorithms

    # Decision tree (a single-split stump)
    decision_tree = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
    decision_tree_model, decision_tree_pred, round_accuracy, round_f1 = _fit_and_score(
        decision_tree, X_train_undersampled, y_train_undersampled, X_test_values, y_test
    )
    decision_tree_accuracy.append(round_accuracy)
    decision_tree_f1_score.append(round_f1)

    # K-NN
    KNN = KNeighborsClassifier(n_neighbors=3)
    KNN_model, KNN_pred, round_accuracy, round_f1 = _fit_and_score(
        KNN, X_train_undersampled, y_train_undersampled, X_test_values, y_test
    )
    KNN_accuracy.append(round_accuracy)
    KNN_f1_score.append(round_f1)

    # SVM (features are standardized inside the pipeline before the SVC)
    SVM = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    SVM_model, SVM_pred, round_accuracy, round_f1 = _fit_and_score(
        SVM, X_train_undersampled, y_train_undersampled, X_test_values, y_test
    )
    SVM_accuracy.append(round_accuracy)
    SVM_f1_score.append(round_f1)

    # Gaussian naive Bayes
    GNB = GaussianNB()
    GNB_model, GNB_pred, round_accuracy, round_f1 = _fit_and_score(
        GNB, X_train_undersampled, y_train_undersampled, X_test_values, y_test
    )
    GNB_accuracy.append(round_accuracy)
    GNB_f1_score.append(round_f1)

In [22]:
# Mean accuracy over all resampling rounds, one line per model.
accuracy_report = [
    ("Decision tree accuracy: ", decision_tree_accuracy),
    ("KNN accuracy: ", KNN_accuracy),
    ("SVM accuracy: ", SVM_accuracy),
    ("Gaussian NB accuracy: ", GNB_accuracy),
]
for report_prefix, round_scores in accuracy_report:
    mean_score = sum(round_scores) / len(round_scores)
    print(report_prefix + str(mean_score))

Decision tree accuracy: 0.7987049028677149
KNN accuracy: 0.7015058073799981
SVM accuracy: 0.7684551341350601
Gaussian NB accuracy: 0.7095436324390996


In [23]:
# Mean weighted F1 score over all resampling rounds, one line per model.
f1_report = [
    ("Decision tree F1 score: ", decision_tree_f1_score),
    ("KNN F1 score: ", KNN_f1_score),
    ("SVM F1 score: ", SVM_f1_score),
    ("Gaussian NB F1 score: ", GNB_f1_score),
]
for report_prefix, round_scores in f1_report:
    mean_score = sum(round_scores) / len(round_scores)
    print(report_prefix + str(mean_score))

Decision tree F1 score: 0.7900923672862227
KNN F1 score: 0.6873391436392572
SVM F1 score: 0.7340907038358309
Gaussian NB F1 score: 0.6781254118200644
