In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import f1_score, roc_curve, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import random

In [2]:
# NOTE(review): unpickling can execute arbitrary code -- only load data.pkl
# when it comes from a trusted source.
data = pd.read_pickle("./data.pkl")

# Split data into X (every column except 'bot') and Y (the 'bot' column).
# Y is selected with a column mask, so it stays 2-D; the axis=1 concatenation
# at the end of this cell relies on that.
X = data.loc[:, data.columns != 'bot']
Y = data.loc[:, data.columns == 'bot'].to_numpy()

# Split data into train and test sets
seed = 10
test_size = 0.33

# Bagging_classifier.fit draws its subsamples with the stdlib `random` module,
# which train_test_split's random_state does not control. Seed it here as well
# so a Restart & Run All reproduces the same subsamples.
random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Labels for this algorithm have to be either 1 or -1:
# values below 0.5 become -1, everything else becomes 1.
y_train = np.where(y_train < 0.5, -1, 1)
y_test = np.where(y_test < 0.5, -1, 1)

# Training features with the label appended as the last column.
# Not used by the cells visible here (Bagging_classifier.fit rebuilds it);
# kept in case later cells read it.
dataset_train = np.concatenate((X_train.to_numpy(), y_train),axis = 1)

In [3]:
class Bagging_classifier:
    """Ensemble of identical base classifiers combined by majority vote.

    Every base model is trained on its own random subsample of the training
    rows (drawn without replacement with ``random.sample``). Labels are
    expected to be -1 or +1, because ``predict`` combines the models by
    looking at the sign of the mean prediction.
    """

    # Values accepted for ``type_classifier``.
    _SUPPORTED_CLASSIFIERS = ("DecisionTree", "KNN", "SVM", "GaussianNB")

    def __init__(self, type_classifier, x=None, y=None, n_iterations = 20, ratio=0.1):
        """
        type_classifier: DecisionTree, KNN, SVM, GaussianNB
        x, y: unused; accepted only so existing callers keep working
        n_iterations: number of base models to train
        ratio: size of each subsample relative to the training set
        """
        self.number_iterations = n_iterations
        self.type_classifier = type_classifier

        self.ratio = ratio #Ratio Bootstrapped dataset/ original dataset

    def _make_classifier(self):
        """Return a fresh, unfitted base estimator for ``self.type_classifier``.

        Raises:
            ValueError: if ``self.type_classifier`` is not a supported name.
        """
        # Decision tree limited to a single split
        if self.type_classifier == "DecisionTree":
            return DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        # K-NN
        if self.type_classifier == "KNN":
            return KNeighborsClassifier(n_neighbors=3)
        # SVM (features are standardised first)
        if self.type_classifier == "SVM":
            return make_pipeline(StandardScaler(), SVC(gamma='auto'))
        # Gaussian Naive Bayes
        if self.type_classifier == "GaussianNB":
            return GaussianNB()
        raise ValueError(
            "Wrong classifier selection: %r (expected one of %s)"
            % (self.type_classifier, ", ".join(self._SUPPORTED_CLASSIFIERS))
        )

    def fit(self,X_train,y_train):
        """Train ``number_iterations`` base models, each on a random subsample.

        X_train: 2-D feature table (DataFrame or array), one row per sample.
        y_train: labels, one per row; either 1-D or a single-column 2-D array.

        The fitted models are stored in ``self.classifier_models``.
        Returns None.

        Raises:
            ValueError: for an unsupported ``type_classifier``, mismatched
                X/y lengths, or a ``ratio`` so small that a subsample would
                be empty.
        """
        # Fail before any work is done instead of leaving a half-built
        # ensemble that only breaks later inside predict().
        if self.type_classifier not in self._SUPPORTED_CLASSIFIERS:
            raise ValueError(
                "Wrong classifier selection: %r (expected one of %s)"
                % (self.type_classifier, ", ".join(self._SUPPORTED_CLASSIFIERS))
            )

        features = np.asarray(X_train)
        labels = np.asarray(y_train).reshape(-1)
        n_rows = features.shape[0]
        if labels.shape[0] != n_rows:
            raise ValueError(
                "X_train has %d rows but y_train has %d labels"
                % (n_rows, labels.shape[0])
            )

        sample_size = int(self.ratio * n_rows)
        if sample_size < 1:
            raise ValueError(
                "ratio=%r gives an empty subsample for %d training rows"
                % (self.ratio, n_rows)
            )

        # There will be as many classifier models as iterations.
        # Filled element by element so each estimator is stored as one object.
        models = np.empty(self.number_iterations, dtype=object)

        for classifier_iteration in range(self.number_iterations):
            # range(n_rows) starts at 0 so the first training row can be drawn too.
            rows = random.sample(range(n_rows), sample_size)

            # Index features and labels separately: no assumption about the
            # number of feature columns.
            X_train_undersampled = features[rows]
            y_train_undersampled = labels[rows].astype(int)

            classifier = self._make_classifier()
            models[classifier_iteration] = classifier.fit(X_train_undersampled, y_train_undersampled)

        # Publish only a completely trained ensemble.
        self.classifier_models = models
        return

    def predict(self,X_test):
        """Predict -1/+1 for every row of X_test by majority vote.

        The mean of the individual model predictions decides the label:
        negative mean -> -1, otherwise +1. An exact tie (mean 0, possible with
        an even number of models) therefore resolves to +1 rather than to 0.

        Returns an integer array with one label per row.
        """
        model_preds = np.array([model.predict(X_test) for model in self.classifier_models])
        mean_vote = np.mean(model_preds,axis = 0)
        y_test_pred = np.where(mean_vote >= 0, 1, -1)
        return y_test_pred.astype(int)
            
#         accuracy = accuracy_score(y_test, y_test_pred)
#         f1_score = f1_score(y_test, y_test_pred, average='weighted')



In [4]:
# Train a bagged ensemble of depth-1 decision trees on the training split.
BaggingClassifier = Bagging_classifier(type_classifier="DecisionTree")

BaggingClassifier.fit(X_train, y_train)

Bagging_pred = BaggingClassifier.predict(X_test)

# y_test is a single-column 2-D array; flatten it once so both metrics
# compare it with the 1-D predictions in the same way.
y_test_flat = y_test.ravel()
print('Accuracy with Bagging: %.4f' % (accuracy_score(y_test_flat, Bagging_pred)))
print('F1 Score with Bagging: %.4f' % (f1_score(y_test_flat, Bagging_pred, average='weighted')))


Accuracy with Bagging: 0.7995
F1 Score with Bagging: 0.7912
