In [1]:
import pandas as pd
import numpy as np
import math
import time

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

In [2]:
df_bot = bot_accounts = pd.concat(
    [
        pd.read_csv('../data/set-1/social_spambots_1.csv'),
        pd.read_csv('../data/set-1/social_spambots_2.csv'),
        pd.read_csv('../data/set-1/social_spambots_3.csv')
    ]
).reset_index(drop=True)

df_naive = pd.read_csv('../data/set-1/geniune_accounts.csv')

In [3]:
def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'geo_enabled',
        'profile_use_background_image',
        'verified',
        'protected'
    ]
    df_return = df[used_columns].fillna(0.0)
    return df_return

In [4]:
feature_time = time.time()
df_bot = feature_engineering(df_bot)
df_naive = feature_engineering(df_naive)
end_feature_time = time.time()

In [5]:
X = pd.concat([df_bot, df_naive], ignore_index=True).values
y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
start_time = time.time()

smote = SMOTE()
smote_X, smote_y = smote.fit_resample(X_train, y_train)

e = EditedNearestNeighbours()
r_X, r_y = e.fit_resample(smote_X, smote_y)

a = AdaBoostClassifier(n_estimators=500, random_state=0)
a.fit(r_X, r_y)

end_time = time.time()

In [8]:
y_predict = a.predict(r_X)

In [9]:
print(classification_report(y_predict, r_y, digits=4))

              precision    recall  f1-score   support

         0.0     0.9903    0.9964    0.9933      3883
         1.0     0.9961    0.9895    0.9928      3619

    accuracy                         0.9931      7502
   macro avg     0.9932    0.9929    0.9931      7502
weighted avg     0.9931    0.9931    0.9931      7502



In [10]:
# ROC AUC score
roc_auc_score(y_predict, r_y)

0.9929472010720881

In [11]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(3.1318323612213135, 0.010021209716796875)

In [12]:
y_test_predict = a.predict(X_test)

In [13]:
print(classification_report(y_test_predict, y_test, digits=4))

              precision    recall  f1-score   support

         0.0     0.9920    0.9765    0.9842      1021
         1.0     0.9643    0.9878    0.9759       657

    accuracy                         0.9809      1678
   macro avg     0.9782    0.9822    0.9801      1678
weighted avg     0.9812    0.9809    0.9810      1678



In [14]:
roc_auc_score(y_test_predict, y_test)

0.9821585367853464