In [1]:
import pandas as pd
import numpy as np
import math
import time


from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
df_bot = bot_accounts = pd.concat(
    [
        pd.read_csv('../data/set-1/social_spambots_1.csv'),
        pd.read_csv('../data/set-1/social_spambots_2.csv'),
        pd.read_csv('../data/set-1/social_spambots_3.csv')
    ]
).reset_index(drop=True)

df_naive = pd.read_csv('../data/set-1/geniune_accounts.csv')

In [3]:
def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'geo_enabled',
        'profile_use_background_image',
        'verified',
        'protected'
    ]
    df_return = df[used_columns].fillna(0.0)
    return df_return

In [4]:
df_bot = feature_engineering(df_bot)
df_naive = feature_engineering(df_naive)

In [5]:
X = pd.concat([df_bot, df_naive]).values
y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)

In [6]:
X.shape, y.shape

((8386, 10), (8386,))

In [7]:
X[:5]

array([[1.2990e+03, 2.2000e+01, 4.0000e+01, 1.0000e+00, 0.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00],
       [1.8665e+04, 1.2561e+04, 3.4420e+03, 1.6358e+04, 1.1000e+02,
        0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00],
       [2.2987e+04, 6.0000e+02, 7.5500e+02, 1.4000e+01, 6.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00],
       [7.9750e+03, 3.9800e+02, 3.5000e+02, 1.1000e+01, 2.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00],
       [2.0218e+04, 4.1300e+02, 4.0500e+02, 1.6200e+02, 8.0000e+00,
        0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00]])

In [8]:
y[:5]

array([0., 0., 0., 0., 0.])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
start_time = time.time()

smote = SMOTE()
smote_X, smote_y = smote.fit_resample(X_train, y_train)

e = EditedNearestNeighbours()
r_X, r_y = e.fit_resample(smote_X, smote_y)

a = AdaBoostClassifier(n_estimators=500, random_state=0)
a.fit(r_X, r_y)

end_time = time.time()

In [11]:
y_predict = a.predict(r_X)

In [12]:
print(classification_report(y_predict, r_y, digits=4))

              precision    recall  f1-score   support

         0.0     0.9908    0.9969    0.9938      3883
         1.0     0.9967    0.9901    0.9934      3632

    accuracy                         0.9936      7515
   macro avg     0.9937    0.9935    0.9936      7515
weighted avg     0.9936    0.9936    0.9936      7515



In [13]:
# ROC AUC score
roc_auc_score(y_predict, r_y)

0.993498855850817

In [14]:
# Training time
end_time - start_time

3.7763688564300537

In [15]:
y_test_predict = a.predict(X_test)

In [16]:
print(classification_report(y_test_predict, y_test, digits=4))

              precision    recall  f1-score   support

         0.0     0.9910    0.9813    0.9861      1015
         1.0     0.9718    0.9864    0.9790       663

    accuracy                         0.9833      1678
   macro avg     0.9814    0.9839    0.9826      1678
weighted avg     0.9834    0.9833    0.9833      1678



In [17]:
roc_auc_score(y_test_predict, y_test)

0.9838530637719279