In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_bot = bot_accounts = pd.concat(
    [
        pd.read_csv('../data/set-1/social_spambots_1.csv'),
        pd.read_csv('../data/set-1/social_spambots_2.csv'),
        pd.read_csv('../data/set-1/social_spambots_3.csv')
    ]
).reset_index(drop=True)

df_naive = pd.read_csv('../data/set-1/geniune_accounts.csv')

In [3]:
def tfidf(series, ngram_range):
    text = series.values.reshape(-1)
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=ngram_range, max_features=500)
    m = vectorizer.fit_transform(text).todense()
    vocab = vectorizer.get_feature_names()
    return m, vocab

def string_entropy(string):
    char_dict = {}
    for i in string:
        if i not in char_dict:
            char_dict[i] = 0
        char_dict[i] += 1
    char_count = np.array(list(char_dict.values()))
    char_count = char_count / char_count.sum()
    entropy = (-char_count * np.log2(char_count)).sum()
    return entropy

def feature_engineering(df):
    used_columns = [
        'screen_name'
    ]
    m, vocab = tfidf(df[used_columns], ngram_range=(3,3))
    df_return = pd.DataFrame(m, columns=vocab)
    df_return['entropy'] = df['screen_name'].apply(string_entropy)
    df_return['uppercase'] = df['screen_name'].str.count(r'[A-Z]') / df['screen_name'].str.len()
    df_return['lowercase'] = df['screen_name'].str.count(r'[a-z]') / df['screen_name'].str.len()
    return df_return

In [4]:
df = pd.concat([df_bot, df_naive], ignore_index=True)
feature_time = time.time()
df_new = feature_engineering(df)
end_feature_time = time.time()

In [5]:
df_new

Unnamed: 0,aaa,aba,abe,abi,abo,abr,aca,ace,ach,aci,...,ver,vib,vin,wil,win,yan,yle,entropy,uppercase,lowercase
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.725481,0.000000,0.777778
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.084963,0.166667,0.833333
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.121928,0.100000,0.700000
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.182006,0.142857,0.857143
4,0.0,0.0,0.0,0.0,0.0,0.721081,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.277613,0.181818,0.727273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.323231,0.000000,0.800000
8382,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.182006,0.214286,0.785714
8383,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.155639,0.000000,0.875000
8384,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.807355,0.000000,1.000000


In [6]:
X = df_new.values
y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
start_time = time.time()
model = LogisticRegression()
model.fit(X_train, y_train)
end_time = time.time()

In [9]:
y_predict = model.predict(X_train)

In [10]:
print(classification_report(y_predict, y_train, digits=4))

              precision    recall  f1-score   support

         0.0     0.9104    0.8053    0.8546      4417
         1.0     0.6930    0.8472    0.7624      2291

    accuracy                         0.8196      6708
   macro avg     0.8017    0.8263    0.8085      6708
weighted avg     0.8362    0.8196    0.8231      6708



In [11]:
# ROC AUC score
roc_auc_score(y_predict, y_train)

0.8262629989860018

In [12]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(0.5172610282897949, 0.3509805202484131)

In [13]:
y_test_predict = model.predict(X_test)

In [14]:
print(classification_report(y_test_predict, y_test, digits=4))

              precision    recall  f1-score   support

         0.0     0.8866    0.7782    0.8288      1145
         1.0     0.6226    0.7861    0.6949       533

    accuracy                         0.7807      1678
   macro avg     0.7546    0.7821    0.7618      1678
weighted avg     0.8027    0.7807    0.7863      1678



In [15]:
roc_auc_score(y_test_predict, y_test)

0.7821411307831586