# Paper 7

Kai-Cheng Yang, Onur Varol, Pik-Mai Hui, and Filippo Menczer. 2020. Scalable and generalizable social bot detection through data selection. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 34. 1096–1103.

In [1]:
import pandas as pd
import numpy as np
import math
import time

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import trigrams
from nltk.lm import MLE

In [2]:
df_bot = bot_accounts = pd.concat(
    [
        pd.read_csv('../data/set-1/social_spambots_1.csv'),
        pd.read_csv('../data/set-1/social_spambots_2.csv'),
        pd.read_csv('../data/set-1/social_spambots_3.csv')
    ]
).reset_index(drop=True)

df_naive = pd.read_csv('../data/set-1/geniune_accounts.csv')

In [3]:
def get_screen_name_likelihood(series):
    sequence = series.apply(lambda x: list(x.lower())).values.tolist()
    return get_likelihood_array(sequence)
    
def get_likelihood_array(sequence, n=3):
    train_data, padded_sent = padded_everygram_pipeline(n, sequence)
    mle = MLE(n)
    mle.fit(train_data, padded_sent)
    
    s = np.zeros((len(sequence),))
    for i, name in enumerate(sequence):
        tri = trigrams(pad_both_ends(name, n=3))
        total_score = 1
        count = 0
        for ele in tri:
            score = mle.score(ele[2], [ele[0], ele[1]])
            total_score *= score
            count += 1
        s[i] = total_score ** (1/count)
    return s

def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'profile_use_background_image',
        'verified',
    ]
    df_return = df[used_columns]
    age = (pd.to_datetime(df['updated']) - pd.to_datetime(df['created_at']).dt.tz_localize(None)) / np.timedelta64(1, 'Y')
    df_return['tweet_freq'] = df['statuses_count'] / age
    df_return['followers_growth_rate'] = df['followers_count'] / age
    df_return['friends_growth_rate'] = df['friends_count'] / age
    df_return['favourites_growth_rate'] = df['favourites_count'] / age
    df_return['listed_growth_rate'] = df['listed_count'] / age
    df_return['followers_friends_ratio'] = df['followers_count'] / np.maximum(df['friends_count'], 1)
    df_return['screen_name_length'] = df['screen_name'].str.len()
    df_return['num_digits_in_screen_name'] = df['screen_name'].str.count('\d')
    df_return['name_length'] = df['name'].str.len()
    df_return['num_digits_in_name'] = df['name'].str.count('\d')
    df_return['description_length'] = df['description'].str.len()
    df_return['screen_name_likelihood'] = get_screen_name_likelihood(df['screen_name'])
    return df_return.fillna(0.0)

In [4]:
df = pd.concat([df_bot, df_naive], ignore_index=True)
feature_time = time.time()
df_new = feature_engineering(df)
end_feature_time = time.time()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_return['tweet_freq'] = df['statuses_count'] / age
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_return['followers_growth_rate'] = df['followers_count'] / age
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_return['friends_growth_rate'] = df['friends_count'] / age
A value is trying to be se

In [5]:
df_new

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,default_profile,profile_use_background_image,verified,tweet_freq,followers_growth_rate,friends_growth_rate,favourites_growth_rate,listed_growth_rate,followers_friends_ratio,screen_name_length,num_digits_in_screen_name,name_length,num_digits_in_name,description_length,screen_name_likelihood
0,1299,22,40,1,0,1.0,1.0,0.0,185.678511,3.144671,5.717583,0.142940,0.000000,0.550000,9,2,14.0,0.0,0.0,0.137311
1,18665,12561,3442,16358,110,0.0,1.0,0.0,2703.128629,1819.126638,498.482118,2369.021061,15.930573,3.649332,12,0,14.0,0.0,134.0,0.098316
2,22987,600,755,14,6,0.0,1.0,0.0,3361.093391,87.730284,110.393940,2.047040,0.877303,0.794702,10,2,15.0,0.0,23.0,0.075496
3,7975,398,350,11,2,0.0,1.0,0.0,1196.196831,59.697347,52.497667,1.649927,0.299987,1.137143,14,0,18.0,0.0,149.0,0.119180
4,20218,413,405,162,8,0.0,1.0,0.0,3059.385303,62.495110,61.284551,24.513820,1.210559,1.019753,11,0,12.0,0.0,79.0,0.087726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8381,315,94,597,36,4,0.0,1.0,0.0,58.307409,17.399671,110.506423,6.663704,0.740412,0.157454,15,3,10.0,0.0,0.0,0.105170
8382,4099,5378,1238,471,6,0.0,1.0,0.0,2945.733332,3864.882620,889.684768,338.482654,4.311881,4.344103,14,0,15.0,0.0,74.0,0.053430
8383,199,18,136,6,0,1.0,1.0,0.0,171.318500,15.496146,117.081990,5.165382,0.000000,0.132353,8,1,7.0,0.0,0.0,0.071309
8384,2609,41,263,121,0,0.0,1.0,0.0,433.323459,6.809606,43.681131,20.096642,0.000000,0.155894,7,0,7.0,0.0,88.0,0.057030


In [6]:
X = df_new.values
y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)

In [7]:
start_time = time.time()
transformer = FunctionTransformer(np.log1p, validate=True)
X = transformer.transform(X)

decomposer = PCA(2)
X = decomposer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = KNeighborsClassifier(9)
model.fit(X_train, y_train)
end_time = time.time()

In [8]:
y_predict = model.predict(X_train)

In [9]:
print(classification_report(y_predict, y_train, digits=4))

              precision    recall  f1-score   support

         0.0     0.9803    0.9871    0.9837      3880
         1.0     0.9821    0.9728    0.9774      2828

    accuracy                         0.9811      6708
   macro avg     0.9812    0.9799    0.9806      6708
weighted avg     0.9811    0.9811    0.9811      6708



In [10]:
# ROC AUC score
roc_auc_score(y_predict, y_train)

0.9799428396447892

In [11]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(0.08451080322265625, 5.011690378189087)

In [12]:
y_test_predict = model.predict(X_test)

In [13]:
print(classification_report(y_test_predict, y_test, digits=4))

              precision    recall  f1-score   support

         0.0     0.9881    0.9890    0.9886      1004
         1.0     0.9837    0.9822    0.9829       674

    accuracy                         0.9863      1678
   macro avg     0.9859    0.9856    0.9857      1678
weighted avg     0.9863    0.9863    0.9863      1678



In [14]:
roc_auc_score(y_test_predict, y_test)

0.9856198351992623