## Read the data

In [1]:
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns 
from numpy import std
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, KFold
from sklearn.preprocessing import StandardScaler
# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8' prefix in
# matplotlib 3.6 and the old names were removed afterwards, so try the new name
# first and fall back to the legacy one on older matplotlib versions.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')

# Load the pre-processed account table and show its (rows, columns) shape.
df = pd.read_excel('data/processed_data.xlsx')
df.shape

## Transform the data

In [2]:
# Raw count columns taken directly from the account metadata.
metadata_cols = [
    'statuses_count',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
]
meta_data = df[metadata_cols]

# Engineered columns: rates, ratios, and screen-name / name / description features.
derived_feature_cols = [
    'tweet_freq',
    'followers_growth_rate',
    'friends_growth_rate',
    'favourites_growth_rate',
    'listed_growth_rate',
    'follower_friend_ratio',
    'follower_favorites_ratio',
    'tweet_follower_ratio',
    'screen_name_length',
    'num_digits_in_screen_name',
    'name_length',
    'num_digits_in_name',
    'num_digits_end_screen_name',
    'description_length',
    'screen_name_likelihood',
]
derived_features = df[derived_feature_cols]

## Standardize the data

In [3]:
# NOTE(review): `sc` is not used by the visible cells; each feature group below
# is scaled with its own freshly created StandardScaler.
sc = StandardScaler()

# Zero-mean / unit-variance scaling, fitted on the full table (i.e. before any
# cross-validation split), then wrapped back into DataFrames with column names.
meta_data_standard = pd.DataFrame(
    StandardScaler().fit_transform(meta_data),
    columns=metadata_cols,
)
derived_features_standard = pd.DataFrame(
    StandardScaler().fit_transform(derived_features),
    columns=derived_feature_cols,
)

## Combine the scaled features

In [4]:
# Binary flags are left unscaled; `bot` is the label to predict.
binary_metadata_cols = ['has_default_profile', 'verified']
binary_metadata = df[binary_metadata_cols]
response = df['bot']

# Concatenate column-wise in the order: binary flags, scaled metadata, scaled
# derived features, label. Every piece gets a fresh 0..n-1 index first so rows
# are matched by position; pd.concat aligns on index labels, and the scaled
# frames already carry a positional index.
merge = pd.concat(
    [
        binary_metadata.reset_index(drop=True),
        meta_data_standard.reset_index(drop=True),
        derived_features_standard.reset_index(drop=True),
        response.reset_index(drop=True),
    ],
    axis=1,
)
merge.head()

Unnamed: 0,has_default_profile,verified,statuses_count,followers_count,friends_count,favourites_count,listed_count,tweet_freq,followers_growth_rate,friends_growth_rate,...,follower_favorites_ratio,tweet_follower_ratio,screen_name_length,num_digits_in_screen_name,name_length,num_digits_in_name,num_digits_end_screen_name,description_length,screen_name_likelihood,bot
0,0,0,-0.402257,-0.014926,0.542074,-0.477201,-0.008162,-0.1385,-0.017317,-0.007543,...,-0.015606,-0.018669,-0.690522,-0.458132,-0.292792,-0.165977,-0.441154,0.379984,-0.770636,False
1,0,0,-0.511078,-0.01061,1.697445,-0.510996,-0.055039,-0.146914,-0.000919,0.998811,...,-0.012485,-0.01839,0.479344,-0.458132,2.521351,-0.165977,-0.441154,1.502738,1.112019,False
2,0,0,-0.424411,-0.016429,-0.269437,-0.452793,-0.056551,-0.129266,-0.018716,-0.109824,...,-0.016164,-0.018827,-0.690522,1.968166,-1.277743,-0.165977,2.110246,-1.210585,-0.444243,False
3,0,0,-0.472891,-0.014573,0.790513,-0.519394,-0.061087,-0.149587,-0.016511,0.08823,...,-0.01457,-0.018672,1.64921,-0.458132,-0.574207,-0.165977,-0.441154,0.342559,1.164265,False
4,0,0,-0.453512,-0.016407,0.178441,-0.447583,-0.02782,-0.152851,-0.019099,-0.122647,...,-0.016164,-0.018934,0.869299,-0.458132,0.270036,-0.165977,-0.441154,0.829085,1.08145,False


## Train a random forest model

Perform a 10-fold cross validation

In [5]:
# 10-fold shuffled cross-validation with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=10, random_state = 1, shuffle = True)
# Seed the forest as well so repeated runs give the same scores.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="gini", random_state=1)
features = binary_metadata_cols + metadata_cols + derived_feature_cols
train_X = merge[features]
train_y = merge.bot

# Score all three metrics in a single pass so F1, precision and recall are
# measured on the same fitted model in every fold (and each fold is fitted once
# instead of three times).
cv_results = cross_validate(rf, train_X, train_y,
                            scoring=['f1', 'precision', 'recall'], cv=cv)
f1_scores = list(cv_results['test_f1'])
precision_scores = list(cv_results['test_precision'])
recall_scores = list(cv_results['test_recall'])

# Fixed-point formatting: slicing str(value) misreports numbers that Python
# prints in scientific notation.
print("Number of annotated bots: " + str(len(merge[merge['bot'] == True])))
print(f"f1 score: {mean(f1_scores):.3f} +/- {std(f1_scores):.3f}")
print(f"precision: {mean(precision_scores):.3f} +/- {std(precision_scores):.3f}")
print(f"recall: {mean(recall_scores):.3f} +/- {std(recall_scores):.3f}")

Number of annotated bots: 65
f1 score: 0.681 +/- 0.206
precision: 0.941 +/- 0.118
recall: 0.578 +/- 0.214


Predict continuous values the same way BotometerLite presents results

In [6]:
# Out-of-fold class probabilities from the same estimator and fold definition
# used for the cross-validated scores.
proba = cross_val_predict(rf, train_X, train_y, method='predict_proba', cv=cv)

# Keep the second probability column as the bot score.
# NOTE(review): assumes column 1 corresponds to bot == True - confirm via classes_.
df['sklearn_p'] = proba[:, 1].tolist()
df[['bot', 'sklearn_p']]

Unnamed: 0,bot,sklearn_p
0,False,0.00
1,False,0.01
2,False,0.00
3,False,0.00
4,False,0.00
...,...,...
7180,True,0.08
7181,True,0.71
7182,True,0.06
7183,True,0.36


In [None]:
def divide(a, b):
    """Return ``a / b``, or 0 when the denominator ``b`` equals zero."""
    return a / b if b != 0 else 0

def tune_threshold(data, k, score_col='sklearn_p'):
    """Return the F1 score obtained when scores >= ``k`` are classified as bot.

    Parameters
    ----------
    data : DataFrame with a score column and a ``bot`` label column.
    k : threshold; rows whose score is >= ``k`` are predicted to be bots.
    score_col : name of the score column (defaults to ``'sklearn_p'``).

    Returns
    -------
    The F1 score, or 0 when there are no true positives.
    """
    predicted_bot = data[score_col] >= k
    is_bot = data['bot'] == True
    is_not_bot = data['bot'] == False

    # Vectorised confusion-matrix counts; true negatives are not needed for F1.
    true_pos = int((predicted_bot & is_bot).sum())
    false_pos = int((predicted_bot & is_not_bot).sum())
    false_neg = int((~predicted_bot & is_bot).sum())

    # F1 = 2PR / (P + R) simplifies to 2TP / (2TP + FP + FN).
    denominator = 2 * true_pos + false_pos + false_neg
    if denominator == 0:
        return 0
    return 2 * true_pos / denominator

# Sweep thresholds 0.00 .. 0.99 and record the F1 score at each one. Building
# the frame from a list of records gives every row its own index label.
f1_scores = pd.DataFrame([{'k': k/100, 'f1': tune_threshold(data = df, k = k/100)}
                          for k in range(0, 100)])

# All thresholds that reach the best F1; on ties the lowest threshold is used.
best_f1_rows = f1_scores[f1_scores['f1'] == f1_scores['f1'].max()]
score_threshold = best_f1_rows['k'].iloc[0]

# Confusion-matrix counts at the chosen threshold.
benchmark = pd.DataFrame({'sklearn_prediction': df['sklearn_p'] >= score_threshold,
                          'bot': df['bot']})
true_pos = ((benchmark['sklearn_prediction'] == True) & (benchmark['bot'] == True)).sum()
false_pos = ((benchmark['sklearn_prediction'] == True) & (benchmark['bot'] == False)).sum()
true_neg = ((benchmark['sklearn_prediction'] == False) & (benchmark['bot'] == False)).sum()
false_neg = ((benchmark['sklearn_prediction'] == False) & (benchmark['bot'] == True)).sum()

# divide() returns 0 instead of failing when a denominator is zero.
precision = divide(true_pos, true_pos + false_pos)
recall = divide(true_pos, true_pos + false_neg)
f1 = 2*divide(precision*recall, precision+recall)
accuracy = (true_pos + true_neg)/len(benchmark)

print(str(best_f1_rows))
print("")
print("Precision - sklearn predicted bot " + str((benchmark['sklearn_prediction'] == True).sum()) + " times and was correct " + str(true_pos) + " times: " + f"{precision:.3f}")
print("Recall - sklearn predicted " + str(true_pos) + " out of the " + str((benchmark['bot'] == True).sum()) + " bots: " + f"{recall:.3f}")
print("")
# Accuracy is a fraction, so format it as a percentage to match the "%" sign.
print("sklearn's total accuracy was " + f"{accuracy:.2%}")
print("sklearn's F1 score was " + f"{f1:.3f}")

## BotometerLite Predictions

In [None]:
def divide(a, b):
    """Divide ``a`` by ``b``; a zero denominator yields 0 rather than an error."""
    if b != 0:
        return a / b
    return 0

def tune_threshold(data, k, score_col='bot_lite'):
    """Return the F1 score obtained when scores >= ``k`` are classified as bot.

    Parameters
    ----------
    data : DataFrame with a score column and a ``bot`` label column.
    k : threshold; rows whose score is >= ``k`` are predicted to be bots.
    score_col : name of the score column (defaults to ``'bot_lite'``).

    Returns
    -------
    The F1 score, or 0 when there are no true positives.
    """
    predicted_bot = data[score_col] >= k
    is_bot = data['bot'] == True
    is_not_bot = data['bot'] == False

    # Vectorised confusion-matrix counts; true negatives are not needed for F1.
    true_pos = int((predicted_bot & is_bot).sum())
    false_pos = int((predicted_bot & is_not_bot).sum())
    false_neg = int((~predicted_bot & is_bot).sum())

    # F1 = 2PR / (P + R) simplifies to 2TP / (2TP + FP + FN).
    denominator = 2 * true_pos + false_pos + false_neg
    if denominator == 0:
        return 0
    return 2 * true_pos / denominator

# Sweep thresholds 0.00 .. 0.99 and record the F1 score at each one. Building
# the frame from a list of records gives every row its own index label.
f1_scores = pd.DataFrame([{'k': k/100, 'f1': tune_threshold(data = df, k = k/100)}
                          for k in range(0, 100)])

# All thresholds that reach the best F1; on ties the lowest threshold is used,
# so the threshold is always a single number.
best_f1_rows = f1_scores[f1_scores['f1'] == f1_scores['f1'].max()]
bot_lite_threshold = best_f1_rows['k'].iloc[0]

# Confusion-matrix counts at the chosen threshold.
benchmark = pd.DataFrame({'bot_lite_prediction': df['bot_lite'] >= bot_lite_threshold,
                          'bot': df['bot']})
true_pos = ((benchmark['bot_lite_prediction'] == True) & (benchmark['bot'] == True)).sum()
false_pos = ((benchmark['bot_lite_prediction'] == True) & (benchmark['bot'] == False)).sum()
true_neg = ((benchmark['bot_lite_prediction'] == False) & (benchmark['bot'] == False)).sum()
false_neg = ((benchmark['bot_lite_prediction'] == False) & (benchmark['bot'] == True)).sum()

# divide() returns 0 instead of failing when a denominator is zero.
precision = divide(true_pos, true_pos + false_pos)
recall = divide(true_pos, true_pos + false_neg)
f1 = 2*divide(precision*recall, precision+recall)
accuracy = (true_pos + true_neg)/len(benchmark)

print(str(best_f1_rows))
print("")
print("Precision - BotometerLite predicted bot " + str((benchmark['bot_lite_prediction'] == True).sum()) + " times and was correct " + str(true_pos) + " times: " + f"{precision:.3f}")
print("Recall - BotometerLite predicted " + str(true_pos) + " out of the " + str((benchmark['bot'] == True).sum()) + " bots: " + f"{recall:.3f}")
print("")
# Accuracy is a fraction, so format it as a percentage to match the "%" sign.
print("BotometerLite's total accuracy was " + f"{accuracy:.2%}")
print("BotometerLite's F1 score was " + f"{f1:.3f}")

## Bonus material - verify that the precision and recall calculations are correct

In [None]:
features = binary_metadata_cols + metadata_cols + derived_feature_cols
# Seeded, stratified 80/20 split: the seed makes the cell reproducible and
# stratifying on the label keeps the bot share the same in both parts.
train, test = train_test_split(merge, test_size = 0.2, random_state = 1,
                               stratify = merge['bot'])
train_X = train[features]
train_y = train.bot
test_X = test[features] 
test_y = test.bot  
model=RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="gini", random_state=1)
model.fit(train_X,train_y)
prediction=model.predict(test_X)
# sklearn metrics take (y_true, y_pred) in that order.
print('The accuracy of the Random Forrest Model is',metrics.accuracy_score(test_y,prediction))

# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = metrics.confusion_matrix(test_y, prediction)
true_pos = cm[1,1]
true_neg = cm[0,0]
false_pos = cm[0,1]
false_neg = cm[1,0]

# Guard the ratios so an empty denominator gives 0.0 instead of NaN or an error.
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
precision = true_pos/predicted_pos if predicted_pos else 0.0
recall = true_pos/actual_pos if actual_pos else 0.0
f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) else 0.0
accuracy = (true_pos + true_neg)/len(test_y)

print("When we check precision of our RFM model against the test data set, " + str(len(test_y)) + " accounts")
print("")
print("Precision - We predicted fake follower " + str(predicted_pos) + " times and were correct " + str(true_pos) + " times: " + f"{precision:.3f}")
print("Recall - We predicted " + str(true_pos) + " out of the " + str(actual_pos) + " fake followers: " + f"{recall:.3f}")
print("")
print("Our total accuracy was " + f"{accuracy:.3f}")
print("Our F1 score was " + f"{f1:.3f}")

In [None]:
# Print sklearn's own precision; as a bare mid-cell expression its value was
# never displayed, so the cross-check could not be read off the output.
print("sklearn precision_score: " + str(metrics.precision_score(test_y, prediction)))
cm = metrics.confusion_matrix(test_y, prediction)


def _count_with_share(count, column_total):
    """Format a cell as 'count/column_total (share)'; share is nan for an empty column."""
    share = count/column_total if column_total else float('nan')
    return str(count) + "/" + str(column_total) + " (" + f"{share:.3f}" + ")"


# Each cell is shown relative to its predicted-class (column) total, so the
# Bot/Bot cell equals the model's precision.
predicted_not_bot = cm[0,0] + cm[1,0]
predicted_bot = cm[0,1] + cm[1,1]
true_neg = _count_with_share(cm[0,0], predicted_not_bot)
false_pos = _count_with_share(cm[0,1], predicted_bot)
false_neg = _count_with_share(cm[1,0], predicted_not_bot)
true_pos = _count_with_share(cm[1,1], predicted_bot)

# Rows are the true classes, columns the predicted classes; Support is the
# number of test rows in each true class.
conf_matrix = pd.DataFrame({'Not Bot': [true_neg, false_neg],
                            'Bot': [false_pos, true_pos],
                            'Support': [cm[0,0] + cm[0,1], cm[1,0] + cm[1,1]]},
                    index = ['Not Bot', 'Bot'])
conf_matrix

In [None]:
# Text summary from sklearn's classification_report for the hold-out
# predictions, given as (true labels, predicted labels).
print(metrics.classification_report(test_y,prediction))

In [None]:
# Importance of each feature in the fitted forest, labelled by column name and
# listed from most to least important.
(
    pd.Series(data=model.feature_importances_, index=train_X.columns)
    .sort_values(ascending=False)
)