In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from joypy import joyplot
sns.set_theme(style="whitegrid")

In [2]:
users = pd.read_csv("users2.csv")
users_cresci = pd.read_csv("users_cresci.csv")
users_total = users.append(users_cresci, ignore_index=True)

In [3]:
#Create baseline RF classifier with base features from Ferrara et al.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler


X = users_total[["dataset","label","statuses_count","followers_count","friends_count","favourites_count", \
            "listed_count","default_profile","geo_enabled","profile_use_background_image","verified","protected"]].fillna(0)

X["label"] = X["label"].apply(lambda x: 0 if x == 'human' else 1)

X[["statuses_count","followers_count","friends_count","favourites_count","listed_count"]] = X[["statuses_count","followers_count","friends_count","favourites_count","listed_count"]].apply(lambda x: np.log10(x+1))

X.reset_index()

X_indomain = X[X["dataset"].isin(["fake_followers.csv","genuine_accounts.csv","social_spambots_1.csv","social_spambots_2.csv",
                                  "social_spambots_3.csv","traditional_spambots_1.csv","traditional_spambots_2.csv","traditional_spambots_3.csv"])].copy()
X_train, X_test, y_train, y_test = train_test_split(X_indomain.drop(["dataset","label"], axis=1), X_indomain["label"], test_size=0.25, random_state=42)

transformer = MinMaxScaler().fit(X_train)

X_train = pd.DataFrame(transformer.transform(X_train),columns=["statuses_count","followers_count","friends_count","favourites_count", \
            "listed_count","default_profile","geo_enabled","profile_use_background_image","verified","protected"])
X_test = pd.DataFrame(transformer.transform(X_test),columns=["statuses_count","followers_count","friends_count","favourites_count", \
            "listed_count","default_profile","geo_enabled","profile_use_background_image","verified","protected"])


X_outdomain = X[~X["dataset"].isin(["fake_followers.csv","genuine_accounts.csv","social_spambots_1.csv","social_spambots_2.csv",
                                  "social_spambots_3.csv","traditional_spambots_1.csv","traditional_spambots_2.csv","traditional_spambots_3.csv"])].copy()


X_outdomain_scaled = pd.DataFrame(transformer.transform(X_outdomain.drop(["dataset","label"], axis=1)),columns=["statuses_count","followers_count","friends_count","favourites_count", \
            "listed_count","default_profile","geo_enabled","profile_use_background_image","verified","protected"])

In [4]:
from sklearn.calibration import CalibratedClassifierCV

clf_ = RandomForestClassifier(random_state=42)
clf = CalibratedClassifierCV(base_estimator=clf_)

clf.fit(X_train,y_train)


y_test_scored = clf.predict_proba(X_test)

auc = roc_auc_score(y_test,y_test_scored[:,1])
auc

0.9970051061649495

In [5]:
### AUC out of domain

y_test_scored_out = clf.predict_proba(X_outdomain_scaled)

auc_out = roc_auc_score(X_outdomain["label"],y_test_scored_out[:,1])
auc_out

0.7155416924773859

In [6]:
#Create clusters and voting scheme for RF classifiers with base features from Ferrara et al.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

blobs = pd.DataFrame(y_train,columns=["label"]).reset_index(drop=True)

df = pd.concat([X_train[["friends_count","followers_count"]], blobs],axis=1)
df_to_cluster = df#[df["label"] == 1]            
df_to_cluster.drop(columns=["label"])

for n_clusters in range(2,10):

    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(df_to_cluster)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(df_to_cluster, cluster_labels)
    print(f"n_clusters = {n_clusters}, silhouette_score = {silhouette_avg}")

n_clusters = 2, silhouette_score = 0.7981877340898921
n_clusters = 3, silhouette_score = 0.6247571681907987
n_clusters = 4, silhouette_score = 0.6416087998182678
n_clusters = 5, silhouette_score = 0.6814548406530508
n_clusters = 6, silhouette_score = 0.5687976809207187
n_clusters = 7, silhouette_score = 0.5575827261639363
n_clusters = 8, silhouette_score = 0.5509587297937667
n_clusters = 9, silhouette_score = 0.5517069959295546


In [7]:
#Looks like 2 clusters is the best

X_train["index"] = X_train.index

blobs = pd.DataFrame(y_train,columns=["label"]).reset_index(drop=True)

df = pd.concat([X_train[["friends_count","followers_count"]], blobs],axis=1)
#df_to_cluster_bot = df[df["label"] == 1]
#df_to_cluster_human = df[df["label"] == 0]
df_to_cluster = df.drop(columns=["label"])

clusterer = KMeans(n_clusters=2, random_state=42)

clusterer.fit(df_to_cluster)

X_train_cluster = pd.DataFrame(clusterer.predict(X_train[["friends_count","followers_count"]]),columns=["cluster"])

X_train_cluster_dict = dict()
clf_ = dict()


for current_cluster in range(0,2):

    X_train_cluster_dict[current_cluster] = X_train_cluster[X_train_cluster["cluster"] == current_cluster].copy()
    
    X_train_cluster_dict[current_cluster]["index"] = X_train_cluster_dict[current_cluster].index

    X_train_cluster_dict[current_cluster] = pd.merge(X_train, X_train_cluster_dict[current_cluster], on="index", how="inner")
    y_train_segment = np.take(y_train,X_train_cluster_dict[current_cluster]["index"],axis=0)

    clf_[current_cluster] = RandomForestClassifier(random_state=42)
    clf_[current_cluster].fit(X_train_cluster_dict[current_cluster].drop(columns=["index","cluster"]).copy(),y_train_segment)



In [8]:
#scoring and combining the scores
scores = dict()
for current_cluster in range(0,2):
    scores[current_cluster] = clf_[current_cluster].predict_proba(X_test)
    
    
avg_score = scores[1]
auc = roc_auc_score(y_test,avg_score[:,1])
auc

0.9912686615578183

In [9]:
### AUC out of domain
scores_out = dict()
for current_cluster in range(0,2):
    scores_out[current_cluster] = clf_[current_cluster].predict_proba(X_outdomain_scaled)
    

avg_score = scores_out[0]
auc_out = roc_auc_score(X_outdomain["label"],avg_score[:,1])
auc_out

0.7711316479126705