In [1]:
import pandas as pd
import numpy as np
import math
import time

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
df_bot = bot_accounts = pd.concat(
    [
        pd.read_csv('../data/set-1/social_spambots_1.csv'),
        pd.read_csv('../data/set-1/social_spambots_2.csv'),
        pd.read_csv('../data/set-1/social_spambots_3.csv')
    ]
).reset_index(drop=True)

df_naive = pd.read_csv('../data/set-1/geniune_accounts.csv')

In [3]:
def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'geo_enabled',
        'profile_use_background_image',
        'default_profile',
        'default_profile_image',
        'profile_banner_url',
        'profile_text_color',
        'profile_sidebar_border_color',
        'profile_background_tile',
        'profile_sidebar_fill_color',
        'profile_background_color',
        'profile_link_color'
    ]
    df_return = df[used_columns]
    
    df_int = df_return.select_dtypes('int64')
    df_float = df_return.select_dtypes('float64')
    df_other = df_return.select_dtypes('object')
    
    df_float = 1 - df_float.isna()
    df_profile_banner_url = 1 - df_other['profile_banner_url'].isna()
    df_other = df_other.drop('profile_banner_url', axis=1)
    for i in df_other.columns:
        df_other[i] = df_other[i].apply(lambda x: int(x, 16) if isinstance(x, str) else 0)
    return pd.concat([df_int, df_float, df_profile_banner_url, df_other], axis=1)

In [4]:
df_new = feature_engineering(
    pd.concat([df_bot, df_naive])
)
df_new.head()

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,geo_enabled,profile_use_background_image,default_profile,default_profile_image,profile_background_tile,profile_banner_url,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_color,profile_link_color
0,1299,22,40,1,0,1,1,1,1,0,0,3355443,12639981,14544630,12639981,33972
1,18665,12561,3442,16358,110,1,1,0,0,1,1,3355443,16777215,15987699,15461355,10027008
2,22987,600,755,14,6,0,1,0,0,1,1,3355443,16777215,15724527,1250582,39321
3,7975,398,350,11,2,0,1,0,0,1,1,4080661,8559966,9160466,15074692,4966668
4,20218,413,405,162,8,1,1,0,0,0,1,14054213,1459997,14676207,15461355,0


In [5]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8386 entries, 0 to 3473
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   statuses_count                8386 non-null   int64
 1   followers_count               8386 non-null   int64
 2   friends_count                 8386 non-null   int64
 3   favourites_count              8386 non-null   int64
 4   listed_count                  8386 non-null   int64
 5   geo_enabled                   8386 non-null   int32
 6   profile_use_background_image  8386 non-null   int32
 7   default_profile               8386 non-null   int32
 8   default_profile_image         8386 non-null   int32
 9   profile_background_tile       8386 non-null   int32
 10  profile_banner_url            8386 non-null   int32
 11  profile_text_color            8386 non-null   int64
 12  profile_sidebar_border_color  8386 non-null   int64
 13  profile_sidebar_fill_color    838

In [6]:
# Select 2 feature sets as in Figure 2 of the paper
set_1 = [0, 6, 10, 11, 13, 14]
set_2 = [1, 3, 9, 13, 14, 15]
set_3 = [2, 4, 10, 13, 14, 15]
#00101000001000111
df_set_1 = df_new.iloc[:, set_1]
df_set_2 = df_new.iloc[:, set_2]
df_set_3 = df_new.iloc[:, set_3]

In [7]:
X = df_set_2.values
y = y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)

In [8]:
def svm_nn_algorithm(X_train, X_test, y_train, y_test):
    # SVM-NN implementation for only 1 subset feature
    svm = SVC()

    # Use SVM classification algorithm to Train the model using the training set,
    # and the identifying labels rLable
    svm.fit(X_train, y_train)
    
    # Predict the output using the SVM trained model,
    # and set the output decision-values to decisionV
    decisionV = svm.predict_proba(X_train)
    
# Currently pending

In [9]:
svm = SVC()
svm.fit(X, y)

SVC()

In [10]:
decision_values = svm.decision_function(X)

In [11]:
decision_values

array([ 1.00030271,  0.92518797, -1.00002731, ...,  1.00030284,
        1.29440755,  1.02525795])

In [12]:
import tensorflow as tf

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(7, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [14]:
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
    metrics=['accuracy'],
)

In [15]:
model.fit(
    decision_values,
    y,
    epochs=50,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="loss",
            patience=10,
        )
    ]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50


<tensorflow.python.keras.callbacks.History at 0x2211aedaa90>

In [16]:
model.predict(X_train)

NameError: name 'X_train' is not defined