In [18]:
!pip install logitboost;

Collecting logitboost
  Downloading logitboost-0.7-py3-none-any.whl (9.1 kB)
Installing collected packages: logitboost
Successfully installed logitboost-0.7


In [35]:
# imports
import pandas as pd
import numpy as np
import sklearn as sk
import time

from sklearn.ensemble import RandomForestClassifier
from logitboost import LogitBoost
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from sklearn.metrics import classification_report, roc_auc_score
# seed
np.random.seed(0)

In [3]:
# where data is located
!ls ./drive/MyDrive/Thesis\ Workspace/Notebooks/data/sb15-set2
DATA_PATH = "./drive/MyDrive/Thesis Workspace/Notebooks/data/sb15-set2/"
# load user data
df_bot = pd.read_csv(DATA_PATH + 'bot.csv')
df_naive = pd.read_csv(DATA_PATH + 'naive.csv')
print('Shapes', df_bot.shape, df_naive.shape)

bot.csv  naive.csv
Shapes (4912, 8) (3394, 8)


In [5]:
df = pd.concat([df_bot, df_naive], ignore_index=True)
label = y = np.concatenate(
    [
        np.zeros((df_bot.shape[0],)),
        np.ones((df_naive.shape[0],))
    ]
)
df.head()

Unnamed: 0,avg_characters,std_characters,avg_hashtags,avg_mentions,avg_urls,favorites_received,retweets_received,avg_tweet_same_time
0,98.02992,37.936346,0.377545,0.190623,0.719001,437,0.194941,1.005895
1,60.562248,46.763043,0.002794,0.003726,0.868053,23,0.008693,1.000932
2,94.849587,32.945352,0.155372,0.57686,0.343802,84,482.842975,1.003317
3,90.990078,25.0931,0.64031,0.017984,0.03845,1872,2.408992,1.0
4,91.341674,31.425261,0.114612,0.068273,0.494285,125,1.559778,1.001857


In [8]:
x_train, x_val, y_train, y_val = train_test_split(df, label, test_size=0.2)
print([x.shape for x in[x_train, x_val, y_train, y_val]])

[(6644, 8), (1662, 8), (6644,), (1662,)]


# Models
We will be implementing:
- Random Forest
- Logit Boost
- SVM Poly
- SVM RBF
- Two settings of Multilayer Perceptron (FC)

## Random Forest
- 10 trees with unlimited depth

In [16]:
random_forest_model = RandomForestClassifier(
    n_estimators = 10,
    max_depth = None,
    criterion = 'gini'
)
random_forest_model.fit(x_train,y_train)
pred = random_forest_model.predict(x_val)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))


              precision    recall  f1-score   support

         0.0     0.9869    0.9879    0.9874       994
         1.0     0.9820    0.9805    0.9813       668

    accuracy                         0.9850      1662
   macro avg     0.9845    0.9842    0.9844      1662
weighted avg     0.9850    0.9850    0.9850      1662

ROC acc:  0.9842332437740213


## Logit Boost
- Base estimator is decision stump

In [31]:
logit_boost_model = LogitBoost(
    base_estimator = None, #default None equals decision stump
    n_estimators=30, #just a number i choose
)
logit_boost_model.fit(x_train,y_train)
pred = logit_boost_model.predict(x_val.values)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))

              precision    recall  f1-score   support

         0.0     0.9829    0.9849    0.9839       993
         1.0     0.9775    0.9746    0.9760       669

    accuracy                         0.9807      1662
   macro avg     0.9802    0.9797    0.9800      1662
weighted avg     0.9807    0.9807    0.9807      1662

ROC acc:  0.9797415992666152


## SVM Poly
- Complexity param c = 1
- Paper state that data is to be normalized when used with SVM

In [30]:
x_train_norm, x_val_norm = [normalize(X, norm='l2') for X in [x_train, x_val]]

In [32]:
svm_poly_model = SVC(
    C = 1,
    kernel = 'poly',
    degree = 3
)
svm_poly_model.fit(x_train_norm,y_train)
pred = svm_poly_model.predict(x_val_norm)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))


              precision    recall  f1-score   support

         0.0     0.9839    0.9184    0.9500      1066
         1.0     0.8696    0.9732    0.9184       596

    accuracy                         0.9380      1662
   macro avg     0.9267    0.9458    0.9342      1662
weighted avg     0.9429    0.9380    0.9387      1662

ROC acc:  0.9457704269866654


## SVM RBF
- Similar

In [33]:
svm_rbf_model = SVC(
    C = 1,
    kernel = 'rbf',
)
svm_rbf_model.fit(x_train_norm,y_train)
pred = svm_rbf_model.predict(x_val_norm)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))

              precision    recall  f1-score   support

         0.0     0.9849    0.9116    0.9469      1075
         1.0     0.8576    0.9744    0.9123       587

    accuracy                         0.9338      1662
   macro avg     0.9212    0.9430    0.9296      1662
weighted avg     0.9399    0.9338    0.9346      1662

ROC acc:  0.9430371221425458


## Multilayer Perceptron 10
- Gradient Decent
- Normalization
- 500 train epoch
- Learn rate 0.3
- Momentum 0.2
- Hidden layer with 10 nodes
- Sigmoid activation
- Validation threshold 20

### NOTE:
validation threshold is the number of traing epochs that we allow the model to not improve, if error does not decrease within this validation threshold epochs, we terminate training. This param in sklearn is `n_iter_no_change`

In [38]:
mlp_10_model = MLPClassifier(
    hidden_layer_sizes=(10,), #means only one hidden layer with 10 nodes
    learning_rate='constant',
    learning_rate_init=0.3,
    max_iter=500,
    solver='sgd',
    activation='logistic', #this is sigmoid
    momentum=0.2,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20
)
mlp_10_model.fit(x_train_norm,y_train)
pred = mlp_10_model.predict(x_val_norm)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))

              precision    recall  f1-score   support

         0.0     0.9849    0.9280    0.9556      1056
         1.0     0.8861    0.9752    0.9285       606

    accuracy                         0.9452      1662
   macro avg     0.9355    0.9516    0.9421      1662
weighted avg     0.9489    0.9452    0.9457      1662

ROC acc:  0.9516389138913891


## Multilayer Perceptron 20
Same but hidden layer has 20 nodes

In [40]:
mlp_20_model = MLPClassifier(
    hidden_layer_sizes=(20,), #means only one hidden layer with 10 nodes
    learning_rate='constant',
    learning_rate_init=0.3,
    max_iter=500,
    solver='sgd',
    activation='logistic', #this is sigmoid
    momentum=0.2,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20
)
mlp_20_model.fit(x_train_norm,y_train)
pred = mlp_20_model.predict(x_val_norm)
print(classification_report(pred, y_val, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_val))

              precision    recall  f1-score   support

         0.0     0.9849    0.9307    0.9570      1053
         1.0     0.8906    0.9754    0.9310       609

    accuracy                         0.9471      1662
   macro avg     0.9377    0.9530    0.9440      1662
weighted avg     0.9503    0.9471    0.9475      1662

ROC acc:  0.953021861067838


## Conclude
Random 4Rest > LogiBoost > MLP20 > MLP10 > SVM-P > SVM-R