# SVM

In [1]:
# CONTEXT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# IMPORTING THE UTILS FOR MANIPULATING THE DATASET
import sys
sys.path.append('../Classifiers/')
from dataset_utils import *

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Import the real-user and fake-user datasets and merge them into the single dataset we'll use for the classification

In [4]:
# LOAD THE BALANCED REAL AND FAKE ACCOUNT TABLES
df_real = pd.read_csv('../../data/balanced_real_data.csv')
df_fake = pd.read_csv('../../data/balanced_fake_data.csv')

# STACK BOTH TABLES (REAL FIRST, THEN FAKE) UNDER A FRESH 0..N-1 INDEX AND
# PASS THE RESULT THROUGH THE PROJECT HELPER fix_private_entries
# NOTE(review): fix_private_entries comes from dataset_utils and is not visible
# here; presumably it normalises the rows of private accounts - confirm.
dataset = fix_private_entries(
    pd.concat([df_real, df_fake], ignore_index=True)
)

In [5]:
# PREVIEW THE FIRST 10 ROWS OF THE MERGED DATASET
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was saved into the CSV files - confirm get_trainset does not keep it as
# a feature.
dataset.head(n=10)

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,True,45,2979,846,0.283988,278,False,False,False,False,...,4.964205,24.64333,1.759163,1688451.0,7930005.0,75731.0,1964306.0,3858496000000.0,1.947676,True
1,True,131,966,5172,5.354037,150,False,False,True,False,...,0.472582,0.2233333,2.951336,197835.2,851011.0,3855.0,222204.1,49374670000.0,1.785389,True
2,True,128,1662,1475,0.887485,403,False,False,True,False,...,5.961543,35.54,1.546427,295033.8,1128895.0,9212.0,241783.2,58459110000.0,1.726161,True
3,True,0,236,111,0.470339,43,True,False,False,False,...,,,,,,,,,,True
4,True,16,386,967516,2506.518135,34,False,True,True,False,...,1180.090381,1392613.0,0.319856,2418942.0,12726352.0,36.0,2814164.0,7919517000000.0,2.273966,True
5,True,103,2544,1652183,649.443003,9670,False,True,False,False,...,49.513466,2451.583,1.172048,79489.5,287585.0,5895.0,73820.74,5449502000.0,1.315021,True
6,True,150,410,70481,171.904878,2459,False,False,True,False,...,82.691253,6837.843,0.989691,374287.8,827086.0,86472.0,201948.1,40783030000.0,0.286078,True
7,True,15,1269,1311,1.033097,24,False,False,True,False,...,1.742479,3.036232,0.746906,4613034.0,24784763.0,433680.0,5338513.0,28499720000000.0,2.462336,True
8,True,0,755,2959,3.919205,225,False,False,True,False,...,4.325506,18.71,1.069453,100300.2,413788.0,16256.0,87685.68,7688778000.0,2.289149,True
9,True,116,336,1693459,5040.056548,4214,False,True,True,False,...,153.991472,23713.37,2.85183,59100.04,278030.0,1313.0,71921.36,5172681000.0,2.133841,True


In [6]:
# EXTRACT THE FEATURES (trainset) AND THE LABELS (targets) FROM THE DATASET
# Both are produced by dataset_utils helpers and are used below as X and y.
trainset, targets = get_trainset(dataset), get_target_dataset(dataset)

# Classification without the Statistics from the Media of the Users 

In [7]:
# REMOVE THE STATISTICS COMPUTED ON THE USERS' CONTENTS FROM THE FEATURES
# (drop_stats is a dataset_utils helper; the original trainset name is kept
# untouched so later sections can still start from the full feature set)
trainset_without_stats = drop_stats(trainset)

In [8]:
# SCALE THE FEATURES WITH THE PROJECT HELPER StandardScale_dataset
# NOTE(review): scaling is applied before the train/test split and before
# cross-validation; if the helper fits its scaler on all rows, the held-out
# rows influence the scaling (data leakage) - confirm.
trainset_without_stats = StandardScale_dataset(trainset_without_stats)

In [9]:
# TRAINING THE SVM (SVC) WITHOUT STATISTICS OF THE USERS CONTENTS
x_train, x_test, y_train, y_test = train_test_split(
    trainset_without_stats,
    targets,
    test_size=0.2,       # hold out 20% of the rows for evaluation
    random_state=12345,  # fixed seed so the split is reproducible
)

# FIT THE CLASSIFIER ON THE TRAINING PART AND PREDICT THE HELD-OUT PART
svc = SVC(gamma='scale').fit(x_train, y_train)  # fit() returns the estimator
predictions = svc.predict(x_test)

# Confusion Matrix 

In [10]:
# CONFUSION MATRIX ON THE HELD-OUT SPLIT
# Rows are the true classes and columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))


[[1316  245]
 [ 203 1298]]


# Classification Report

In [11]:
# PER-CLASS PRECISION / RECALL / F1 ON THE HELD-OUT SPLIT
print(classification_report(y_true=y_test, y_pred=predictions))


              precision    recall  f1-score   support

       False       0.87      0.84      0.85      1561
        True       0.84      0.86      0.85      1501

    accuracy                           0.85      3062
   macro avg       0.85      0.85      0.85      3062
weighted avg       0.85      0.85      0.85      3062



## CrossValidation with 10-Fold

In [12]:
# 10-FOLD CROSS VALIDATION OF A FRESH, UNFITTED SVC ON THE FEATURES WITHOUT STATS
svc_cross = SVC(gamma='scale')
scores = cross_val_score(
    estimator=svc_cross,
    X=trainset_without_stats,
    y=targets,
    cv=10,     # number of folds
    n_jobs=4,  # evaluate the folds with 4 parallel workers
)
# REPORT THE MEAN OF THE PER-FOLD SCORES
print('Accuracy with 10-Fold CrossValidation:  ', scores.mean(), sep='')

Accuracy with 10-Fold CrossValidation:  0.8453740678168569


# Classification with only Public Profiles (with Media Stats)

In [13]:
# CLASSIFICATION OF ONLY PUBLIC PROFILES
# Build the dataset with the project helper drop_NaN_entries.
# NOTE(review): presumably this drops the rows whose media statistics are NaN
# (such as the private account in the preview above) - confirm.
dataset_publics = drop_NaN_entries(dataset)

# FEATURES AND LABELS OF THE REDUCED DATASET
trainset_publics = get_trainset(dataset_publics)
targets_publics = get_target_dataset(dataset_publics)

In [14]:
# SCALE THE PUBLIC-PROFILE FEATURES WITH THE PROJECT HELPER StandardScale_dataset
# NOTE(review): scaling is applied before the train/test split and before
# cross-validation; if the helper fits its scaler on all rows, the held-out
# rows influence the scaling (data leakage) - confirm.
trainset_publics = StandardScale_dataset(trainset_publics)

In [15]:
# TRAINING THE SVM (SVC) ON THE PUBLIC PROFILES ONLY
# The split variables and svc/predictions from the previous section are overwritten.
x_train, x_test, y_train, y_test = train_test_split(
    trainset_publics,
    targets_publics,
    test_size=0.2,       # hold out 20% of the rows for evaluation
    random_state=12345,  # fixed seed so the split is reproducible
)

# FIT THE CLASSIFIER ON THE TRAINING PART AND PREDICT THE HELD-OUT PART
svc = SVC(gamma='scale').fit(x_train, y_train)  # fit() returns the estimator
predictions = svc.predict(x_test)

# Confusion Matrix 

In [16]:
# CONFUSION MATRIX ON THE HELD-OUT SPLIT (PUBLIC PROFILES)
# Rows are the true classes and columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[ 991  162]
 [ 130 1009]]


# Classification Report

In [17]:
# PER-CLASS PRECISION / RECALL / F1 ON THE HELD-OUT SPLIT (PUBLIC PROFILES)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

       False       0.88      0.86      0.87      1153
        True       0.86      0.89      0.87      1139

    accuracy                           0.87      2292
   macro avg       0.87      0.87      0.87      2292
weighted avg       0.87      0.87      0.87      2292



## CrossValidation with 10-Fold

In [18]:
# 10-FOLD CROSS VALIDATION OF A FRESH, UNFITTED SVC ON THE PUBLIC PROFILES
svc_cross = SVC(gamma='scale')
scores = cross_val_score(
    estimator=svc_cross,
    X=trainset_publics,
    y=targets_publics,
    cv=10,     # number of folds
    n_jobs=4,  # evaluate the folds with 4 parallel workers
)
# REPORT THE MEAN OF THE PER-FOLD SCORES
print('Accuracy with 10-Fold CrossValidation:  ', scores.mean(), sep='')

Accuracy with 10-Fold CrossValidation:  0.8815681467076584


# Classification with NaN Values replaced with Median

In [19]:
# CLASSIFICATION WITH NaN VALUES REPLACED WITH THE MEDIAN
# Build the dataset with the project helper fill_NaN_median.
# NOTE(review): presumably the medians are computed over the whole dataset,
# i.e. before any train/test split - confirm.
dataset_median = fill_NaN_median(dataset)

# FEATURES AND LABELS OF THE MEDIAN-FILLED DATASET
trainset_median = get_trainset(dataset_median)
targets_median = get_target_dataset(dataset_median)

In [20]:
# SCALE THE MEDIAN-FILLED FEATURES WITH THE PROJECT HELPER StandardScale_dataset
# NOTE(review): scaling is applied before the train/test split and before
# cross-validation; if the helper fits its scaler on all rows, the held-out
# rows influence the scaling (data leakage) - confirm.
trainset_median = StandardScale_dataset(trainset_median)

In [21]:
# TRAINING THE SVM (SVC) ON THE MEDIAN-FILLED DATASET
# The split variables and svc/predictions from the previous section are overwritten.
x_train, x_test, y_train, y_test = train_test_split(
    trainset_median,
    targets_median,
    test_size=0.2,       # hold out 20% of the rows for evaluation
    random_state=12345,  # fixed seed so the split is reproducible
)

# FIT THE CLASSIFIER ON THE TRAINING PART AND PREDICT THE HELD-OUT PART
svc = SVC(gamma='scale').fit(x_train, y_train)  # fit() returns the estimator
predictions = svc.predict(x_test)

# Confusion Matrix 

In [22]:
# CONFUSION MATRIX ON THE HELD-OUT SPLIT (MEDIAN-FILLED DATASET)
# Rows are the true classes and columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[1321  240]
 [ 141 1360]]


# Classification Report

In [23]:
# PER-CLASS PRECISION / RECALL / F1 ON THE HELD-OUT SPLIT (MEDIAN-FILLED DATASET)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

       False       0.90      0.85      0.87      1561
        True       0.85      0.91      0.88      1501

    accuracy                           0.88      3062
   macro avg       0.88      0.88      0.88      3062
weighted avg       0.88      0.88      0.88      3062



## CrossValidation with 10-Fold

In [24]:
# 10-FOLD CROSS VALIDATION OF A FRESH, UNFITTED SVC ON THE MEDIAN-FILLED DATASET
svc_cross = SVC(gamma='scale')
scores = cross_val_score(
    estimator=svc_cross,
    X=trainset_median,
    y=targets_median,
    cv=10,     # number of folds
    n_jobs=4,  # evaluate the folds with 4 parallel workers
)
# REPORT THE MEAN OF THE PER-FOLD SCORES
print('Accuracy with 10-Fold CrossValidation:  ', scores.mean(), sep='')

Accuracy with 10-Fold CrossValidation:  0.8725494462362839
