# Modelo para clasificar usuarios de Twitter

In [75]:
import pandas as pd
import numpy as np

import os
import pickle

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [30]:
#importamos features generadas a partir de tweets de bots y humanos
users = pd.read_csv('features_generadas.csv')
users.drop(['user_id', 'user_id.1','tweetsCount', 'semanas'], axis=1,inplace=True)
users = users.fillna(0)

print(len(users), 'usuarios')
users.head()

5995 usuarios


Unnamed: 0,bot,Cat,lun,mar,mie,jue,vie,sab,dom,times_mean,...,num_mentions_mean,num_mentions_median,num_mentions_std,num_mentions_fq,num_mentions_tq,reply_count_mean,reply_count_median,reply_count_std,reply_count_fq,reply_count_tq
0,False,Humans,20.56,24.32,19.56,18.2,17.72,11.44,16.0,4710.0,...,9019.78022,10000.0,8720.070307,0.0,10000.0,0.0,0.0,0.0,0.0,0.0
1,False,Humans,6.678571,5.069767,6.321429,5.602273,4.951807,4.654762,4.609756,16468.0,...,5439.727066,0.0,6453.941616,0.0,10000.0,0.0,0.0,0.0,0.0,0.0
2,False,Humans,11.423077,16.666667,18.464286,31.448276,23.586207,7.285714,6.041667,5342.0,...,9337.349398,10000.0,10512.287386,0.0,20000.0,0.0,0.0,0.0,0.0,0.0
3,False,Humans,5.046512,5.227273,6.72619,5.116279,5.845238,4.858824,4.954023,16588.0,...,6102.096011,10000.0,6578.193523,0.0,10000.0,0.0,0.0,0.0,0.0,0.0
4,False,Humans,24.0,19.285714,27.047619,31.631579,29.6,15.55,15.894737,3867.0,...,7184.534271,10000.0,8842.028988,0.0,10000.0,0.0,0.0,0.0,0.0,0.0


In [31]:
 users.groupby('Cat').count()[['bot']].rename(index=str, columns={'bot':'Cantidad'})

Unnamed: 0_level_0,Cantidad
Cat,Unnamed: 1_level_1
Humans,1083
SocialSpambot1,991
SocialSpambot2,3457
SocialSpambot3,464


In [40]:
le = LabelEncoder()
scaler = StandardScaler()

X = users.drop(['Cat', 'bot'], axis=1)
y = le.fit_transform(users['bot'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [53]:
logistic =  LogisticRegression(penalty='l1', C=1.2, class_weight='balanced')
hyperparameters = {

}
param_trees = {
    'n_estimators': [50, 100, 200], 
   'max_features': [1, 5, 8, 10, 21]
    
}
kf = StratifiedKFold(n_splits=3, shuffle=True)

bdt = BaggingClassifier(base_estimator=logistic, n_jobs=-1, random_state=7, verbose=1)
grid_search_bdt = GridSearchCV(bdt, param_grid=param_trees, cv=kf, verbose=1, n_jobs=-1)
bestModel = grid_search_bdt.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  2.5min finished
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    3.0s remaining:   15.6s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   15.3s finished


In [57]:
finalParams = grid_search_bdt.best_estimator_.get_params()

In [56]:
y_pred = bestModel.predict(X_test)
print(accuracy_score(y_test, y_pred))

conf = confusion_matrix(y_test, y_pred)
pd.DataFrame(conf, index = le.classes_, columns = ['pred_'+str(c) for c in le.classes_])

[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    1.8s remaining:    9.6s


0.9649805447470817


[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    7.1s finished


Unnamed: 0,pred_False,pred_True
False,334,11
True,52,1402


In [70]:
X_final = scaler.transform(users.drop(['Cat', 'bot'], axis=1))
y_final = le.transform(users['bot'])

In [74]:
finalModel = grid_search_bdt.best_estimator_
finalModel.verbose = 0
finalModel.fit(X_final, y_final)
finalModel

BaggingClassifier(base_estimator=LogisticRegression(C=1.2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=21,
         max_samples=1.0, n_estimators=50, n_jobs=-1, oob_score=False,
         random_state=7, verbose=0, warm_start=False)

In [76]:
#guardar modelo
filename = 'twitterClassifier.pkl'
with open(filename, 'wb') as twModel:
    pickle.dump(finalModel, twModel)