In [22]:
import pandas as pd
import numpy as np
import re
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio

init_notebook_mode(connected=True)
plotly.__version__

import os
import pickle

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

In [23]:
# Load the pre-processed tweets table; its user_id and tweetsCount columns are used by the cells below.
tweets = pd.read_pickle('../datasets/tweetsProcesados_KM.pkl')

In [24]:
# The scraped tweets are split across six pickle files (k1-k3, m1-m3); stack them into one frame.
scraped = pd.concat([
    pd.read_pickle('../datasets/TweetsScrapedUsers_{}.pkl'.format(part))
    for part in ('k1', 'k2', 'k3', 'm1', 'm2', 'm3')
])
# The user profiles are split across two pickle files (K and M).
original = pd.concat([
    pd.read_pickle('../datasets/Users_{}.pkl'.format(part))
    for part in ('K', 'M')
])

# Keep one row per user id, restricted to users that appear in the scraped tweets.
# drop_duplicates returns a new frame here instead of mutating a filtered slice in place.
users = original[original.id.isin(scraped.user_id)].drop_duplicates(subset='id')
# Keep only users that also appear in `tweets`; the id is compared in its string form.
users = users[users.id.astype(str).isin(tweets.user_id)]

In [25]:
# Keep only the rows whose tweetsCount is above 50.
has_enough_tweets = tweets.tweetsCount.apply(int) > 50
tweets = tweets[has_enough_tweets]

# Re-key both frames with integer index values so they can be joined on the index.
for frame in (users, tweets):
    frame.index = frame.index.map(int).values

# Row counts first, then a two-row preview of each frame.
for frame in (users, tweets):
    print(len(frame))
for frame in (users, tweets):
    display(frame.head(2))

787
756


Unnamed: 0,id,screen_name,complete_name,tweet_count,description,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
347513688,347513688,cinedelabestia,El cine de La Bestia,14046,Cine y televisión bestiales.,Buenos Aires,False,False,0,2011-08-01 00:00:00,2019-06-29 23:15:54.650537,21379,2368,543,False
470238248,470238248,14capo,HUGO,32185,Hacia la Victoria Siempre ......,Por Ahí,False,False,0,2012-01-01 00:00:00,2019-07-12 23:36:30.289443,16309,2511,3996,False


Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,num_mentions_mean,num_mentions_median,num_mentions_std,num_mentions_fq,num_mentions_tq,reply_count_mean,reply_count_median,reply_count_std,reply_count_fq,reply_count_tq
1001242472738156547,1001242472738156547,758,27,3.8,4.428571,10.428571,7.789474,4.380952,4.2,3.833333,...,4274.193548387097,0.0,6269.475295633572,0.0,10000.0,241.93548387096772,0.0,1542.7299619963394,0.0,0.0
1001522970580398082,1001522970580398082,614,14,9.0,9.363636,9.583333,7.833333,7.5,5.0,4.818182,...,0.0,0.0,0.0,0.0,0.0,11486.486486486489,0.0,33306.75535378815,0.0,10000.0


In [26]:
# Inner join on the index: only ids present in both `tweets` and `users` are kept.
completeDf = tweets.join(users, how='inner')
completeDf.head()

Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
1001242472738156547,1001242472738156547,758,27,3.8,4.428571,10.428571,7.789474,4.380952,4.2,3.833333,...,False,False,False,0,2018-05-01 00:00:00,2019-07-12 23:51:46.289253,4632,50,159,False
1001522970580398082,1001522970580398082,614,14,9.0,9.363636,9.583333,7.833333,7.5,5.0,4.818182,...,False,False,False,0,2018-05-01 00:00:00,2019-07-12 23:40:00.405996,30576,1048,1190,False
1001725257647034368,1001725257647034368,661,26,4.434783,4.714286,4.045455,4.272727,4.521739,4.0,4.65,...,Mdq,False,False,0,2018-05-01 00:00:00,2019-07-12 23:52:06.755854,5847,979,1383,False
1002307032748261377,1002307032748261377,661,25,4.045455,6.041667,4.333333,4.695652,4.73913,4.470588,2.307692,...,"Comuna 15, CABA",False,False,0,2018-05-01 00:00:00,2019-07-12 23:40:23.990932,5,111,49,False
1005247915231719430,1005247915231719430,1168,56,4.705882,4.967742,6.514286,6.393939,5.633333,5.096774,4.190476,...,"Buenos Aires, Argentina",False,False,0,2018-06-01 00:00:00,2019-07-12 23:40:44.857124,2568,259,643,False


In [27]:
# Define the feature transformer.
# NOTE(review): the pickled model loaded right after this class presumably refers to it by
# name, so the class name is kept as `twDataTransform` — confirm.
class twDataTransform(TransformerMixin):
    """Build the numeric feature matrix from the joined tweets/users frame.

    ``transform`` expects a DataFrame that contains ``crawled_at``, ``created_at``,
    the account counters (``tweet_count``, ``listed_count``, ``followers_count``,
    ``following_count``), ``semanas``, the weekday columns ``lun`` .. ``dom`` and the
    ``<metric>_mean/_median/_std/_fq/_tq`` tweet statistics. It returns a new,
    all-float DataFrame with a fixed column order. The input frame is not modified.
    """

    _DATE_COLUMNS = ('crawled_at', 'created_at')
    _COUNT_COLUMNS = ('tweet_count', 'listed_count', 'followers_count', 'following_count')
    _ACTIVITY_COLUMNS = ('semanas', 'lun', 'mar', 'mie', 'jue', 'vie', 'sab', 'dom')
    _TWEET_METRICS = ('times', 'timesRT', 'timesUser', 'favorite_count',
                      'retweet_count', 'num_mentions', 'reply_count')
    _STAT_SUFFIXES = ('_mean', '_median', '_std', '_fq', '_tq')

    def fit(self, X, Y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def _feature_columns(self):
        """Return the output column names in their fixed order."""
        per_month = [name + '_mes' for name in self._COUNT_COLUMNS]
        tweet_stats = [
            metric + suffix
            for metric in self._TWEET_METRICS
            for suffix in self._STAT_SUFFIXES
        ]
        return list(self._ACTIVITY_COLUMNS) + per_month + tweet_stats

    def transform(self, X, Y=None):
        """Return the feature DataFrame computed from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Joined tweets/users frame (see the class docstring for required columns).
        Y : ignored

        Returns
        -------
        pandas.DataFrame
            Float features, rounded to 10 decimals, with NaN and +/-inf replaced by 0.

        Raises
        ------
        KeyError
            If a required column is missing from ``X``.
        """
        # Work on a copy so the caller's frame is not altered by the helper columns.
        df = X.copy()

        # utc=True localises naive timestamps to UTC and converts tz-aware ones to UTC,
        # so each column is handled independently of whether the other is naive or aware.
        for col in self._DATE_COLUMNS:
            df[col] = pd.to_datetime(df[col], utc=True)

        account_days = (df['crawled_at'] - df['created_at']).dt.days
        # Account age in 30-day months; an age that rounds to 0 counts as 1 month
        # so the division below never divides by zero.
        account_months = np.round(account_days / 30).replace(0, 1)

        # Counters normalised by account age (only the per-month rates are features).
        for col in self._COUNT_COLUMNS:
            df[col + '_mes'] = df[col].astype(float) / account_months

        features = df[self._feature_columns()].fillna(0).astype(float)
        # Non-finite values become 0, like missing values.
        features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
        return features.round(10)
    
# Load the trained model from disk (pickle is already imported in the first cell).
# NOTE(review): pickle.load can execute arbitrary code; only load model files from a trusted source.
with open('../modelo/modeloCompleto.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [28]:
# Class probabilities for every joined user, indexed like completeDf.
# NOTE(review): the column labels assume the model orders its classes as (human, bot) —
# confirm against model.classes_.
predicciones = pd.DataFrame(
    model.predict_proba(completeDf),
    index=completeDf.index,
    columns=['Prob_Human', 'Prob_Bot'],
)
# Put the bot probability next to the user/tweet columns.
usuarios_prob = pd.concat([predicciones['Prob_Bot'], completeDf], axis=1)

In [29]:
# For 300 evenly spaced thresholds in [0, 1], count the users whose bot
# probability is strictly above the threshold.
thresholds = np.linspace(0, 1, 300)
bot_probs = usuarios_prob['Prob_Bot']
users_above = [len(usuarios_prob[bot_probs > threshold]) for threshold in thresholds]

# Single line trace with a left-aligned title.
fig = go.Figure(
    data=[go.Scatter(name='Inscriptos', x=thresholds, y=users_above)],
    layout=go.Layout(
        title=go.layout.Title(text='Cantidad bots', xref='paper', x=0),
    ),
)
print(len(usuarios_prob))
iplot(fig)

754


In [30]:
# Cut-off on the bot probability; also used by the export cells below.
probaFilter = .7
is_probable_bot = usuarios_prob.Prob_Bot > probaFilter
probable_bots = usuarios_prob[is_probable_bot]
# Percentage of all scored users that are above the cut-off.
print(len(probable_bots) / len(usuarios_prob) * 100)
# Flagged accounts, highest bot probability first.
probable_bots.sort_values('Prob_Bot', ascending=False)[['Prob_Bot','screen_name','complete_name','description']]

5.835543766578249


Unnamed: 0,Prob_Bot,screen_name,complete_name,description
762062108674711552,0.973787,elcorreoradio,El Correo Radio,|Magazine de noticias| • |Lunes a Viernes: 7:3...
3042848837,0.971794,Pcia_Noticias,Provincia Noticias,Síntesis Informativa de la Provincia de Buenos...
1006342696838057991,0.960016,elagoradigital,El Ágora,
3434951685,0.953304,QestionPolitica,Cuestión Política,Noticias políticas de la cuarta sección electo...
1121417139863326720,0.928259,Xqestendencia1,por qué carajo es tendencia?,entérate al toque porque algo es tendencia en ...
532911351,0.923779,enSEMANAecg,enSEMANA (El Correo Gráfico),enSEMANA es ahora El Correo Gráfico. Visite ht...
199464393,0.91905,TrendsRosario,Trendsmap Rosario,En tiempo real las tendencias de Rosario
1095362389,0.903691,Pat60Patricia,PATRICIA,ESPOSA Y COMPAÑERA DE MI COMPAÑERO MADRE DE 3 ...
1121853703596670978,0.903054,verde_pantera,Pantera Verde 💚 💚,"Humor político nacional, popular, feminista y ..."
1103227572496879616,0.900948,soledad_urruti,Soledad Urruti,Vive y deja vivir


In [31]:
# Dump the flagged users to an Excel file for further processing.
# NOTE(review): openpyxl is not referenced by name; presumably imported so a missing
# Excel engine fails before the export — confirm.
import openpyxl
flagged = usuarios_prob[usuarios_prob.Prob_Bot > probaFilter]
flagged = flagged.sort_values('Prob_Bot', ascending=False)
report_columns = ['Prob_Bot','screen_name','complete_name','tweet_count','description']
flagged[report_columns].to_excel('../datasets/predictedbots_km.xlsx',index=False)

In [32]:
# Persist the bot probability and user id of the flagged users, highest probability first.
above_cutoff = usuarios_prob.Prob_Bot > probaFilter
ranked = usuarios_prob[above_cutoff].sort_values('Prob_Bot', ascending=False)
ranked[['Prob_Bot','user_id']].to_pickle('../datasets/usersprobasbot.pkl')