In [14]:
import pandas as pd
import numpy as np
import re
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio

init_notebook_mode(connected=True)
plotly.__version__

import os
import pickle

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

In [2]:
# Read the two pickled datasets: per-user tweet statistics and user profiles.
tweets = pd.read_pickle('../datasets/tweetsProcesados.pkl')
users = pd.read_pickle('../datasets/UsersDataset.pkl')

# Keep only the rows whose tweetsCount (cast to int) is above 50.
tweets = tweets.loc[tweets['tweetsCount'].apply(int) > 50]

# Cast both indexes to int; they are the keys of the index join further down.
tweets.index = tweets.index.map(int).values
users.index = users.index.map(int).values

# Row counts followed by a two-row preview of each frame.
print(len(users))
print(len(tweets))
display(users.head(2))
display(tweets.head(2))

15482
1499


Unnamed: 0,id,screen_name,complete_name,tweet_count,description,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
2300560809,2300560809,NMoraniB,Nicole Morani Brown,593,Gerencia General de @PuertoBsAs Ministerio de ...,"Buenos Aires, Argentina",False,False,0,2014-01-01 00:00:00,2019-06-29 23:04:21.059135,2377,1630,1510,False
1536923558,1536923558,FabianPereyra87,Fabian Pereyra,2532,,False,False,False,0,2013-06-01 00:00:00,2019-06-29 23:04:21.289518,4307,53,284,False


Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,num_mentions_mean,num_mentions_median,num_mentions_std,num_mentions_fq,num_mentions_tq,reply_count_mean,reply_count_median,reply_count_std,reply_count_fq,reply_count_tq
102407528,102407528,168,10,3.166667,3.666667,3.833333,5.142857,4.625,3.5,3.333333,...,833.3333333333333,0.0,2776.739162008521,0.0,0.0,21574.074074074077,10000.0,31330.67640797719,0.0,30000.0
1032436344759504896,1032436344759504896,87,4,3.666667,3.5,3.0,4.5,6.0,5.666667,10.333333,...,0.0,0.0,0.0,0.0,0.0,,,,,


In [3]:
# Index-on-index inner join: only rows whose index label exists in both
# `tweets` and `users` survive; tweet columns come first, then user columns.
completeDf = tweets.join(other=users, how='inner')
completeDf.head()

Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
102407528,102407528,168,10,3.166667,3.666667,3.833333,5.142857,4.625,3.5,3.333333,...,False,False,tril.us,1,2010-01-01 00:00:00,2019-07-01 00:21:59.629914,18341,7119,365,False
1032436344759504896,1032436344759504896,87,4,3.666667,3.5,3.0,4.5,6.0,5.666667,10.333333,...,False,False,False,0,2018-08-01 00:00:00,2019-07-02 19:32:30.291094,7070,41,50,False
1032441989915140096,1032441989915140096,178,18,2.333333,2.166667,2.0,2.555556,1.666667,2.416667,1.636364,...,"Caleta Olivia, Argentina",False,False,0,2018-08-01 00:00:00,2019-07-02 19:32:37.788036,983,51,82,False
1032737865602674690,1032737865602674690,177,9,2.75,7.333333,2.0,7.333333,14.333333,11.75,6.5,...,@brknyouth en ig skeree,False,curiouscat.me/etherealpilots,0,2018-08-01 00:00:00,2019-07-02 19:32:45.626155,2628,460,300,False
1032811143553789952,1032811143553789952,109,3,6.5,22.0,24.0,16.0,4.0,4.0,9.0,...,"Córdoba, Argentina",False,False,0,2018-08-01 00:00:00,2019-06-29 23:08:43.663503,11963,114,204,False


In [105]:
# Transformer class that builds the numeric feature frame.
class twDataTransform(TransformerMixin):
    """Stateless transformer that derives a purely numeric feature frame.

    ``transform`` expects a DataFrame holding:

    * ``crawled_at`` / ``created_at`` -- timestamps (naive or tz-aware),
    * the counters listed in ``_PROFILE_COUNTERS``,
    * the weekly/daily columns listed in ``_ACTIVITY_COLUMNS``,
    * ``<metric><suffix>`` for every metric in ``_TWEET_METRICS`` and every
      suffix in ``_STAT_SUFFIXES``.

    The input frame is never modified; all work happens on a copy.
    """

    # Columns copied through unchanged (apart from the float cast).
    _ACTIVITY_COLUMNS = ['semanas', 'lun', 'mar', 'mie', 'jue', 'vie', 'sab', 'dom']
    # Counters that are divided by the account age in months.
    _PROFILE_COUNTERS = ['tweet_count', 'listed_count', 'followers_count', 'following_count']
    # Per-tweet metrics and the summary-statistic suffixes available for each.
    _TWEET_METRICS = ['times', 'timesRT', 'timesUser', 'favorite_count',
                      'retweet_count', 'num_mentions', 'reply_count']
    _STAT_SUFFIXES = ['_mean', '_median', '_std', '_fq', '_tq']

    def fit(self, X, Y=None):
        """Nothing is learned; return ``self``.

        ``Y`` is optional so that ``fit_transform(X)`` without labels works.
        """
        return self

    def transform(self, X, Y=None):
        """Return the feature frame computed from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the columns described in the class docstring.
        Y : ignored

        Returns
        -------
        pandas.DataFrame
            Float columns in a fixed order: activity columns, then
            ``<counter>_mes`` columns, then the tweet statistics. Missing and
            non-finite values are replaced by 0 and values are rounded to
            10 decimals.
        """
        # Work on a copy so the caller's frame keeps its columns and dtypes.
        df = X.copy()

        # utc=True localises naive timestamps to UTC and converts tz-aware
        # ones to UTC, so no try/except chain is needed to tell them apart.
        crawled_at = pd.to_datetime(df['crawled_at'], utc=True)
        created_at = pd.to_datetime(df['created_at'], utc=True)
        account_life = crawled_at - created_at

        # Account age in (30-day) months; 0 becomes 1 to avoid dividing by zero.
        account_life_meses = np.round(account_life.dt.days / 30).replace(0, 1)

        # Per-month rates. Per-week rates are not part of the returned
        # feature set, so they are not computed.
        monthly_columns = []
        for counter in self._PROFILE_COUNTERS:
            monthly_name = counter + '_mes'
            df[monthly_name] = df[counter].astype(float) / account_life_meses
            monthly_columns.append(monthly_name)

        tweet_stat_columns = [
            metric + suffix
            for metric in self._TWEET_METRICS
            for suffix in self._STAT_SUFFIXES
        ]

        # Column order matters: a fitted downstream estimator sees positions.
        feature_columns = self._ACTIVITY_COLUMNS + monthly_columns + tweet_stat_columns

        features = df[feature_columns].astype(float)
        # NaN and +/-inf all collapse to 0 before rounding.
        features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
        return features.round(10)
    
# Load the pickled model object from disk (pickle is imported in the first cell).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# NOTE(review): presumably the pickled object references twDataTransform, which
# would explain why the class is defined in this cell before loading -- confirm.
with open('../modelo/modeloCompleto.pkl', 'rb') as m_pred:
    model = pickle.load(m_pred)

In [126]:
# Class probabilities for every row of completeDf, labelled and indexed like
# the input frame so they can be placed side by side with it.
# NOTE(review): the column labels assume predict_proba returns exactly two
# columns ordered (human, bot) -- confirm against the model's classes_.
predicciones = pd.DataFrame(
    model.predict_proba(completeDf),
    index=completeDf.index,
    columns=['Prob_Human', 'Prob_Bot'],
)

# Prob_Bot goes first, followed by every column of completeDf.
usuarios_prob = pd.concat([predicciones['Prob_Bot'], completeDf], axis=1)

In [127]:
# For 300 evenly spaced thresholds in [0, 1], count how many users have a
# Prob_Bot strictly above the threshold.
xval = np.linspace(0, 1, 300)
yval = [
    int((usuarios_prob['Prob_Bot'] > umbral).sum())
    for umbral in xval
]

# Single line trace: threshold on x, number of users above it on y.
trace1 = go.Scatter(name='Inscriptos', x=xval, y=yval)

# Title anchored to the left edge of the plotting area.
layout = go.Layout(
    title={'text': 'Cantidad bots', 'xref': 'paper', 'x': 0},
)

fig = go.Figure(data=[trace1], layout=layout)

# Total number of scored users, printed above the chart for reference.
print(len(usuarios_prob))
iplot(fig)

1499


In [139]:
# Probability cut-off above which a user is listed in the table below.
probaFilter = .7

# Rows whose Prob_Bot is strictly above the cut-off (computed once, used twice).
likely_bots = usuarios_prob[usuarios_prob['Prob_Bot'] > probaFilter]

# Share of all scored users that pass the cut-off, as a percentage.
print(len(likely_bots) / len(usuarios_prob) * 100)

# Highest probabilities first, showing only the profile columns.
likely_bots.sort_values('Prob_Bot', ascending=False)[
    ['Prob_Bot', 'screen_name', 'complete_name', 'tweet_count', 'description']
]

11.474316210807205


Unnamed: 0,Prob_Bot,screen_name,complete_name,tweet_count,description
1036196229422755840,0.995879,Abogadoexitoso2,Abogadoexitoso- Bruno del Oeste,4288,"Nací zurdo, pobre y en La Matanza, mi rebeldía..."
3374187035,0.995659,CPB_Noticias,CPB NOTICIAS,9591,Portal de noticias #Quilmes / Whapp: 112190269...
150470216,0.994469,SitiosArgentina,Sitios Argentina !,33188,Noticias interesantes increíbles e insólitas. ...
220475305,0.993677,infolaplata,infolaplata,26408,Te contamos lo que nadie te va a decir #LaPlat...
2874688017,0.993651,Tu67Line,diffusion line,31840,
938912443702677505,0.992634,absolutkilam,mart!na,5329,soy un cementerio de canelones
190061386,0.992607,NestorLuis,Néstor Luis Alvarez ⚡️,3379,Abogado - Docente - Cmte. de la Orden San Greg...
1033013247484588032,0.992093,Teto1316,Ernesto Marín Querol,4224,"Español, de derechas , seguidor del FC Barcelo..."
51890039,0.988970,Nikgaturro,Nik,45031,Amar Reír Disfrutar Soñar Vivir ★http://insta...
233411013,0.987344,NotasJBiebs,4Sexys Beliebers,75682,1 person+ 1 dream+ 2009+ 1 country+ believe= O...
