In [6]:
# !pip install import_ipynb --user
import pandas as pd
from catboost import CatBoostClassifier
from sklearn import preprocessing
import pickle
import import_ipynb
import numpy as np

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio

# Enable plotly's offline notebook rendering so the iplot() calls below draw inline.
# connected=True: per the plotly offline docs, plotly.js is loaded from the web
# instead of being embedded in the notebook, so rendering needs network access.
init_notebook_mode(connected=True)
# Show the installed plotly version as the cell output.
plotly.__version__

'3.8.1'

In [7]:
# Load the users table from a pickle file.
# NOTE: read_pickle unpickles arbitrary objects; only point it at trusted files.
users = pd.read_pickle('../datasets/UsersDataset.pkl')
# Preview the first three rows.
users.head(3)

Unnamed: 0_level_0,id,screen_name,complete_name,tweet_count,description,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2300560809,2300560809,NMoraniB,Nicole Morani Brown,593,Gerencia General de @PuertoBsAs Ministerio de ...,"Buenos Aires, Argentina",False,False,0,2014-01-01 00:00:00,2019-06-29 23:04:21.059135,2377,1630,1510,False
1536923558,1536923558,FabianPereyra87,Fabian Pereyra,2532,,False,False,False,0,2013-06-01 00:00:00,2019-06-29 23:04:21.289518,4307,53,284,False
235491971,235491971,gsrcaballero,Gus Caballero,21635,"Papa de Cami y Rochi, FANA de La Academia, PRO...",False,False,False,0,2011-01-01 00:00:00,2019-06-29 23:04:21.030207,5683,1691,1360,False


In [8]:
# Load model (the commented-out lines below are disabled alternatives kept for reference)
# model = CatBoostClassifier().load_model("../modelo/modelo")

# import os
# os.chdir("../modelo/")
# from generacionModelo import transform_x
# os.chdir("../dataAnalysis/")

# from '../modelo/Model UserBasicClassifier.ipynb' import transform_x

def transform_x(df):
    """Build the feature frame passed to the classifier from raw user rows.

    The input frame is NOT modified: all work happens on a copy, so the
    function can be called repeatedly on the same frame with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``crawled_at`` and ``created_at`` (anything
        ``pd.to_datetime`` accepts), the counters ``tweet_count``,
        ``listed_count``, ``favs_count``, ``followers_count`` and
        ``following_count``, and the columns ``verified``, ``description``,
        ``location`` and ``url``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``verified``, ``description``, ``location``,
        ``url`` (True where the original value is not NaN/None), then for each
        counter ``x``: ``x``, ``x_mes`` (per ~30-day month of account life) and
        ``x_semana`` (per week of account life). Remaining NaN are set to 0.
    """
    numValues = ['tweet_count', 'listed_count', 'favs_count', 'followers_count', 'following_count']
    presence_cols = ['verified', 'description', 'location', 'url']

    # Parse the timestamps into local Series instead of writing them back with
    # ``df.loc[:, col] = ...``: that both mutated the caller's frame and, on
    # recent pandas, can keep the column's old (object) dtype, which breaks
    # the ``.dt`` access below.
    crawled_at = pd.to_datetime(df['crawled_at'])
    created_at = pd.to_datetime(df['created_at'])
    account_life_days = (crawled_at - created_at).dt.days

    # Account age in whole months / weeks; a rounded age of 0 becomes 1 so the
    # per-period rates below never divide by zero.
    account_life_meses = np.round(account_life_days / 30).replace(0, 1)
    account_life_semanas = np.round(account_life_days / 7).replace(0, 1)

    # Work on a copy restricted to the columns we need; the caller's frame
    # stays untouched.
    features = df[presence_cols + numValues].copy()

    # NOTE(review): these flags only test for NaN/None. Sentinel values such as
    # a literal False in ``location``/``url`` count as "present". Kept as-is so
    # the features match what the saved model was presumably trained on.
    for col in presence_cols:
        features[col] = ~features[col].isna()

    for col in numValues:
        total = features[col].astype(float)
        features[col + '_mes'] = total / account_life_meses
        features[col + '_semana'] = total / account_life_semanas

    # Same column order as before: flags first, then each counter followed by
    # its monthly and weekly rate.
    columnas = list(presence_cols)
    for col in numValues:
        columnas.extend([col, col + '_mes', col + '_semana'])

    return features[columnas].fillna(0)

In [9]:
# Load the pickled model object from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open('../modelo/modelPipeline.pkl', 'rb') as m_pred:
    model = pickle.load(m_pred)
# Display the loaded object to confirm what was unpickled.
model

Pipeline(memory=None,
     steps=[('cat', <catboost.core.CatBoostClassifier object at 0x0000019C1A468C18>)])

In [16]:
# Build the feature frame with transform_x and get the model's per-class
# probability estimates for every row of `users`.
predicciones = model.predict_proba(transform_x(users))

In [17]:
# Pair each screen name with its predicted probability. Both sides get a fresh
# positional index, so rows are matched by order of appearance in `users`.
# Column 1 of the prediction output is taken as the bot probability.
usuarios_prob = (
    pd.DataFrame({'Usuario': users.screen_name.to_list()})
    .join(pd.DataFrame(predicciones)[1].rename('Prob_Bot'))
)
# Keep those with a probability above 99% of being bots according to the basic model
# usuarios_prob.sort_values(by='Prob_Bot',ascending=False)

In [18]:
# For 200 evenly spaced thresholds in [0, 1], count how many users have a
# predicted bot probability strictly above each threshold.
xval = np.linspace(0, 1, 200)
yval = [int((usuarios_prob['Prob_Bot'] > umbral).sum()) for umbral in xval]

# Line of "users above threshold" versus threshold.
# (Optional spline smoothing left disabled: line={'shape': 'spline', 'smoothing': 0.7})
trace1 = go.Scatter(name='Inscriptos', x=xval, y=yval)

# Left-aligned figure title.
layout = go.Layout(
    title=go.layout.Title(text='Cantidad bots', xref='paper', x=0),
)
fig = go.Figure(data=[trace1], layout=layout)

# Total number of scored users, for reference next to the plot.
print(len(usuarios_prob))
iplot(fig)

15482


In [19]:
# Users whose predicted bot probability exceeds 0.2, highest probability first.
usuarios_prob.loc[lambda tabla: tabla['Prob_Bot'] > .2].sort_values(by='Prob_Bot', ascending=False)

Unnamed: 0,Usuario,Prob_Bot
9657,paxbisonica,0.999482
13335,ConsultoraPrax1,0.999007
6223,Moka_Fin,0.998156
5158,infpolitico,0.997068
8325,Carlosofando,0.997002
3912,ElNacionalNoti,0.996371
11193,kgarcia0191,0.996234
4661,Mario_PSCL,0.995950
10661,DosTitulos,0.995772
5961,RicardoIorio_,0.995530


In [36]:
import math

# Despite its name, 'probLog' holds the square root of Prob_Bot (not a
# logarithm); the square root spreads out the small probabilities.
usuarios_prob['probLog'] = usuarios_prob.Prob_Bot.map(math.sqrt)

# For 200 evenly spaced thresholds in [0, 1], count how many users have a
# square-root-scaled probability strictly above each threshold.
xval = np.linspace(0, 1, 200)
yval = [int((usuarios_prob['probLog'] > umbral).sum()) for umbral in xval]

# Line of "users above threshold" versus threshold.
# (Optional spline smoothing left disabled: line={'shape': 'spline', 'smoothing': 0.7})
trace1 = go.Scatter(name='Inscriptos', x=xval, y=yval)

# Left-aligned figure title.
layout = go.Layout(
    title=go.layout.Title(text='Cantidad bots', xref='paper', x=0),
)
fig = go.Figure(data=[trace1], layout=layout)

# Total number of scored users, for reference next to the plot.
print(len(usuarios_prob))
iplot(fig)

15482
