In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio

init_notebook_mode(connected=True)
plotly.__version__

import os
import pickle

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

In [2]:
scraped = pd.read_pickle('../datasets/TweetsScrapedUsers_final.pkl')
original = pd.concat([pd.read_pickle('../datasets/UsersDataset_2.pkl'),\
                 pd.read_pickle('../datasets/UsersDataset_3.pkl'),\
                 pd.read_pickle('../datasets/UsersDataset_4.pkl'),\
                 pd.read_pickle('../datasets/UsersDataset_5.pkl'),\
                 pd.read_pickle('../datasets/UsersDataset_6.pkl'),\
                 pd.read_pickle('../datasets/UsersDataset_7.pkl')])
users = original[original.id.isin(scraped.user_id)] 

In [3]:
tweets = pd.read_pickle('../datasets/tweetsProcesados.pkl')
#users = pd.read_pickle('../datasets/UsersDataset.pkl')

tweets = tweets[tweets.tweetsCount.apply(int) > 50]

users.index = users.index.map(int).values
tweets.index = tweets.index.map(int).values

print(len(users))
print(len(tweets))
display(users.head(2))
display(tweets.head(2))

1577
1671


Unnamed: 0,id,screen_name,complete_name,tweet_count,description,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
146431317,146431317,alitapaz,Ale,370962,Inventamos o erramos,Buenos Aires,False,False,1,2010-05-01 00:00:00,2019-06-29 23:15:56.175044,260133,4242,4988,False
4915730830,4915730830,Posadas_Unida,Unidad Posadeña.,1965,Defensor de la cultura y de toda Posadas. Siem...,"Posadas, Argentina",False,False,0,2016-02-01 00:00:00,2019-06-29 23:16:17.191700,55,4939,5407,False


Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,num_mentions_mean,num_mentions_median,num_mentions_std,num_mentions_fq,num_mentions_tq,reply_count_mean,reply_count_median,reply_count_std,reply_count_fq,reply_count_tq
10012122,10012122,803,2,177.0,186.0,164.0,141.0,10.0,125.0,531.0,...,0.0,2544.546934352676,0.0,0.0,32571.785268414485,10000.0,114786.22588505047,0.0,30000.0,
10122672,10122672,771,128,2.490566,2.0,2.068966,2.15625,2.272727,1.74359,2.242424,...,4805.194805194805,0.0,7688.441304995766,0.0,10000.0,38311.68831168831,20000.0,65167.50859297861,0.0,50000.0


In [4]:
completeDf = tweets.join(users, how='inner')
completeDf.head()

Unnamed: 0,user_id,tweetsCount,semanas,lun,mar,mie,jue,vie,sab,dom,...,location,birthdate,url,listed_count,created_at,crawled_at,favs_count,followers_count,following_count,verified
10012122,10012122,803,2,177.0,186.0,164.0,141.0,10.0,125.0,531.0,...,"Londres, Inglaterra.",False,bbcmundo.com,10,2007-11-01 00:00:00,2019-06-29 23:32:57.072146,920,3141578,399,True
10122672,10122672,771,128,2.490566,2.0,2.068966,2.15625,2.272727,1.74359,2.242424,...,"Rosario, Argentina",22 de septiembre,penotti.com.ar,186,2007-11-01 00:00:00,2019-07-02 18:59:47.874537,11197,56336,37916,False
10162502,10162502,818,100,2.7,2.210526,2.563636,2.959184,2.346154,2.03125,2.0,...,False,False,miriamruiz.es,25,2007-11-01 00:00:00,2019-06-30 21:29:20.733391,485,1345,1001,False
10246422,10246422,832,26,6.565217,6.26087,5.272727,5.130435,5.166667,3.5,4.857143,...,Neuquén,False,sigloxxieditores.com.ar/fichaLibro.php…,0,2007-11-01 00:00:00,2019-06-30 00:04:08.174415,70187,29614,7898,False
10283902,10283902,847,4,49.0,49.666667,50.333333,46.5,39.75,18.333333,2252.0,...,Chile,False,lanacion.cl,6,2007-11-01 00:00:00,2019-07-02 18:59:54.679406,804,323425,4354,True


In [5]:
#crear clase
class twDataTransform(TransformerMixin):
    def fit(self, X, Y):
        return self
    def transform(self, X, Y=None):
        df = X
        try:
            df.loc[:, 'crawled_at'] = pd.DatetimeIndex(pd.to_datetime(df.crawled_at, utc=None)).tz_convert(tz='UTC')
            df.loc[:, 'created_at'] = pd.DatetimeIndex(pd.to_datetime(df.created_at, utc=None)).tz_convert(tz='UTC')
        except:
            try:
                df.loc[:, 'crawled_at'] = pd.DatetimeIndex(pd.to_datetime(df.crawled_at, utc=None)).tz_localize(tz='UTC')
                df.loc[:, 'created_at'] = pd.DatetimeIndex(pd.to_datetime(df.created_at, utc=None)).tz_localize(tz='UTC')
            except:
                df.loc[:, 'crawled_at'] = pd.DatetimeIndex(pd.to_datetime(df.crawled_at, utc=None)).tz_convert(tz='UTC')
                df.loc[:, 'created_at'] = pd.DatetimeIndex(pd.to_datetime(df.created_at, utc=None)).tz_convert(tz='UTC')
        account_life = df.crawled_at - df.created_at
        account_life_meses = np.round(account_life.dt.days / 30)
        account_life_meses = account_life_meses.replace(0, 1)
        account_life_semanas = np.round(account_life.dt.days / 7)
        account_life_semanas = account_life_semanas.replace(0, 1)
        numValues = ['tweet_count', 'listed_count', 'followers_count', 'following_count']
        for i in numValues:
            colName_mes = i+'_mes'
            df.loc[:,colName_mes] = df[i].astype(float) / account_life_meses

            colName_semana = i+'_semana'
            df.loc[:,colName_semana] = df[i].astype(float) / account_life_semanas
        columnas = np.concatenate(
            [[x+'_mes'] for x in numValues]
        )
        columnasTweets = np.concatenate(
            [
                [x+'_mean',x+'_median', x+'_std', x+'_fq', x+'_tq'] for x in
                ['times','timesRT','timesUser','favorite_count','retweet_count','num_mentions','reply_count']
            ]
        )
        columnas = np.concatenate([
            ['semanas', 'lun', 'mar', 'mie', 'jue', 'vie', 'sab', 'dom'],columnas,columnasTweets
        ])
        df = df[columnas]
        df = df.fillna(0)
        df = df.applymap(lambda x: np.round(float(x), 10) if np.isfinite(float(x)) else np.nan)
        df = df.fillna(0)
        return df
    
#load model
import pickle
with open('../modelo/modeloCompleto.pkl', 'rb') as m_pred:
    model = pickle.load(m_pred)

In [6]:
predicciones = model.predict_proba(completeDf)
predicciones = pd.DataFrame(predicciones)
predicciones.index = completeDf.index
predicciones.columns = ['Prob_Human', 'Prob_Bot']
usuarios_prob = pd.concat([predicciones['Prob_Bot'], completeDf], axis=1)

In [7]:
yval = []
xval = np.linspace(0,1, 300)
for i in xval:
    cantUsuarios = len(usuarios_prob[usuarios_prob['Prob_Bot']>i])
    yval.append(cantUsuarios)
trace1 = go.Scatter(
    name='Inscriptos',
    x=xval,
    y=yval,
#     line={
#         'shape': 'spline',
#         'smoothing': 0.7,
#     }
)
layout = go.Layout(
    title=go.layout.Title(
        text='Cantidad bots',
        xref='paper',
        x=0
    ),
)
fig = go.Figure(data=[trace1], layout=layout)
print(len(usuarios_prob))
iplot(fig)

1564


In [8]:
probaFilter = .7
print(len(usuarios_prob[usuarios_prob.Prob_Bot > probaFilter]) / len(usuarios_prob) * 100)
usuarios_prob[usuarios_prob.Prob_Bot > probaFilter].sort_values('Prob_Bot', ascending=False)[['Prob_Bot','screen_name','complete_name','tweet_count','description']]

5.818414322250639


Unnamed: 0,Prob_Bot,screen_name,complete_name,tweet_count,description
1658740507,0.989592,twitpoliticoOK,twit político,4777,Todas las noticias que buscás están en http://...
827352011595194368,0.987680,QuorumARG,Quorum Digital,1334,Portal de noticias y actualidad del Conurbano ...
783129248,0.976391,Federico2015,Justicia y Libertad,9059,Soy un Argentino que me interesa saber a quien...
771243914,0.965900,Winifreda_Noti,Winifreda La Pampa,15799,"Noticias de la localidad de Winifreda , provin..."
461714626,0.954018,soywashington,Washington Party,115998,CANTANTE - ESCRITOR - PRODUCTOR - ARTISTA - EM...
7848912,0.950499,abc_gente,Gente en ABC,37889,"Noticias de la secciones Estilo, Gente y Moda ..."
8173372,0.940797,vinchusky,Very Hinchusky,28416,"Tomá, pa' los chicles. Y guardá el vuelto."
1455253418,0.936993,TVCanalC,Canal C Córdoba,23783,Cuenta oficial de Canal C Córdoba. En el 7 de ...
8973622,0.936785,InfobaeEconomia,Infobae Economía,28686,Todo el tiempo. Todas las noticias de economía...
2246294855,0.932285,_Nango_,N A N G O,19733,Cine - Series - Misceláneas (?) - Cuervo. Fan ...


In [9]:
# Drop a un archivo excel para procesar
import openpyxl
usuarios_prob[usuarios_prob.Prob_Bot > probaFilter].sort_values('Prob_Bot', ascending=False)[['Prob_Bot','screen_name','complete_name','tweet_count','description']]\
\.to_excel('../datasets/predictedbots.xlsx',index=False)

SyntaxError: unexpected character after line continuation character (<ipython-input-9-f111128d8be4>, line 4)