# Técnicas de Machine Learning para Classificação Supervisionada de Contas Bots do Twitter.

## FIA LABDATA - Turma 13

### Descrição dos Data Sets utilizados: 

Contas classificadas como reais e bots, anotadas pela [CrowdFlower](https://en.wikipedia.org/wiki/Figure_Eight_Inc.), conforme descrita no paper:[The Paradigm-Shift of Social Spambots: Evidence, Theories, and Tools for the Arms Race](http://dl.acm.org/citation.cfm?doid=3041021.3055135)

fonte: http://mib.projects.iit.cnr.it/dataset.html


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime

# Importando os Data Sets

In [3]:
df_genuine_accounts = pd.read_csv('data/cresci-2017/genuine_accounts.csv/tweets.csv')
df_genuine_accounts['classification'] = 'human'
df_genuine_accounts['dataset'] = 'genuine_accounts'

# elimina timezone da data
#df_genuine_accounts['created_at'] = pd.to_datetime(df_genuine_accounts['created_at']).dt.tz_localize(None)

In [4]:
df_social_spambots_1 = pd.read_csv('data/cresci-2017/social_spambots_1.csv/tweets.csv')
df_social_spambots_1['classification'] = 'bot'
df_social_spambots_1['dataset'] = 'social_spambots_1'

# elimina timezone da data
#df_social_spambots_1['created_at'] = pd.to_datetime(df_social_spambots_1['created_at']).dt.tz_localize(None)

In [5]:
df_social_spambots_2 = pd.read_csv('data/cresci-2017/social_spambots_2.csv/tweets.csv')
df_social_spambots_2['classification'] = 'bot'
df_social_spambots_2['dataset'] = 'social_spambots_2'

# elimina timezone da data
#df_social_spambots_2['created_at'] = pd.to_datetime(df_social_spambots_2['created_at']).dt.tz_localize(None)

In [6]:
df_social_spambots_3 = pd.read_csv('data/cresci-2017/social_spambots_3.csv/tweets.csv')
df_social_spambots_3['classification'] = 'bot'
df_social_spambots_3['dataset'] = 'social_spambots_3'

# elimina timezone da data
#df_social_spambots_3['created_at'] = pd.to_datetime(df_social_spambots_3['created_at']).dt.tz_localize(None)

In [8]:
df_traditional_spambots_1 = pd.read_csv('data/cresci-2017/traditional_spambots_1.csv/tweets.csv')
df_traditional_spambots_1['classification'] = 'bot'
df_traditional_spambots_1['dataset'] = 'traditional_spambots_1'

# Este dataset traz a variavel created_at em formato timestamp, em formato string com o 
# ultimo caracter = L (antiga marcação Python2 para Long)

# Elimina o ultimo caracter L das datas com formato timestamp 
df_traditional_spambots_1['created_at'] = pd.Series(df_traditional_spambots_1['created_at']).str[0:13]

# converte timestamp to datetime
df_traditional_spambots_1['created_at'] = df_traditional_spambots_1['created_at'].apply(
    lambda x: datetime.datetime.fromtimestamp(int(x)/1000)
)

In [9]:
df_fake_followers = pd.read_csv('data/cresci-2017/fake_followers.csv/tweets.csv')
df_fake_followers['classification'] = 'bot'
df_fake_followers['dataset'] = 'fake_followers'

# elimina timezone da data
#df_fake_followers['created_at'] = pd.to_datetime(df_fake_followers['created_at']).dt.tz_localize(None)

In [10]:
print(df_social_spambots_1.shape)
print(df_social_spambots_2.shape)
print(df_social_spambots_3.shape)
print(df_traditional_spambots_1.shape)
print(df_fake_followers.shape)
print(df_genuine_accounts.shape)

(1610034, 27)
(428542, 27)
(1418557, 27)
(145094, 27)
(196027, 25)
(2839361, 27)


# Concatenando os Datasets

In [11]:
# Append dos datasets
df_tweets = pd.concat([
    df_social_spambots_1,
    df_social_spambots_2,
    df_social_spambots_3,
    df_traditional_spambots_1,
    df_fake_followers,
    df_genuine_accounts
])

In [12]:
df_tweets.columns, df_tweets.shape

(Index(['id', 'text', 'source', 'user_id', 'truncated', 'in_reply_to_status_id',
        'in_reply_to_user_id', 'in_reply_to_screen_name', 'retweeted_status_id',
        'geo', 'place', 'contributors', 'retweet_count', 'reply_count',
        'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive',
        'num_hashtags', 'num_urls', 'num_mentions', 'created_at', 'timestamp',
        'crawled_at', 'updated', 'classification', 'dataset'],
       dtype='object'),
 (6637615, 27))

In [13]:
# Drop features listadas pelo twitter como deprecated

deprecated_features = [
    'geo',
    'contributors',
    'favorited',
    'retweeted',
    'crawled_at',
    'updated'
]

df_tweets = df_tweets.drop(deprecated_features, axis=1);

In [14]:
df_tweets.isnull().sum()

id                               0
text                         13006
source                          72
user_id                          0
truncated                  6636862
in_reply_to_status_id            0
in_reply_to_user_id              0
in_reply_to_screen_name    5598481
retweeted_status_id         196027
place                      6508964
retweet_count                    0
reply_count                      0
favorite_count                   0
possibly_sensitive         6610803
num_hashtags                     0
num_urls                         0
num_mentions                     0
created_at                       0
timestamp                        0
classification                   0
dataset                          0
dtype: int64

# Correção de valores das variáveis

In [15]:
# A variavel truncated possui 2 valores possiveis, de acordo com documentação do Twitter (True,False)
# Os data sets apresentam os valores 1 e NaN, portanto, Substituir valores NaN por 0.
df_tweets = df_tweets.fillna({'truncated':0})

df_tweets['truncated'].isnull().sum()

0

In [16]:
# A variavel possibly_sensitive possui 2 valores possiveis, de acordo com documentação do Twitter (True,False)
# Os data sets apresentam os valores 1 e NaN, portanto, Substituir valores NaN por 0.
df_tweets = df_tweets.fillna({'possibly_sensitive':0})

df_tweets['possibly_sensitive'].isnull().sum()

0

In [17]:
# investigando os tweets com text nulos
df_temp = df_tweets.query('text != text')
df_temp.groupby('dataset').agg(cont = ('id','count'))

Unnamed: 0_level_0,cont
dataset,Unnamed: 1_level_1
fake_followers,20
genuine_accounts,12571
social_spambots_1,90
social_spambots_2,321
social_spambots_3,4


In [18]:
# definindo '' branco para os tweets sem texto
df_tweets['text'] = df_tweets.fillna({'text':''}).filter(['text'])

df_tweets['text'].isnull().sum()

0

# Criando novas variáveis

In [19]:
# A variavel in_reply_to_screen_name quando diferente de null indica que o tweet é uma resposta ao scree_name em questão
# A variavel is_reply está sendo criada para indicar se o tweet é uma resposta a alguém.
df_tweets['is_reply'] = np.where(df_tweets['in_reply_to_screen_name'].isnull(),0,1)

In [20]:
# Todos os retweets tem o texto iniciado por RT
# Cria a variavel is_retweet para indicar se o tweet é na verdade um retweet baseado nas 2 primeira letras do text
df_tweets['is_retweet'] = np.where(df_tweets.text.str[0:2] == 'RT',1,0)

# Variáveis desprezadas

In [21]:
drop_features = [
    'in_reply_to_screen_name',
    'in_reply_to_status_id',
    'place',
    'retweeted_status_id',
    'source',
    'timestamp',
    'in_reply_to_user_id'
]

df_tweets.drop(drop_features, axis=1, inplace=True)

# Agregando valores para adicionar ao dataset de Accounts
#### Coletando métricas relacionadas aos usuários para que possam ser agregados ao dataset de Accounts

In [22]:
df_tweets.columns

Index(['id', 'text', 'user_id', 'truncated', 'retweet_count', 'reply_count',
       'favorite_count', 'possibly_sensitive', 'num_hashtags', 'num_urls',
       'num_mentions', 'created_at', 'classification', 'dataset', 'is_reply',
       'is_retweet'],
      dtype='object')

In [23]:
df_user_metrics = (
    df_tweets
    .groupby('user_id')
    .agg(
        tweet_count = ('id','count'),
        mean_hashtag_usage = ('num_hashtags','mean'),
        mean_url_usage = ('num_urls','mean'),
        mean_mention_usage = ('num_mentions','mean'),
        mean_replies_made = ('is_reply','mean'),
        mean_retweets_made = ('is_retweet','mean'),
        mean_reply_received = ('reply_count','mean'),
        mean_retweet_received = ('retweet_count','mean')
    )
)

In [24]:
df_user_metrics.head()

Unnamed: 0_level_0,tweet_count,mean_hashtag_usage,mean_url_usage,mean_mention_usage,mean_replies_made,mean_retweets_made,mean_reply_received,mean_retweet_received
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
678033,3195,0.368388,0.297653,1.112989,0.33928,0.28795,0.0,75.970266
722623,3201,0.360512,0.163386,0.711965,0.28085,0.175883,0.0,283.785067
755116,3209,0.040199,0.123715,0.959177,0.518542,0.068869,0.0,32.989716
755746,3234,0.269944,0.461657,0.682127,0.346939,0.085343,0.0,199.15337
785080,3235,0.051932,0.063988,0.78238,0.428748,0.120556,0.0,43.26306


In [26]:
df_tweets[['created_at','dataset']].sample(10)

Unnamed: 0,created_at,dataset
231062,Sat Jan 10 23:25:03 +0000 2015,genuine_accounts
1518612,Tue Jun 17 12:14:42 +0000 2014,social_spambots_1
1124066,Sat Sep 20 04:42:17 +0000 2014,social_spambots_3
697241,Sat Oct 04 23:05:11 +0000 2014,social_spambots_3
1928139,Tue Feb 10 17:14:00 +0000 2015,genuine_accounts
171936,Tue Apr 29 22:07:40 +0000 2014,social_spambots_2
269147,Wed Jul 30 06:53:24 +0000 2014,genuine_accounts
116709,2009-11-06 17:31:40,traditional_spambots_1
76221,2009-09-30 17:16:21,traditional_spambots_1
1025269,Mon Oct 20 23:36:50 +0000 2014,social_spambots_3
