# Técnicas de Machine Learning para Classificação Supervisionada de Contas Bots do Twitter.

## FIA LABDATA - Turma 13

### Descrição dos Data Sets utilizados: 

Contas classificadas como reais e bots, conforme descrito no paper: [The Paradigm-Shift of Social Spambots: Evidence, Theories, and Tools for the Arms Race](http://dl.acm.org/citation.cfm?doid=3041021.3055135)

fonte: http://mib.projects.iit.cnr.it/dataset.html


In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime

# Importando os Data Sets

In [3]:
# Load the tweets of the genuine accounts and tag every row with its class
# label ('human') and the name of the dataset it came from.
df_genuine_accounts = pd.read_csv(
    'data/cresci-2017/genuine_accounts.csv/tweets.csv'
).assign(classification='human', dataset='genuine_accounts')

In [4]:
# Load the tweets of the social_spambots_1 group and tag every row with its
# class label ('bot') and the name of the dataset it came from.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning tail recorded under this cell looks like pandas'
# mixed-type DtypeWarning -- confirm it disappears with this option.
df_social_spambots_1 = pd.read_csv(
    'data/cresci-2017/social_spambots_1.csv/tweets.csv',
    low_memory=False,
)
df_social_spambots_1['classification'] = 'bot'
df_social_spambots_1['dataset'] = 'social_spambots_1'

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Load the tweets of the social_spambots_2 group and tag every row with its
# class label ('bot') and the name of the dataset it came from.
df_social_spambots_2 = pd.read_csv(
    'data/cresci-2017/social_spambots_2.csv/tweets.csv'
).assign(classification='bot', dataset='social_spambots_2')

In [6]:
# Load the tweets of the social_spambots_3 group and tag every row with its
# class label ('bot') and the name of the dataset it came from.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning tail recorded under this cell looks like pandas'
# mixed-type DtypeWarning -- confirm it disappears with this option.
df_social_spambots_3 = pd.read_csv(
    'data/cresci-2017/social_spambots_3.csv/tweets.csv',
    low_memory=False,
)
df_social_spambots_3['classification'] = 'bot'
df_social_spambots_3['dataset'] = 'social_spambots_3'

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Load the tweets of the traditional_spambots_1 group and tag every row with
# its class label ('bot') and the name of the dataset it came from.
df_traditional_spambots_1 = pd.read_csv('data/cresci-2017/traditional_spambots_1.csv/tweets.csv')
df_traditional_spambots_1['classification'] = 'bot'
df_traditional_spambots_1['dataset'] = 'traditional_spambots_1'

# In this dataset `created_at` is a timestamp stored as a string whose last
# character is "L" (the old Python 2 marker for long integers).

# Keep only the first 13 characters, which drops the trailing "L".
df_traditional_spambots_1['created_at'] = df_traditional_spambots_1['created_at'].str[0:13]

# Convert the millisecond timestamp to a datetime. The file imports the class
# itself (`from datetime import datetime`), so the method is reached as
# `datetime.fromtimestamp`; `datetime.datetime.fromtimestamp` would raise
# AttributeError.
# NOTE(review): fromtimestamp() without a tz argument yields naive *local*
# time, so the result depends on the machine's timezone -- confirm that this
# matches how the other datasets' `created_at` values are interpreted.
df_traditional_spambots_1['created_at'] = df_traditional_spambots_1['created_at'].apply(
    lambda x: datetime.fromtimestamp(int(x) / 1000)
)

In [8]:
# Load the tweets of the fake_followers group and tag every row with its
# class label ('bot') and the name of the dataset it came from.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning tail recorded under this cell looks like pandas'
# mixed-type DtypeWarning -- confirm it disappears with this option.
df_fake_followers = pd.read_csv(
    'data/cresci-2017/fake_followers.csv/tweets.csv',
    low_memory=False,
)
df_fake_followers['classification'] = 'bot'
df_fake_followers['dataset'] = 'fake_followers'


  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
# Add a `date` column (created_at truncated to year-month-day) to every frame.
df_list = [
    df_social_spambots_1,
    df_social_spambots_2,
    df_social_spambots_3,
    df_traditional_spambots_1,
    df_fake_followers,
    df_genuine_accounts
]

for dataframe in df_list:

    # Parse created_at into pandas datetimes.
    # NOTE(review): the `.dt` accessor below assumes this yields a datetime64
    # column (i.e. no mixed UTC offsets inside one file) -- confirm.
    dataframe['created_at'] = pd.to_datetime(dataframe['created_at'])

    # Vectorized truncation to midnight; replaces a per-row apply() that built
    # one datetime(year, month, day) object for each of millions of tweets.
    dataframe['date'] = dataframe['created_at'].dt.normalize()

    # A datetime(year, month, day) value carries no timezone, so drop the
    # timezone (keeping the wall-clock date) when the parsed column has one.
    if dataframe['date'].dt.tz is not None:
        dataframe['date'] = dataframe['date'].dt.tz_localize(None)

In [21]:
# Print the (rows, columns) shape of each dataset, one per line.
print(
    df_social_spambots_1.shape,
    df_social_spambots_2.shape,
    df_social_spambots_3.shape,
    df_traditional_spambots_1.shape,
    df_fake_followers.shape,
    df_genuine_accounts.shape,
    sep='\n',
)

(1610034, 28)
(428542, 28)
(1418557, 28)
(145094, 28)
(196027, 26)
(2839361, 28)


# Concatenando os Datasets

In [22]:
# Stack all datasets into a single frame of tweets.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source frame keeps its own row labels, so the combined frame would contain
# duplicate index labels and any label-based lookup or alignment on it would
# be ambiguous.
df_tweets = pd.concat(
    [
        df_social_spambots_1,
        df_social_spambots_2,
        df_social_spambots_3,
        df_traditional_spambots_1,
        df_fake_followers,
        df_genuine_accounts,
    ],
    ignore_index=True,
)

In [23]:
# Display the column labels and the (rows, columns) shape of the combined frame.
df_tweets.columns, df_tweets.shape

(Index(['id', 'text', 'source', 'user_id', 'truncated', 'in_reply_to_status_id',
        'in_reply_to_user_id', 'in_reply_to_screen_name', 'retweeted_status_id',
        'geo', 'place', 'contributors', 'retweet_count', 'reply_count',
        'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive',
        'num_hashtags', 'num_urls', 'num_mentions', 'created_at', 'timestamp',
        'crawled_at', 'updated', 'classification', 'dataset', 'date'],
       dtype='object'),
 (6637615, 28))

In [24]:
# Remove the features that Twitter lists as deprecated.

deprecated_features = [
    'geo',
    'contributors',
    'favorited',
    'retweeted',
    'crawled_at',
    'updated',
]

df_tweets = df_tweets.drop(columns=deprecated_features)

In [25]:
# Count the missing values in each column.
df_tweets.isna().sum()

id                               0
text                         13006
source                          72
user_id                          0
truncated                  6636862
in_reply_to_status_id            0
in_reply_to_user_id              0
in_reply_to_screen_name    5598481
retweeted_status_id         196027
place                      6508964
retweet_count                    0
reply_count                      0
favorite_count                   0
possibly_sensitive         6610803
num_hashtags                     0
num_urls                         0
num_mentions                     0
created_at                       0
timestamp                        0
classification                   0
dataset                          0
date                             0
dtype: int64

# Tratamento das variáveis

In [26]:
# According to the Twitter documentation `truncated` is a boolean (True/False).
# In these datasets it only appears as 1 or NaN, so NaN is replaced by 0.
df_tweets['truncated'] = df_tweets['truncated'].fillna(value=0)

# Show how many missing values remain in the column.
df_tweets['truncated'].isna().sum()

0

In [27]:
# According to the Twitter documentation `possibly_sensitive` is a boolean
# (True/False). In these datasets it only appears as 1 or NaN, so NaN is
# replaced by 0.
df_tweets['possibly_sensitive'] = df_tweets['possibly_sensitive'].fillna(value=0)

# Show how many missing values remain in the column.
df_tweets['possibly_sensitive'].isna().sum()

0

In [28]:
# Inspect the tweets whose text is missing: select them with an explicit null
# mask (clearer than relying on NaN being unequal to itself inside a query
# string) and count them per source dataset.
df_temp = df_tweets[df_tweets['text'].isnull()]
df_temp.groupby('dataset').agg(cont = ('id','count'))

Unnamed: 0_level_0,cont
dataset,Unnamed: 1_level_1
fake_followers,20
genuine_accounts,12571
social_spambots_1,90
social_spambots_2,321
social_spambots_3,4


In [29]:
# Replace missing tweet text with the empty string.
# Only the `text` column is filled; this avoids copying and filling the whole
# frame and then assigning a one-column DataFrame back into a column.
df_tweets['text'] = df_tweets['text'].fillna('')

# Show how many missing values remain in the column.
df_tweets['text'].isnull().sum()

0

# Criando novas variáveis

In [30]:
# A non-null `in_reply_to_screen_name` means the tweet is a reply to that
# screen name. `is_reply` flags such tweets: 1 for a reply, 0 otherwise.
df_tweets['is_reply'] = np.where(
    df_tweets['in_reply_to_screen_name'].notnull(), 1, 0
)

In [31]:
# Every retweet has a text that starts with "RT".
# `is_retweet` is 1 when the first two characters of the text are "RT" and 0
# otherwise.
df_tweets['is_retweet'] = np.where(
    df_tweets['text'].str.slice(0, 2).eq('RT'), 1, 0
)

# Variáveis desprezadas

In [32]:
# Columns that are discarded from the tweets frame at this point.
drop_features = [
    'in_reply_to_screen_name',
    'in_reply_to_status_id',
    'place',
    'retweeted_status_id',
    'source',
    'timestamp',
    'in_reply_to_user_id',
]

# Remove them from the existing frame in place.
df_tweets.drop(columns=drop_features, inplace=True)

# Extraindo métricas dos usuários através de seus tweets 
#### Coletando métricas relacionadas aos usuários para que possam ser agregados ao dataset de Accounts

In [34]:
# One row per user_id: the number of tweets plus the per-tweet means of the
# hashtag, URL, mention, reply and retweet columns.
df_user_metrics = df_tweets.groupby('user_id').agg(
    tweet_count=pd.NamedAgg(column='id', aggfunc='count'),
    mean_hashtag_usage=pd.NamedAgg(column='num_hashtags', aggfunc='mean'),
    mean_url_usage=pd.NamedAgg(column='num_urls', aggfunc='mean'),
    mean_mention_usage=pd.NamedAgg(column='num_mentions', aggfunc='mean'),
    mean_replies_made=pd.NamedAgg(column='is_reply', aggfunc='mean'),
    mean_retweets_made=pd.NamedAgg(column='is_retweet', aggfunc='mean'),
    mean_reply_received=pd.NamedAgg(column='reply_count', aggfunc='mean'),
    mean_retweet_received=pd.NamedAgg(column='retweet_count', aggfunc='mean'),
)

In [51]:
# Average number of tweets per day for each user.

# Count the tweets in every (user_id, date) pair, then average those daily
# counts per user. Only dates on which the user tweeted enter the average.
df_user_metrics['avg_tweets_per_day'] = (
    df_tweets
    .groupby(['user_id', 'date'])
    .size()
    .groupby(level='user_id')
    .mean()
)


In [129]:
# Average number of words per tweet for each user.

# Split every tweet on whitespace and count the resulting tokens.
df_tweets['word_split'] = df_tweets['text'].str.split()
df_tweets['word_count'] = df_tweets['word_split'].str.len()

# Mean token count over each user's tweets.
df_user_metrics['avg_word_count'] = (
    df_tweets.groupby('user_id')['word_count'].mean()
)

In [140]:
# Display the (rows, columns) shape of the per-user metrics table.
df_user_metrics.shape

(10197, 10)

# Exportando as métricas para CSV

In [141]:
# Write the per-user metrics to CSV, including the index as a column.
# NOTE(review): this is an absolute, machine-specific Windows path, while the
# inputs are read from relative 'data/...' paths -- consider a relative output
# path so the notebook runs on other machines.
df_user_metrics.to_csv(r'C:\git_repositories\tcc\accounts_tweet_metrics.csv', index=True)