# Técnicas de Machine Learning para Classificação Supervisionada de Contas Bots do Twitter.

## FIA LABDATA - Turma 13

### Descrição dos Data Sets utilizados: 

Contas classificadas como reais e bots, conforme descrito no paper: [The Paradigm-Shift of Social Spambots: Evidence, Theories, and Tools for the Arms Race](http://dl.acm.org/citation.cfm?doid=3041021.3055135)

Fonte: http://mib.projects.iit.cnr.it/dataset.html

***

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime

# Importando os Data Sets

`Genuine Data Set` - Genuine verified accounts that are human-operated

In [3]:
# Load the genuine-accounts user table and tag every row with its class label
# and the name of the data set it came from.
df_genuine = (
    pd.read_csv('data/cresci-2017/genuine_accounts.csv/users.csv')
      .assign(classification='human', dataset='genuine')
)

# Disabled step, kept for reference: strips the timezone from the creation date.
#df_genuine['created_at'] = pd.to_datetime(df_genuine['created_at']).dt.tz_localize(None)


`social spambots #1` - Retweeters of an Italian political candidate

In [4]:
# Display the (rows, columns) shape of the genuine-accounts frame as a sanity check.
df_genuine.shape

(3474, 44)

In [5]:
# Load the "social spambots #1" user table; every row is labelled as a bot
# and tagged with its source data set.
df_social_bot_1 = (
    pd.read_csv('data/cresci-2017/social_spambots_1.csv/users.csv')
      .assign(classification='bot', dataset='social_spambots_1')
)

`social spambots #2` - Spammers of paid apps for mobile devices

In [6]:
# Load the "social spambots #2" user table; every row is labelled as a bot
# and tagged with its source data set.
df_social_bot_2 = (
    pd.read_csv('data/cresci-2017/social_spambots_2.csv/users.csv')
      .assign(classification='bot', dataset='social_spambots_2')
)

`social spambots #3` - Spammers of products on sale at Amazon.com

In [7]:
# Load the "social spambots #3" user table; every row is labelled as a bot
# and tagged with its source data set.
df_social_bot_3 = (
    pd.read_csv('data/cresci-2017/social_spambots_3.csv/users.csv')
      .assign(classification='bot', dataset='social_spambots_3')
)

`traditional spambots #1` - Spammers

In [8]:
# Load the "traditional spambots #1" user table; every row is labelled as a bot
# and tagged with its source data set.
df_traditional_spambots_1 = (
    pd.read_csv('data/cresci-2017/traditional_spambots_1.csv/users.csv')
      .assign(classification='bot', dataset='traditional_spambots_1')
)

`traditional spambots #2` - Spammers of scam URLs

In [9]:
# Load the "traditional spambots #2" user table; every row is labelled as a bot
# and tagged with its source data set.
df_traditional_spambots_2 = (
    pd.read_csv('data/cresci-2017/traditional_spambots_2.csv/users.csv')
      .assign(classification='bot', dataset='traditional_spambots_2')
)

`traditional spambots #3` - automated accounts spamming job offers

In [10]:
# Load the "traditional spambots #3" user table; every row is labelled as a bot
# and tagged with its source data set.
df_traditional_spambots_3 = (
    pd.read_csv('data/cresci-2017/traditional_spambots_3.csv/users.csv')
      .assign(classification='bot', dataset='traditional_spambots_3')
)

`traditional spambots #4` - Another group of automated accounts spamming job offers

In [11]:
# Load the "traditional spambots #4" user table; every row is labelled as a bot
# and tagged with its source data set.
df_traditional_spambots_4 = (
    pd.read_csv('data/cresci-2017/traditional_spambots_4.csv/users.csv')
      .assign(classification='bot', dataset='traditional_spambots_4')
)

`fake followers` - Simple accounts that inflate the number of followers of another account

In [12]:
# Load the "fake followers" user table; every row is labelled as a bot
# and tagged with its source data set.
df_fake_followers = (
    pd.read_csv('data/cresci-2017/fake_followers.csv/users.csv')
      .assign(classification='bot', dataset='fake_followers')
)

# Concatenando os Data Sets

In [15]:
# Stack the genuine frame and all bot frames into one labelled data set.
#
# ignore_index=True gives the result a fresh 0..n-1 index. Each source frame is
# read with the default 0-based index, so without it the combined frame carries
# repeated index labels, which makes label-based selection and any later
# index alignment ambiguous.
df_twitter_accounts = pd.concat(
    [
        df_genuine,
        df_social_bot_1,
        df_social_bot_2,
        df_social_bot_3,
        df_traditional_spambots_1,
        df_traditional_spambots_2,
        df_traditional_spambots_3,
        df_traditional_spambots_4,
        df_fake_followers,
    ],
    ignore_index=True,
)
# Display the (rows, columns) shape of the combined frame.
df_twitter_accounts.shape

(14368, 44)

In [16]:
# Display the column labels together with the (rows, columns) shape of the combined frame.
df_twitter_accounts.columns, df_twitter_accounts.shape

(Index(['id', 'name', 'screen_name', 'statuses_count', 'followers_count',
        'friends_count', 'favourites_count', 'listed_count', 'url', 'lang',
        'time_zone', 'location', 'default_profile', 'default_profile_image',
        'geo_enabled', 'profile_image_url', 'profile_banner_url',
        'profile_use_background_image', 'profile_background_image_url_https',
        'profile_text_color', 'profile_image_url_https',
        'profile_sidebar_border_color', 'profile_background_tile',
        'profile_sidebar_fill_color', 'profile_background_image_url',
        'profile_background_color', 'profile_link_color', 'utc_offset',
        'is_translator', 'follow_request_sent', 'protected', 'verified',
        'notifications', 'description', 'contributors_enabled', 'following',
        'created_at', 'timestamp', 'crawled_at', 'updated', 'test_set_1',
        'test_set_2', 'classification', 'dataset'],
       dtype='object'),
 (14368, 44))

## Correção de Valores das Variáveis

In [17]:
# Count the missing values in each column.
df_twitter_accounts.isna().sum()

id                                        0
name                                      1
screen_name                               0
statuses_count                            0
followers_count                           0
friends_count                             0
favourites_count                          0
listed_count                              0
url                                   10759
lang                                   1000
time_zone                              9018
location                               6684
default_profile                        9857
default_profile_image                 14290
geo_enabled                           10935
profile_image_url                         0
profile_banner_url                     9482
profile_use_background_image           1523
profile_background_image_url_https     1000
profile_text_color                     1000
profile_image_url_https                1000
profile_sidebar_border_color           1000
profile_background_tile         

In [18]:
# Replace missing geo_enabled values with 0.
# NOTE(review): the original note on this cell was copied from the `verified` cell;
# presumably the same reasoning applies here (flag stored as 1 or NaN) - confirm.
geo_enabled_filled = df_twitter_accounts['geo_enabled'].fillna(0)
df_twitter_accounts['geo_enabled'] = geo_enabled_filled
# Number of missing values left in the column.
df_twitter_accounts['geo_enabled'].isna().sum()

0

In [19]:
# Per the original note, Twitter documents `verified` as True/False, while these
# data sets store 1 or NaN; NaN is therefore replaced with 0.
verified_filled = df_twitter_accounts['verified'].fillna(0)
df_twitter_accounts['verified'] = verified_filled
# Number of missing values left in the column.
df_twitter_accounts['verified'].isna().sum()

0

In [20]:
# Per the original note, Twitter documents `default_profile` as True/False, while
# these data sets store 1 or NaN; NaN is therefore replaced with 0.
default_profile_filled = df_twitter_accounts['default_profile'].fillna(0)
df_twitter_accounts['default_profile'] = default_profile_filled
# Number of missing values left in the column.
df_twitter_accounts['default_profile'].isna().sum()

0

In [21]:
# Per the original note, Twitter documents `default_profile_image` as True/False,
# while these data sets store 1 or NaN; NaN is therefore replaced with 0.
default_profile_image_filled = df_twitter_accounts['default_profile_image'].fillna(0)
df_twitter_accounts['default_profile_image'] = default_profile_image_filled
# Number of missing values left in the column.
df_twitter_accounts['default_profile_image'].isna().sum()

0

In [22]:
# Per the original note, Twitter documents `protected` as True/False, while these
# data sets store 1 or NaN; NaN is therefore replaced with 0.
protected_filled = df_twitter_accounts['protected'].fillna(0)
df_twitter_accounts['protected'] = protected_filled
# Number of missing values left in the column.
df_twitter_accounts['protected'].isna().sum()

0

In [23]:
# Rows without a `lang` value get the explicit placeholder 'missing'
# (unlike the flag columns above, this column is not filled with 0).
lang_filled = df_twitter_accounts['lang'].fillna('missing')
df_twitter_accounts['lang'] = lang_filled
# Number of missing values left in the column.
df_twitter_accounts['lang'].isna().sum()

0

In [24]:
# One account has no `name`; use its screen_name (the unique Twitter handle) instead.
# To inspect the affected rows first: df_twitter_accounts.query('name != name')
name_is_missing = df_twitter_accounts['name'].isna()
df_twitter_accounts['name'] = np.where(
    name_is_missing,
    df_twitter_accounts['screen_name'],
    df_twitter_accounts['name'],
)

# Gerando novas variáveis

In [27]:
# Derive name-based features and "field was left empty" indicators.
# Every value is computed from the frame as it stands before this cell and added
# in a single assign call; the keyword order fixes the order of the new columns.
screen_name_col = df_twitter_accounts['screen_name']
name_col = df_twitter_accounts['name']

df_twitter_accounts = df_twitter_accounts.assign(
    # total number of characters in the screen name (unique handle)
    screen_name_total_len=screen_name_col.str.len(),
    # number of digit characters in the screen name
    screen_name_num_len=screen_name_col.str.count('[0-9]'),
    # total number of characters in the display name
    name_total_len=name_col.str.len(),
    # number of digit characters in the display name
    name_num_len=name_col.str.count('[0-9]'),
    # True when the url field is missing, False otherwise
    is_url_null=df_twitter_accounts['url'].isna().values,
    # True when the location field is missing
    is_location_null=df_twitter_accounts['location'].isna().values,
    # True when profile_banner_url is missing
    profile_banner_url_null=df_twitter_accounts['profile_banner_url'].isna().values,
    # True when the profile image is missing (read from profile_image_url_https)
    profile_image_url_null=df_twitter_accounts['profile_image_url_https'].isna().values,
    # True when the profile description is missing
    description_null=df_twitter_accounts['description'].isna().values,
)

In [28]:
# Cast the categorical flag columns, which were stored as floats, to boolean dtype.
boll_vars = ['default_profile','default_profile_image','geo_enabled','protected','verified']
df_twitter_accounts = df_twitter_accounts.astype({flag: bool for flag in boll_vars})

# Drop das variaveis de alta cardinalidade

In [29]:
# High-cardinality columns to remove from the frame.
alta_card = [
    'url',
    'location',
    'profile_banner_url',
    'profile_image_url_https',
    'description',
    'created_at',
]
df_twitter_accounts = df_twitter_accounts.drop(columns=alta_card)

# Dropando colunas marcadas como deprecated pelo Twitter 

In [30]:
# Columns this notebook treats as deprecated by Twitter, followed by data set
# bookkeeping columns that are not part of the Twitter user object.
deprecated_features_list = [
    'utc_offset',
    'time_zone',
    'contributors_enabled',
    'is_translator',
    'profile_background_color',
    'profile_background_image_url',
    'profile_background_image_url_https',
    'profile_background_tile',
    'profile_image_url',
    'profile_link_color',
    'profile_sidebar_border_color',
    'profile_sidebar_fill_color',
    'profile_text_color',
    'profile_use_background_image',
    'following',
    'follow_request_sent',
    'notifications',
    # data set columns not related to the Twitter user object
    'timestamp',
    'crawled_at',
    'updated',
    'test_set_1',
    'test_set_2',
]

# Remove all of the listed columns in one call.
df_twitter_accounts = df_twitter_accounts.drop(columns=deprecated_features_list)

In [36]:
# Display the (rows, columns) shape after the column drops.
df_twitter_accounts.shape

(14368, 25)

In [37]:
# Count the missing values in each remaining column.
df_twitter_accounts.isna().sum()

id                         0
name                       0
screen_name                0
statuses_count             0
followers_count            0
friends_count              0
favourites_count           0
listed_count               0
lang                       0
default_profile            0
default_profile_image      0
geo_enabled                0
protected                  0
verified                   0
classification             0
dataset                    0
screen_name_total_len      0
screen_name_num_len        0
name_total_len             0
name_num_len               0
is_url_null                0
is_location_null           0
profile_banner_url_null    0
profile_image_url_null     0
description_null           0
dtype: int64

In [38]:
# Display the dtype of every remaining column.
df_twitter_accounts.dtypes

id                          int64
name                       object
screen_name                object
statuses_count              int64
followers_count             int64
friends_count               int64
favourites_count            int64
listed_count                int64
lang                       object
default_profile              bool
default_profile_image        bool
geo_enabled                  bool
protected                    bool
verified                     bool
classification             object
dataset                    object
screen_name_total_len       int64
screen_name_num_len         int64
name_total_len              int64
name_num_len                int64
is_url_null                  bool
is_location_null             bool
profile_banner_url_null      bool
profile_image_url_null       bool
description_null             bool
dtype: object

# Exportando a base tratada para CSV

In [112]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the destination is an absolute, machine-specific Windows path;
# consider moving it to a configurable location.
output_csv_path = r'C:\git_repositories\tcc\classified_twitter_accounts.csv'
df_twitter_accounts.to_csv(output_csv_path, index=False)