In [None]:
import pandas as pd
import numpy as np
from json import loads
from datetime import datetime
from unicodedata import normalize
from string import punctuation

In [None]:
def clean_txt(txt):
    """Return ``txt`` folded to lowercase ASCII with outer whitespace removed.

    The text is NFKD-decomposed so that accented letters split into a base
    letter plus combining marks; every code point that cannot be encoded as
    ASCII is then dropped. For example ``'  São Paulo '`` becomes
    ``'sao paulo'``.

    Parameters
    ----------
    txt : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text (possibly empty).
    """
    decomposed = normalize('NFKD', txt)
    # 'ignore' silently discards anything that has no ASCII representation.
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    return ascii_text.lower().strip()

def find_estado(x, cep):
    """Look up the state abbreviation that corresponds to a place name.

    Parameters
    ----------
    x : str
        Free-text place fragment; it is normalised with ``clean_txt`` before
        the lookup.
    cep : pandas.DataFrame
        Lookup table with a ``sigla`` column. A row matches when any of its
        cells is equal to the normalised text.

    Returns
    -------
    str
        ``'Não Informado'`` when ``x`` is empty after normalisation, the
        upper-cased ``sigla`` of the first matching row when there is one,
        and ``'Estrangeiro'`` when no cell matches.
    """
    x = clean_txt(x)
    if not x:
        return 'Não Informado'

    # np.where on the 2-D cell grid returns (row_indices, col_indices);
    # only the rows matter here.
    matching_rows = np.where(cep.values == x)[0]
    if len(matching_rows) == 0:
        # An explicit "no match" check replaces the former bare `except:`,
        # which also turned unrelated errors into 'Estrangeiro'.
        return 'Estrangeiro'
    return cep.iloc[matching_rows[0]].sigla.upper()

def clean_location(x, cep, sigla):
    """Classify a free-text profile location as a Brazilian state or a fallback label.

    Parameters
    ----------
    x : str
        Raw location text as typed by the user.
    cep : pandas.DataFrame
        Lookup table handed to ``find_estado``.
    sigla : collection of str
        State abbreviations looked for as stand-alone, space-separated words.

    Returns
    -------
    str
        * the upper-cased abbreviation when a word of ``x`` is in ``sigla``
          or a fragment is resolved by ``find_estado``;
        * ``'Não Informado'`` when the location is missing or contains
          nothing but whitespace/punctuation;
        * ``'Brasileiro sem Estado'`` when no state is recognised but a
          fragment is exactly 'brasil' or 'brazil';
        * ``'Estrangeiro'`` otherwise.
    """
    if not isinstance(x, str):
        # Missing location (None / NaN) carries no information.
        return 'Não Informado'

    # Scan the words in the order they were typed so the outcome does not
    # depend on the iteration order of `sigla` when several words match.
    # NOTE(review): words are compared exactly as typed (no case folding) —
    # confirm that this is intended given how `sigla` is built.
    for word in x.split(' '):
        if word in sigla:
            return word.upper()

    # Treat every punctuation mark as a fragment separator.
    for p in punctuation:
        x = x.replace(p, '|')

    # Drop empty fragments (trailing or repeated punctuation). They used to be
    # looked up as well and their 'Não Informado' result overrode a state
    # that had already been recognised.
    fragments = [frag.strip() for frag in clean_txt(x).split('|')]
    fragments = [frag for frag in fragments if frag]
    if not fragments:
        return 'Não Informado'
    if fragments == ['brasil'] or fragments == ['brazil']:
        return 'Brasileiro sem Estado'

    estado = 'Estrangeiro'
    for frag in fragments:
        found = find_estado(frag, cep)  # one lookup per fragment
        if found != 'Estrangeiro':
            estado = found  # the last recognised fragment wins

    # The country is named but no state could be resolved.
    if estado == 'Estrangeiro' and any(frag in ('brasil', 'brazil') for frag in fragments):
        return 'Brasileiro sem Estado'
    return estado

In [None]:
# Read the raw tweet dump. The context manager closes the file handle as soon
# as the payload has been parsed instead of leaving it open.
with open('twitter.json', encoding="utf8") as tweets_file:
    a = loads(tweets_file.read())

# Peek at the first record to see which fields are available.
a[0]

In [None]:
# Tweet fields kept from the raw payload.
columns = [
    'created_at',
    'full_text',
    'entities',
    'user',
    'retweet_count',
    'favorite_count',
]

# Hashtag texts searched for, in this order, when labelling a tweet's candidate.
candidates = [
    'fraudenasurnaseletronicas',
    'LulaNoPrimeiroTurno',
    'viraviraciro',
    '17neles',
    'elenao',
]

In [None]:
# Number of records loaded into `a`.
len(a)

In [None]:
# Build a DataFrame from the loaded records.
df = pd.DataFrame(a)
# Preview the first rows.
df.head()

In [None]:
# Keep only the fields of interest. The explicit copy makes `df` an independent
# frame, so the column assignments done later do not raise
# SettingWithCopyWarning.
df = df[columns].copy()
df.info()

In [None]:
# Lookup table of states / abbreviations / cities: pipe-delimited, no header row.
# Every field is read as text and empty cells stay '' (not NaN) so that
# clean_txt can be applied to each of them.
cep = pd.read_csv(
    'estados+cidades.csv',
    delimiter='|',
    header=None,
    names=['estado', 'sigla', 'cidade'],
    dtype=str,
    keep_default_na=False,
)
# Normalise every cell. Mapping column by column avoids DataFrame.applymap,
# which is deprecated since pandas 2.1, and works on older versions too.
cep = cep.apply(lambda col: col.map(clean_txt))
cep.head()

In [None]:
# Distinct state abbreviations found in the lookup table.
sigla = set(cep['sigla'].tolist())

In [None]:
# NOTE: this cell overwrites `user` and deletes `created_at`, so it can only be
# run once per freshly built `df`.

# Free-text location from the user's profile, and the state inferred from it.
df['location'] = df.user.apply(lambda x: x['location'])
df['estado'] = df.location.apply(lambda x: clean_location(x, cep, sigla))
# Replace the nested user object by the user's name.
df['user'] = df.user.apply(lambda x: x['name'])
# Flatten the hashtag entities into one ';'-separated string.
df['entities'] = df.entities.apply(lambda x: ';'.join([i['text'] for i in x['hashtags']]))
# Label each tweet with the first candidate whose tag occurs in its hashtags,
# or 'branco' when none does. Both sides go through clean_txt: the hashtags are
# lower-cased by it, so a mixed-case entry such as 'LulaNoPrimeiroTurno' could
# never match when compared verbatim.
df['candidate'] = df.entities.apply(
    lambda x: next((i for i in candidates if clean_txt(i) in clean_txt(x)), 'branco')
)
# Re-format the creation timestamp as 'YYYY-MM-DD HH:MM:SS'.
df['date'] = df.created_at.apply(lambda x : datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y').strftime('%Y-%m-%d %H:%M:%S'))
del df['created_at']
df.head()

In [None]:
# Inspect the candidate label assigned to each row.
df['candidate']

In [None]:
# Preview the last rows.
df.tail()

In [None]:
# Count of non-null `user` values per candidate label.
df.groupby("candidate")['user'].count()

In [None]:
# Count of non-null `user` values per inferred state label.
df.groupby("estado")['user'].count()

In [None]:
# 20 most frequent normalised location strings among the rows labelled
# 'Estrangeiro'.
(
    df.loc[df.estado == 'Estrangeiro', 'location']
    .map(clean_txt)
    .value_counts()
    .head(20)
)

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Column order of the final dataset. This rebinds `columns`, which earlier held
# the raw field selection.
columns = [
    'date',
    'user',
    'candidate',
    'estado',
    'location',
    'retweet_count',
    'favorite_count',
    'full_text',
]

In [None]:
# Inspect the candidate labels again before changing the column dtype.
df['candidate']

In [None]:
# Parse the formatted timestamps back into datetimes. pd.to_datetime is used
# because astype('datetime64') without a unit raises TypeError on pandas >= 2.0.
df.date = pd.to_datetime(df.date, format='%Y-%m-%d %H:%M:%S')
# Store the labels as a categorical and register extra categories.
# NOTE(review): the added categories are only registered; no existing value is
# renamed to them — confirm whether rename_categories was intended.
df.candidate = df.candidate.astype('category').cat.add_categories(['#17Neles', '#EleNão', '#FicaTemer', '#FraudeNasUrnasEletrônicas','#LulaNoPrimeiroTurno'])

In [None]:
# Check the resulting dtypes and non-null counts.
df.info()

In [None]:
# Final column order, rows sorted by date, index renumbered from 0.
df = (
    df[columns]
    .sort_values(by='date')
    .reset_index(drop=True)
)
df.head()

In [None]:
# Write the dataset as zip-compressed JSON.
df.to_json('dataset.json.zip', compression='zip')

In [None]:
# Full dataset as ';'-separated CSV. It gets its own file name: the reduced
# export further down writes 'dataset2.csv', and writing this one to the same
# path meant it was overwritten straight away.
df.to_csv('dataset.csv', sep=';', index=False, encoding='utf-8')

In [None]:
# Reduced export without the tweet text and the user name; the row index is
# written under the header "twitter".
# Semicolons inside the free-text location are replaced by commas so they
# cannot be confused with the ';' field separator. DataFrame.replace(';', ',')
# only replaces cells that are exactly ';', so a literal substring replacement
# on the text column is used instead.
(
    df.drop(['full_text', 'user'], axis=1)
    .assign(location=lambda d: d['location'].str.replace(';', ',', regex=False))
    .to_csv('dataset2.csv', sep=';', index_label="twitter", encoding='utf-8')
)