In [None]:

import json
import nltk
import numpy as np
import pandas as pd
import pymongo
import time

# Conexão com o MongoDB

In [None]:
# Placeholder names of the MongoDB database and collection to read from;
# replace them with your own values.
DATABASE_NAME = 'SUA_DATABASE_AQUI'
COLLECTION_NAME = 'SUA_COLLECTION_AQUI'

# No connection arguments are passed, so pymongo's defaults apply.
client = pymongo.MongoClient()
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

In [None]:
def _null_if_missing(field_path):
    """Build an ``$ifNull`` expression that yields ``None`` when the field has no value."""
    return {'$ifNull': [field_path, None]}


# Stage 1: flatten each stored tweet into the fields used downstream.
_extract_fields = {
    'tweet_id': '$id_str',
    # Parse the creation timestamp string into a date value.
    'date': {'$dateFromString': {'dateString': '$created_at'}},
    # Prefer the retweeted status text, falling back to the tweet's own text.
    'tweet_text': {'$ifNull': ['$retweeted_status.full_text', '$full_text']},
    'num_likes': '$favorite_count',
    'num_retweets': '$retweet_count',
    'quoted_status_id': _null_if_missing('$quoted_status_id_str'),
    'retweeted_status_id': _null_if_missing('$retweeted_status.id_str'),
    'reply_to_user': _null_if_missing('$in_reply_to_user_id_str'),
    'reply_to_status': _null_if_missing('$in_reply_to_status_id_str'),
    'user_id': '$user.id_str',
    'screen_name': '$user.screen_name',
    'followers': '$user.followers_count',
    'following': '$user.friends_count',
}

# Stage 2: drop _id, keep every stage-1 field as is, and render the parsed date
# as a dd/mm/YYYY HH:MM:SS string.
_date_as_text = {'$dateToString': {'format': '%d/%m/%Y %H:%M:%S', 'date': '$date'}}
_format_fields = {'_id': 0}
for _field_name in _extract_fields:
    _format_fields[_field_name] = _date_as_text if _field_name == 'date' else 1

pipeline = [
    {'$project': _extract_fields},
    {'$project': _format_fields},
]

In [None]:
# Run the aggregation, materialise every projected document in a list and
# report how long it took together with the number of documents returned.
time_start = time.time()
cursor = collection.aggregate(pipeline=pipeline)
document = list(cursor)
time_end = time.time()

elapsed = time.strftime("%H:%M:%S", time.gmtime(time_end - time_start))
print(f'Tempo de execução: {elapsed}')
print(f'{len(document)} documentos selecionados')

In [None]:
# Pretty-print the first projected document; ensure_ascii=False keeps accented
# characters readable instead of emitting \u escapes.
print('Amostra de um documento:\n')
first_document = document[0]
print(json.dumps(first_document, indent=3, sort_keys=False, ensure_ascii=False))

In [None]:
# Column order used for the working DataFrame.
column_order = [
    'tweet_id', 'date', 'tweet_text', 'num_likes', 'num_retweets',
    'retweeted_status_id', 'quoted_status_id', 'reply_to_user', 'reply_to_status',
    'user_id', 'screen_name', 'followers', 'following',
]

# Build the frame from the list of documents, then reorder its columns
# (selecting with a list raises KeyError if an expected column is absent).
df = pd.DataFrame(document)
df = df[column_order]
df.head()

In [None]:
# Show the tweet at row label 0 exactly as stored, before any cleaning.
first_raw_tweet = df['tweet_text'][0]
print(f'Amostra de um tweet antes do pré-processamento:\n\n{first_raw_tweet}')

In [None]:
# Persist the tweets before pre-processing. index=False keeps the positional
# row index out of the file (it would otherwise be written as an unnamed first
# column), consistent with the 'tweets_pt.csv' export at the end of the notebook.
df.to_csv('full_tweets_pt.csv', index=False)

## Pré-processamento

In [None]:
# Remove links: every run of non-whitespace characters starting with "http".
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every URL in place.
df['clean_text'] = df['tweet_text'].str.replace(r"http\S+", "", regex=True)
df.clean_text[0]

In [None]:
# Remove mentions: "@" followed by any run of non-whitespace characters.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every mention in place.
df['clean_text'] = df['clean_text'].str.replace(r"@\S+", "", regex=True)
df.clean_text[0]

In [None]:
# Remove hashtags: "#" followed by any run of non-whitespace characters is
# replaced by a single space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every hashtag in place.
df['clean_text'] = df['clean_text'].str.replace(r"#\S+", " ", regex=True)
df.clean_text[0]

In [None]:
# Replace line breaks with a space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the raw string r"\n" would then be looked up as a
# backslash followed by "n" and no newline would ever be replaced.
df['clean_text'] = df['clean_text'].str.replace(r"\n", " ", regex=True)
df.clean_text[0]

In [None]:
# Remove punctuation: replace with a space every character that is not a word
# character, whitespace or "#", plus the underscore (which \w would keep).
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place.
df['clean_text'] = df['clean_text'].str.replace(r"[^\w\s#]|_", " ", regex=True)
df.clean_text[0]

In [None]:
# Remove standalone numbers: digit runs delimited by word boundaries. Digits
# glued to letters are kept because no word boundary separates them.
# "+" (instead of "*") avoids matching the empty string at every word boundary.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every number in place.
df['clean_text'] = df['clean_text'].str.strip().str.replace(r"\b[0-9]+\b", "", regex=True)
df.clean_text[0]

In [None]:
# Collapse every run of two or more whitespace characters into one space,
# after trimming leading/trailing whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave repeated spaces in place.
df['clean_text'] = df['clean_text'].str.strip().str.replace(r"\s{2,}", " ", regex=True)
df.clean_text[0]

In [None]:
# Normalise case: convert every cleaned tweet to lower case.
lowercased = df['clean_text'].str.lower()
df['clean_text'] = lowercased
df.clean_text[0]

In [None]:
# Collapse any ASCII lowercase letter repeated three or more times in a row
# into a single occurrence (e.g. "kkkk" -> "k").
import string
alphabet = list(string.ascii_lowercase)

for letter in alphabet:
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, so "a{3,}" would never match a run of letters.
    df['clean_text'] = df['clean_text'].str.replace(letter + '{3,}', letter, regex=True)

df.clean_text[0]

In [None]:
# Remove stopwords.
# The encoding is explicit so the result does not depend on the platform
# default (assumes the stopword file is UTF-8 encoded; confirm for your copy).
# A set gives constant-time membership tests for every token of every tweet.
with open('stopwords/portuguese.txt', 'r', encoding='utf-8') as file:
    stopwords = {line.strip() for line in file}


def remover_stopwords(texto):
    """Return ``texto`` without the whitespace-separated tokens found in ``stopwords``.

    Tokens are compared as they are, so the text is expected to be in the same
    case and accent form as the entries of the stopword file.
    """
    palavras = [palavra for palavra in texto.split() if palavra not in stopwords]
    return " ".join(palavras)


df['clean_text'] = df['clean_text'].apply(remover_stopwords)
df.clean_text[0]

In [None]:
# Drop tokens with two characters or fewer.
def remover_len_2(texto):
    """Return ``texto`` keeping only whitespace-separated tokens longer than two characters.

    The surviving tokens are joined back with single spaces.
    """
    return " ".join(token for token in texto.split() if len(token) > 2)

# Apply the short-token filter to every tweet and show the first result.
without_short_tokens = df['clean_text'].apply(remover_len_2)
df['clean_text'] = without_short_tokens
df.clean_text[0]

In [None]:
# Show the original tweet next to its cleaned version at this point.
raw_sample = df['tweet_text'][0]
clean_sample = df['clean_text'][0]
print(f'Amostra de um tweet depois do pré-processamento:\n\n{raw_sample}\n\n{clean_sample}')

In [None]:
def reduzir_radical(texto, stemmer=None):
    """Reduce every whitespace-separated word of ``texto`` to its stem.

    Args:
        texto: Text whose words will be stemmed.
        stemmer: Optional object exposing ``stem(word)``. When omitted, a new
            ``nltk.stem.RSLPStemmer`` is created for this call. Pass a shared
            instance when stemming many texts to avoid rebuilding the stemmer
            on every call.

    Returns:
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        stemmer = nltk.stem.RSLPStemmer()
    return " ".join(stemmer.stem(palavra) for palavra in texto.split())

In [None]:
# Preview stemming on the first cleaned tweet (the DataFrame is not modified).
reduzir_radical(df['clean_text'][0])

In [None]:
# df['tweet_text'] = df['tweet_text'].apply(reduzir_radical)

In [None]:
import unicodedata

def remover_acentuacao(text):
    """Return ``text`` with combining marks (accents, cedillas, tildes) removed."""
    # NFKD decomposes each accented character into its base character followed
    # by combining marks; dropping the marks leaves the unaccented text.
    decomposed = unicodedata.normalize('NFKD', text)
    return "".join(char for char in decomposed if not unicodedata.combining(char))

In [None]:
# Preview accent removal on the first cleaned tweet (the DataFrame is not modified).
remover_acentuacao(df['clean_text'][0])

In [None]:
# Strip accents from every cleaned tweet and show the first result.
unaccented = df['clean_text'].apply(remover_acentuacao)
df['clean_text'] = unaccented
df.clean_text[0]

In [None]:
# Remove every character that is not an ASCII letter, a digit, "#", "|" or
# whitespace.
# The upper-case range is A-Z: a range written as A-z would also span the ASCII
# characters between "Z" and "a" ([, \, ], ^, _ and `) and let them through.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged.
df['clean_text'] = df['clean_text'].str.strip().str.replace(r"[^a-zA-Z0-9#|\s]", "", regex=True)

df.clean_text[0]

In [None]:
# Show the original tweet next to its fully cleaned version.
raw_sample = df['tweet_text'][0]
clean_sample = df['clean_text'][0]
print(f'Amostra de um tweet depois do pré-processamento:\n\n{raw_sample}\n\n{clean_sample}')

# Criação do arquivo .csv

In [None]:
# Keep only the rows whose cleaned text is not the empty string.
has_text = df['clean_text'] != ''
df = df[has_text]

In [None]:
# Display the (rows, columns) dimensions of the DataFrame at this point.
df.shape

In [None]:
# Write the DataFrame to CSV; index=False omits the row index from the file.
df.to_csv('tweets_pt.csv', index=False)

Criação do arquivo .txt (somente com os textos dos tweets):

In [None]:
# Copy the cleaned tweet texts into a NumPy array, one element per tweet.
tweets_text = np.array(df.clean_text)

In [None]:
# Write one cleaned tweet per line; fmt='%s' formats each element as a string.
np.savetxt('tweets_pt.txt', tweets_text, fmt='%s')