# Importando as bibliotecas

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymongo
import re
import nltk

from wordcloud import WordCloud
from bson.json_util import dumps
from nltk import word_tokenize
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

import unicodedata
import csv

from polyglot.downloader import downloader
from polyglot.text import Text

%matplotlib inline 

### Procedimentos para baixar as stopwords. Caso já tenha sido feito, ignorar essa etapa.

In [None]:
# Fetch the NLTK resources this notebook relies on: the stopword lists and
# the data for the RSLP (Portuguese) stemmer.
for recurso in ('stopwords', 'rslp'):
    nltk.download(recurso)

### Para a análise de sentimento

In [None]:
# Fetch the polyglot packages for Portuguese: embeddings, sentiment and
# morphology, in that order.
for pacote in ("embeddings2.pt", "sentiment2.pt", "morph2.pt"):
    downloader.download(pacote)

# Iniciando a conexão com o MongoDB

In [None]:
client = pymongo.MongoClient()

In [None]:
db = client.tweets8march

In [None]:
collection = db.tweets

## Pipeline 1: tweets que não são retweets

In [None]:
# Stage 1: Portuguese tweets with no retweeted_status that are not quotes.
_filtro_p1 = {
    'lang': 'pt',
    'retweeted_status': {'$exists': False},
    'is_quote_status': False,
}

# Stage 2: keep the ids and take the extended full text when it exists,
# falling back to the plain text field otherwise.
_selecao_p1 = {
    '_id': 1,
    'id_tweet': '$id',
    'extended_tweet': {'$ifNull': ['$extended_tweet.full_text', '$text']},
}

# Stage 3: expose the resolved text under the name "text".
_saida_p1 = {
    '_id': 1,
    'id_tweet': 1,
    'text': '$extended_tweet',
}

pipeline1 = [
    {'$match': _filtro_p1},
    {'$project': _selecao_p1},
    {'$project': _saida_p1},
]

## Pipeline 2: tweets que não são retweets, mas que citam outro tweet (quote tweets)

In [None]:
# Stage 1: Portuguese tweets with no retweeted_status that are quotes.
_filtro_p2 = {
    'lang': 'pt',
    'retweeted_status': {'$exists': False},
    'is_quote_status': True,
}

# Stage 2: keep the ids and take the extended full text when it exists,
# falling back to the plain text field otherwise.
_selecao_p2 = {
    '_id': 1,
    'id_tweet': '$id',
    'extended_tweet': {'$ifNull': ['$extended_tweet.full_text', '$text']},
}

# Stage 3: expose the resolved text under the name "text".
_saida_p2 = {
    '_id': 1,
    'id_tweet': 1,
    'text': '$extended_tweet',
}

pipeline2 = [
    {'$match': _filtro_p2},
    {'$project': _selecao_p2},
    {'$project': _saida_p2},
]

## Executando a query

In [None]:
document1 = list(collection.aggregate(pipeline = pipeline1))
document2 = list(collection.aggregate(pipeline = pipeline2))

In [None]:
print(dumps(document1[0:5], indent = 4, sort_keys = True, ensure_ascii = False))

In [None]:
print(dumps(document2[0:5], indent = 4, sort_keys = True, ensure_ascii = False))

In [None]:
documents = document1 + document2

In [None]:
df = pd.DataFrame(documents)

In [None]:
df.head(10)

In [None]:
df.text.count()

In [None]:
df.drop_duplicates(['id_tweet'], inplace = True)

In [None]:
df.text.count()

In [None]:
df['text'][0:10].tolist()

In [None]:
#df['text'].str.replace(r"http\S+","").str.replace(r"@\S+","").str.replace(r"\n"," ").str.replace(r"#\S+"," ").str.strip().str.replace(r"\s+"," ").str.replace(r"[^\w\s]"," ")[0]

In [None]:
# Text normalisation. Every pattern below is a regular expression, so
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn these steps into silent no-ops.

# Remove links.
df['text'] = df['text'].str.replace(r"http\S+", "", regex=True)

# Remove mentions.
df['text'] = df['text'].str.replace(r"@\S+", "", regex=True)

# Replace line breaks with a space.
df['text'] = df['text'].str.replace(r"\n", " ", regex=True)

# Remove hashtags.
df['text'] = df['text'].str.replace(r"#\S+", " ", regex=True)

# Replace punctuation (anything that is neither a word character nor
# whitespace) with a space.
df['text'] = df['text'].str.replace(r"[^\w\s]", " ", regex=True)

# Trim the ends and collapse runs of two or more whitespace characters.
df['text'] = df['text'].str.strip().str.replace(r"\s{2,}", " ", regex=True)

# Convert everything to lowercase.
df['text'] = df['text'].str.lower()

In [None]:
# First ten texts after cleaning.
df['text'].head(10).tolist()

In [None]:
# Function that removes stopwords from our data set.
import functools


@functools.lru_cache(maxsize=1)
def _stopwords_portugues():
    """Load the NLTK Portuguese stopword list once and reuse it afterwards."""
    return frozenset(nltk.corpus.stopwords.words('portuguese'))


def remover_stopwords(texto, stopwords=None):
    """Return *texto* without its stopwords.

    The text is split on whitespace, tokens found in *stopwords* are
    dropped, and the remaining tokens are joined with single spaces.

    Args:
        texto: The string to filter.
        stopwords: Optional collection of tokens to drop. When omitted, the
            NLTK Portuguese stopword list is used; it is loaded only on the
            first call instead of once per row.

    Returns:
        The filtered text ("" when nothing is left).
    """
    if stopwords is None:
        stopwords = _stopwords_portugues()
    return " ".join(
        palavra for palavra in texto.split() if palavra not in stopwords
    )

df['text'] = df['text'].apply(remover_stopwords)

In [None]:
df['text'][0:10].tolist()

# Wordcloud

In [None]:
palavras = ','.join(list(df['text'].values))

In [None]:
# Generate a word cloud image
wordcloud = WordCloud().generate(palavras)

In [None]:
# Display the generated image:
# the matplotlib way:
plt.figure(figsize = (15,15))
plt.axis("off")
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.show()

In [None]:
wordcloud = WordCloud(max_font_size = 40).generate(palavras)
plt.figure(figsize = (15,15))
plt.axis("off")
plt.imshow(wordcloud, interpolation="bilinear")
plt.show()

# Topic Modeling

In [None]:
# Vocabulary cap, number of topics/components, and terms printed per topic.
n_features, n_components, n_top_word = 10000, 10, 8

In [None]:
data_samples = df['text']
n_samples = len(data_samples)

In [None]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df = 0.95, 
                                min_df =2 ,
                                max_features = n_features,
                                stop_words = nltk.corpus.stopwords.words('portuguese'))
tf = tf_vectorizer.fit_transform(data_samples)

print("tf features for LDA extraction is completed!")

In [None]:
# Report the sample count and the configured feature cap before fitting.
_mensagem_lda = ("Fitting LDA models with tf features\n"
                 "n_samples = %d and n_features = %d...")
print(_mensagem_lda % (n_samples, n_features))

# Online LDA with a fixed seed so repeated runs give the same topics.
_opcoes_lda = {
    'n_components': n_components,
    'max_iter': 5,
    'learning_method': 'online',
    'learning_offset': 50.,
    'random_state': 0,
}
lda = LatentDirichletAllocation(**_opcoes_lda)

lda.fit(tf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each component of *model*.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: How many terms to print per topic.

    One line per topic is written to stdout, followed by a blank line.
    """
    for numero, pesos in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading indices.
        indices = pesos.argsort()[::-1][:n_top_words]
        termos = " ".join(feature_names[i] for i in indices)
        print("Topic #%d: " % numero + termos)
    print()

In [None]:
print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement get_feature_names_out() and fall back only on old releases
# that do not provide it.
try:
    tf_feature_names = tf_vectorizer.get_feature_names_out()
except AttributeError:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

In [None]:
print("Fiting LSA model")
lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)
lsa.fit(tf)

In [None]:
print("\nTopics in LSA model:")
print_top_words(lsa, tf_feature_names, n_top_word)

# Análise de sentimentos

In [None]:
# Distinct texts, in order of first appearance, as a plain list.
data = df['text'].unique().tolist()

### Extrair polaridades dos comentários 

In [None]:
# Polarity of each text in `data`, in the same order.
polarity = []

for dat in data:
    # Named `blob` rather than `text` so the loop does not overwrite the
    # `text` module imported from sklearn.feature_extraction.
    blob = Text(dat)
    blob.language = "pt"
    try:
        score = blob.polarity
    except Exception:
        # Best effort: a text that cannot be scored counts as neutral.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working
        # during this potentially long loop.
        score = 0
    polarity.append(score)

### Salvando polaridade de todos os tweets em um novo Dataframe

In [None]:
# Pair every distinct text with its polarity, one row per text.
new_df = dict(text=data, polarity=polarity)
sentiments_df = pd.DataFrame(new_df)

### Categorização dos sentimentos

In [None]:
dataLabel = sentiments_df['polarity'].tolist()

# Bucket every polarity in a single pass:
#   exactly 1 -> positive, exactly -1 -> negative, exactly 0 -> neutral,
#   strictly between 0 and 1 -> mildly positive,
#   strictly between -1 and 0 -> mildly negative.
pos = neg = neutral = 0
negMedio = 0
posMedio = 0

for dat in dataLabel:
    if dat == 1:
        pos += 1
    elif dat == -1:
        neg += 1
    elif dat == 0:
        neutral += 1
    elif -1 < dat < 0:
        negMedio += 1
    elif 0 < dat < 1:
        posMedio += 1

# Axis labels for the bar chart, in the order the counts are plotted.
label = ["Positiva", "Positiva Média", "Negativa", "Negativa Média", "Neutra"]

print("Total Positiva: ", pos)
print("Total Positiva Média: ", posMedio)
print("Total Negativa: ", neg)
print("Total Negativa Média: ", negMedio)
print("Total Neutra: ", neutral)

In [None]:
# Bar chart with one bar per sentiment category, in the order of `label`.
plt.figure(figsize=(10, 7))
plt.bar(range(len(label)), [pos, posMedio, neg, negMedio, neutral], align='center')
plt.xticks(range(len(label)), label)

# Display the figure. The previous plt.plot() only added an empty line plot
# and never asked matplotlib to show anything, unlike the other plot cells.
plt.show()

In [None]:
# Applying stemming to our data set.
import functools


@functools.lru_cache(maxsize=1)
def _stemmer_rslp():
    """Build the NLTK RSLP stemmer once and reuse it afterwards."""
    return nltk.stem.RSLPStemmer()


def stemming(texto, stemmer=None):
    """Return *texto* with every whitespace-separated token stemmed.

    Args:
        texto: The string to process.
        stemmer: Optional object with a ``stem(word)`` method. When omitted,
            a shared NLTK RSLPStemmer is used; it is created only on the
            first call instead of once per row.

    Returns:
        The stemmed tokens joined with single spaces ("" for empty input).
    """
    if stemmer is None:
        stemmer = _stemmer_rslp()
    return " ".join(stemmer.stem(palavra) for palavra in texto.split())

df['text'] = df['text'].apply(stemming)

In [None]:
df['text'][0:10].tolist()

In [None]:
def Preprocessing(instancia, stopwords=None):
    """Clean one text: drop links, lowercase, strip punctuation and stopwords.

    Steps, in order: remove ``http...`` links, lowercase, delete the
    characters ``. ; - : ) "``, split on whitespace, drop stopwords, and
    join the remaining tokens with single spaces.

    Args:
        instancia: The string to clean.
        stopwords: Optional collection of tokens to drop. When omitted, the
            NLTK Portuguese stopword list is loaded, as before.

    Returns:
        The cleaned text ("" when nothing is left).
    """
    if stopwords is None:
        stopwords = set(nltk.corpus.stopwords.words('portuguese'))
    # A single translate pass deletes the same six characters that were
    # previously removed by six chained .replace() calls.
    remover = str.maketrans('', '', '.;-:)"')
    instancia = re.sub(r"http\S+", "", instancia).lower().translate(remover)
    return " ".join(
        palavra for palavra in instancia.split() if palavra not in stopwords
    )