## Montando o drive para carregar o dataset

In [None]:
pip install prov

In [None]:
!python -m spacy download pt_core_news_lg
!pip install mglearn

In [None]:
from prov.model import ProvDocument, Namespace, Literal, PROV, Identifier
import datetime

# Provenance document for this analysis, declaring the namespace prefixes
# used by the identifiers and attributes registered below.
g = ProvDocument(
    namespaces={
        # NOTE(review): "ex" is bound to a plain label rather than a URI;
        # confirm whether a real namespace URI is expected here.
        "ex": "MonkeyPox Feelings",
        "dcterms": "http://purl.org/dc/terms/",
        "foaf": "http://xmlns.com/foaf/0.1/",
    }
)

# Register the article as an entity in the document, carrying its title.
article = g.entity("ex:article", {"dcterms:title": "MonkeyPox Feelings - Uma análise de sentimentos de tweets sobre a MonkeyPox"})

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
drive.mount('/content/drive',force_remount=True)

In [None]:
import os

# Keep the directory the kernel was in before switching to the project folder.
cwd = os.getcwd()
# Anchor the project folder at the Drive mount point (/content/drive) so this
# cell is idempotent: a relative path stops resolving once the working
# directory has already been changed by a previous run of this cell.
# NOTE(review): assumes the kernel originally started in /content (the Colab
# default) — confirm if the notebook is run elsewhere.
absolute_path = "/content/drive/MyDrive/MestradoPPGI/topicos-ds/"
os.chdir(absolute_path)

## Carregando o dataset

In [None]:
import pandas as pd
import json

In [None]:
# Read the raw export explicitly as UTF-8 so accented characters decode the
# same way regardless of the platform's default encoding.
with open("teste.json", "r", encoding="utf-8") as f:
    # Wrap the file content in brackets so it is parsed as a JSON array.
    json_str = f'[{f.read()}]'

# Parse after the file handle has been released.
obj_list = json.loads(json_str)

In [None]:
# Build a DataFrame from the list of parsed JSON objects.
data = pd.DataFrame(obj_list)

In [None]:
# Expand the "data" payload of the first parsed object into its own DataFrame.
tweets = pd.DataFrame(data.loc[0, "data"])

## Quantidade de Tweets Coletados, Tweet mais antigo e Tweet mais recente

In [None]:
# Report the number of rows and the smallest / largest "created_at" values.
print(
    f"Quantidade Tweets: {len(tweets.index)},\n"
    f"Tweet mais antigo: {tweets['created_at'].min()},\n"
    f"Tweet mais recente: {tweets['created_at'].max()}"
)

In [None]:
# Preview the first rows of the tweets DataFrame.
tweets.head()

In [None]:
# Show the "text" value stored under index label 1 as a sample.
tweets["text"][1]

## Aplicando Stopwords 

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources needed for stop-word lookup and tokenization.
nltk.download('stopwords')
nltk.download('punkt')

# Start from NLTK's Portuguese stop-word list.
stopwords_br = set(stopwords.words('portuguese'))

# Extra terms to drop (URL fragments, retweet markers, a user handle).
extra_stopwords = ["https", "t", "t.co", "twitter", "t co", "damadanoite14", "RT", "RT Dr", "co"]
stopwords_br.update(extra_stopwords)
# The filtering step compares lowercased tokens against this set, so entries
# such as "RT" would never match; add the lowercase form of every extra term.
stopwords_br.update(term.lower() for term in extra_stopwords)


In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the text of every tweet, producing one token list per tweet.
# Iterating over the column directly avoids feeding index *labels* to the
# positional `.iloc` accessor, which is only correct on a default 0..n-1 index.
tokens = [word_tokenize(text) for text in tweets["text"]]


In [None]:
# Flatten the per-tweet token lists into one list, keeping only the tokens
# whose lowercase form is not a stop word.
filtered_sentence = [
    word
    for tweet_tokens in tokens
    for word in tweet_tokens
    if word.lower() not in stopwords_br
]

## Visualizando palavras mais recorrentes no Wordcloud

In [None]:
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# Start with one review:
text = " ".join(word for word in filtered_sentence)

# Create and generate a word cloud image:
wordcloud = WordCloud().generate(text)

# Display the generated image:
plt.figure(figsize=[10,5])

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Carregando modelo SVC

In [None]:
from model import modelo

## Classificando Tweets com o modelo SVC

In [None]:
# Second argument handed to the prediction helper; left empty here.
# NOTE(review): presumably an output path/destination — confirm against model.fazer_previsoes.
saida = ""
# Run the imported model over the tweets; the result is used below as a table
# with "Tweet" and "Sentimento" columns.
teste = modelo.fazer_previsoes(tweets, saida)

## Levantando estatísticas quantitativas de Sentimento x Tweet

In [None]:
from collections import Counter
# Count how many rows fall under each value of the "Sentimento" column.
sentimentos_dist= Counter(teste["Sentimento"])

In [None]:
# Display the per-sentiment counts.
sentimentos_dist

In [None]:
from matplotlib import pyplot as plt

# Materialize the dict views as lists so matplotlib receives plain sequences.
x = list(sentimentos_dist.keys())
y = list(sentimentos_dist.values())

# Bar chart of tweet counts per sentiment; titled and labelled so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.bar(x, y, width=0.5)
ax.set_title("Distribuição de sentimentos dos tweets")
ax.set_xlabel("Sentimento")
ax.set_ylabel("Quantidade de tweets")
plt.show()


## Análise quantitativa de Sentimento x Tweet com o termo "gay".

In [None]:
# Add a 0/1 "gays" column: 1 when the lowercased tweet text contains the
# literal substring "gay" (missing text counts as no match), 0 otherwise.
teste["gays"] = (
    teste["Tweet"]
    .str.lower()
    .str.contains("gay", regex=False, na=False)
    .astype(int)
)

In [None]:
# Keep only the rows flagged as containing the term.
df_gays = teste[teste["gays"].eq(1)]

In [None]:
# Preview the first 10 flagged rows.
df_gays.head(10)

In [None]:
# Count the flagged rows per "Sentimento" value and store the counts as a plain dict.
qtd_sentimentos_gays = dict(Counter(df_gays["Sentimento"]))

In [None]:
# Display the per-sentiment counts for the flagged tweets.
qtd_sentimentos_gays

In [None]:
from matplotlib import pyplot as plt

# Materialize the dict views as lists so matplotlib receives plain sequences.
x = list(qtd_sentimentos_gays.keys())
y = list(qtd_sentimentos_gays.values())

# Bar chart of counts per sentiment for tweets containing "gay"; titled and
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(x, y, width=0.5)
ax.set_title('Distribuição de sentimentos dos tweets com o termo "gay"')
ax.set_xlabel("Sentimento")
ax.set_ylabel("Quantidade de tweets")
plt.show()