# Análise de sentimentos

## Importação das bibliotecas e base de dados

- Base de dados: https://www.kaggle.com/sid321axn/amazon-alexa-reviews

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated Alexa reviews file into a DataFrame.
# The absolute '/content/...' path is environment-specific.
reviews_df = pd.read_csv('/content/amazon_alexa.tsv', sep = '\t')

In [None]:
# (rows, columns) of the loaded table.
reviews_df.shape

In [None]:
# Display the DataFrame.
reviews_df

In [None]:
# Column dtypes and non-null counts.
reviews_df.info()

In [None]:
# Summary statistics of the numeric columns.
reviews_df.describe()

In [None]:
# The review text column; it is the text source for the cleaning and
# vectorization steps later in the notebook.
reviews_df['verified_reviews']

## Exploração dos dados

In [None]:
# Heatmap of the boolean isnull() mask, to spot missing values visually.
sns.heatmap(reviews_df.isnull(), cbar=False);

In [None]:
# Number of missing values per column.
reviews_df.isnull().sum()

In [None]:
# Histograms (30 bins, red) of the DataFrame's numeric columns.
reviews_df.hist(bins = 30, figsize=(13,5), color = 'r')

In [None]:
# Add a 'length' column with the character count of each review.
# NOTE(review): len() raises TypeError on a missing (NaN) value, so this
# assumes 'verified_reviews' has no nulls — confirm with isnull().sum().
reviews_df['length'] = reviews_df['verified_reviews'].apply(len)
reviews_df.head()

In [None]:
# Distribution of review lengths as a 100-bin histogram.
reviews_df['length'].plot(bins = 100, kind = 'hist');

In [None]:
# Summary statistics of the review lengths.
reviews_df.length.describe()

In [None]:
# Text of the first review whose length is exactly 2851.
# NOTE(review): 2851, 1 and 131 in these lookups are hard-coded; presumably
# they are the max, min and approximate mean from describe() — verify.
# .iloc[0] raises IndexError when no row has the requested length.
reviews_df[reviews_df['length'] == 2851]['verified_reviews'].iloc[0]

In [None]:
# Text of the first review whose length is exactly 1.
reviews_df[reviews_df['length'] == 1]['verified_reviews'].iloc[0]

In [None]:
# Text of the first review whose length is exactly 131.
reviews_df[reviews_df['length'] == 131]['verified_reviews'].iloc[0]

In [None]:
# Rows whose 'feedback' label is 1.
positive = reviews_df[reviews_df['feedback'] == 1]

In [None]:
positive

In [None]:
# Summary statistics of the numeric columns for the feedback == 1 rows.
positive.describe()

In [None]:
# Rows whose 'feedback' label is 0. This subset is reused later for the
# negative word cloud and for the single-review prediction example.
negative = reviews_df[reviews_df['feedback'] == 0]

In [None]:
negative

In [None]:
# Summary statistics of the numeric columns for the feedback == 0 rows.
negative.describe()

In [None]:
# Number of reviews per feedback label.
# The column is passed by keyword (x= / data=), like the rating count plot in
# this notebook: from seaborn 0.12 the only positional argument is `data`, so
# a Series passed positionally is no longer interpreted as the x variable.
sns.countplot(x = 'feedback', data = reviews_df);

In [None]:
# Number of reviews per rating value.
sns.countplot(x = 'rating', data = reviews_df);

In [None]:
# Bar plot of 'rating' aggregated per product 'variation' (seaborn's default
# estimator is the mean). The wide figure leaves room for the variation labels.
plt.figure(figsize = (40,15))
sns.barplot(x = 'variation', y = 'rating', data = reviews_df, palette = 'deep');

In [None]:
# All review texts as a plain Python list.
sentences = reviews_df['verified_reviews'].tolist()
len(sentences)

In [None]:
# Peek at the first ten reviews.
print(sentences[0:10])

In [None]:
# Join every review into one space-separated string, the input format
# WordCloud.generate() is given below.
sentences_as_one_string = ' '.join(sentences)

In [None]:
sentences_as_one_string

In [None]:
# Total number of characters across all reviews (including the joining spaces).
len(sentences_as_one_string)

In [None]:
# Word cloud built from all reviews.
from wordcloud import WordCloud
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(sentences_as_one_string));

In [None]:
# Same preparation, restricted to the feedback == 0 reviews.
negative_list = negative['verified_reviews'].tolist()
negative_sentences_as_one_string = ' '.join(negative_list)

In [None]:
len(negative_sentences_as_one_string)

In [None]:
# Word cloud built from the feedback == 0 reviews only.
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(negative_sentences_as_one_string));

## Limpeza dos dados

In [None]:
reviews_df.head()

In [None]:
# Drop the columns that are not used as model inputs or target from here on.
reviews_df = reviews_df.drop(['date', 'rating', 'length'], axis = 1)

In [None]:
reviews_df

In [None]:
# Distinct product variations (the categories to be one-hot encoded).
reviews_df['variation'].unique()

In [None]:
# Double brackets keep this a one-column DataFrame (2-D), which is the input
# shape OneHotEncoder works on.
X_cat = reviews_df[['variation']]
X_cat

In [None]:
# Fit the encoder on 'variation' and convert the result to a dense array.
# The fitted encoder is pickled later and reused for single-review prediction.
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
X_cat = onehotencoder.fit_transform(X_cat).toarray()

In [None]:
# Encoded vector of the second row, and the overall (rows, categories) shape.
X_cat[1], X_cat.shape

In [None]:
type(X_cat)

In [None]:
# Wrap the array in a DataFrame; its columns get default integer labels 0..k-1.
X_cat = pd.DataFrame(X_cat)
type(X_cat)

In [None]:
# Remove the original categorical column now that it has been encoded.
reviews_df.drop(['variation'], axis = 1, inplace = True)

In [None]:
reviews_df

In [None]:
# Append the one-hot columns side by side; concat aligns rows on the index.
reviews_df = pd.concat([reviews_df, X_cat], axis = 1)
reviews_df

## Remoção de pontuação dos textos

In [None]:
# string.punctuation is the set of characters treated as punctuation below.
import string
string.punctuation

In [None]:
# Sample sentence used to demonstrate the cleaning steps.
test = 'Fun item to play with and get used to using. Sometimes has hard time answering the questions you ask, but I think it will be better!'

In [None]:
# Keep every character that is not a punctuation character (result is a list of chars).
test_punc_removed = [char for char in test if char not in string.punctuation]

In [None]:
print(test_punc_removed)

In [None]:
# Join the characters back into a single string.
test_punc_removed = ''.join(test_punc_removed)
test_punc_removed

## Remoção de stop words

In [None]:
import nltk

In [None]:
# Download the NLTK stop-word corpus (needed before stopwords.words() can be used).
nltk.download('stopwords')

In [None]:
# English stop-word list.
from nltk.corpus import stopwords
print(stopwords.words('english'))

In [None]:
len(stopwords.words('english'))

In [None]:
# Portuguese stop-word list, shown for comparison only; the pipeline below uses English.
print(stopwords.words('portuguese'))

In [None]:
len(stopwords.words('portuguese'))

In [None]:
test_punc_removed

In [None]:
# Split the punctuation-free sentence into words on whitespace.
print(test_punc_removed.split())

In [None]:
# Quick check of str.lower(), used to compare words case-insensitively.
'AB'.lower()

In [None]:
# Keep the words whose lowercase form is not an English stop word.
test_punc_stop_removed = [word for word in test_punc_removed.split() if word.lower() not in stopwords.words('english')]

In [None]:
print(test_punc_stop_removed)

## Count vectorization (tokenização)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Four short documents used to illustrate count vectorization.
sample_data = ['This is the first document.','This document is the second document.',
               'And this is the third one.','Is this the first document?']

In [None]:
# Learn the vocabulary of the sample documents and build the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)

In [None]:
# The learned vocabulary (one entry per matrix column).
print(vectorizer.get_feature_names_out())

In [None]:
X

In [None]:
# Dense view of the counts: one row per document, one column per vocabulary term.
print(X.toarray())

## Pipeline de limpeza dos textos

In [None]:
def message_cleaning(message):
  """Return the words of *message* without punctuation and English stop words.

  Every character found in ``string.punctuation`` is deleted, the remaining
  text is split on whitespace, and words whose lowercase form is an NLTK
  English stop word are dropped. Kept words retain their original casing.

  Args:
    message: The raw review text (a ``str``).

  Returns:
    A list with the remaining words, in their original order.
  """
  # Build the stop-word set once per call: calling stopwords.words() for every
  # word reloads the word list each time and then scans it linearly.
  english_stop_words = set(stopwords.words('english'))
  # Delete all punctuation characters in a single pass.
  text = message.translate(str.maketrans('', '', string.punctuation))
  return [word for word in text.split() if word.lower() not in english_stop_words]

In [None]:
# Apply the cleaning function to every review; each entry becomes a list of words.
reviews_df_clean = reviews_df['verified_reviews'].apply(message_cleaning)

In [None]:
# Cleaned version of the review with index label 3 ...
print(reviews_df_clean[3])

In [None]:
# ... and the original text of the same review, for comparison.
print(reviews_df['verified_reviews'][3])

In [None]:
reviews_df_clean

In [None]:
# Passing message_cleaning as the analyzer makes the vectorizer call it on each
# raw review to obtain the tokens. fit_transform learns the vocabulary and
# returns the review-by-token count matrix.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=message_cleaning)
reviews_countvectorizer = vectorizer.fit_transform(reviews_df['verified_reviews'])

In [None]:
# Learned vocabulary.
print(vectorizer.get_feature_names_out())

In [None]:
# Vocabulary size (number of count columns).
len(vectorizer.get_feature_names_out())

In [None]:
print(reviews_countvectorizer.toarray())

In [None]:
# (number of reviews, vocabulary size).
reviews_countvectorizer.shape

In [None]:
reviews_df

In [None]:
# The raw text is no longer needed once it has been vectorized.
reviews_df.drop(['verified_reviews'], axis = 1, inplace=True)
reviews_df

In [None]:
type(reviews_countvectorizer)

In [None]:
# Dense DataFrame of the token counts; its columns get default integer labels.
reviews = pd.DataFrame(reviews_countvectorizer.toarray())
type(reviews)

In [None]:
# Append the count columns. Both the one-hot columns and the count columns
# carry integer labels starting at 0, so reviews_df ends up with duplicate
# column labels.
reviews_df = pd.concat([reviews_df, reviews], axis = 1)

In [None]:
reviews_df

In [None]:
# Feature matrix: every column except the target.
X = reviews_df.drop(['feedback'], axis = 1)
X

In [None]:
# Target vector: the feedback label.
y = reviews_df['feedback']
y

## Treinamento e avaliação do Naïve Bayes

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the reported metrics and the saved model) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fit a multinomial Naive Bayes classifier with default settings.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Predicted feedback labels for the held-out rows.
y_pred = naive_bayes.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
# Fraction of held-out rows predicted correctly.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Annotated heatmap of the confusion matrix ('.3g' formats the cell values).
sns.heatmap(cm, annot=True, fmt='.3g');

In [None]:
# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

## Treinamento e avaliação da regressão logística

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the same split.
# NOTE(review): the default iteration limit may not be enough for this many
# count features — check for a ConvergenceWarning when running this cell.
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

In [None]:
# y_pred is overwritten: from here on it holds the logistic regression predictions.
y_pred = logistic.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

## Classificação de somente uma frase

In [None]:
import pickle

In [None]:
# Persist the Naive Bayes model together with the fitted encoder and vectorizer,
# i.e. everything needed to transform and classify a new review.
# The vectorizer references message_cleaning as its analyzer; pickle stores
# functions by name, so message_cleaning must be defined wherever this file is loaded.
with open('text_classifier.pkl', 'wb') as f:
  pickle.dump([naive_bayes, onehotencoder, vectorizer], f)

In [None]:
# Load the three objects back, in the same order they were dumped.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files produced by this notebook or another trusted source.
with open('text_classifier.pkl', 'rb') as f:
  bayes, onehot, vec = pickle.load(f)

In [None]:
bayes

In [None]:
onehot

In [None]:
vec

In [None]:
negative.head()

In [None]:
negative_text = negative.iloc[1:2, [2,3]]
negative_text

In [None]:
# One-column DataFrame with the variation of the selected review.
X_cat = negative_text[['variation']]
X_cat

In [None]:
# Encode with the already-fitted encoder (transform, not fit_transform) so the
# columns match the ones used at training time.
X_cat = onehot.transform(X_cat).toarray()
X_cat = pd.DataFrame(X_cat)
X_cat

In [None]:
negative_text

In [None]:
# The new DataFrame has a default index starting at 0 while negative_text keeps
# its original row label; give both the same index so concat aligns the row.
X_cat.index = negative_text.index

In [None]:
X_cat.index, negative_text.index

In [None]:
# Append the one-hot columns to the selected review.
negative_df = pd.concat([negative_text, X_cat], axis = 1)
negative_df

In [None]:
# Remove the original categorical column now that it has been encoded.
negative_df.drop(['variation'], axis = 1, inplace = True)
negative_df

In [None]:
negative_coutvectorizer = vec.transform(negative_df['verified_reviews'])

In [None]:
negative_coutvectorizer

In [None]:
review = pd.DataFrame(negative_coutvectorizer.toarray())
review

In [None]:
# Drop the raw text; only the one-hot columns remain in negative_df.
negative_df.drop(['verified_reviews'], axis = 1, inplace = True)
negative_df

In [None]:
# `review` has a default index; copy it onto negative_df so the two
# single-row frames align when concatenated.
negative_df.index = review.index

In [None]:
# Final feature row: one-hot columns followed by the token-count columns,
# the same column order used to build X for training.
negative_final = pd.concat([negative_df, review], axis = 1)
negative_final

In [None]:
# Predicted feedback label for the selected review.
bayes.predict(negative_final)
# Alternative model: logistic.predict(negative_final)

In [None]:
# Predicted class probabilities for the selected review.
bayes.predict_proba(negative_final)
# Alternative model: logistic.predict_proba(negative_final)