# Exemplo de classificador de sentimentos

In [0]:
import pandas as pd

# Remote CSV holding the reviews; the rest of the notebook reads the
# `text_en`, `text_pt` and `sentiment` columns from it.
DATASET_URL = 'https://s3.amazonaws.com/aulas-fiap/imdb-reviews-pt-br.csv'
df = pd.read_csv(DATASET_URL)

# Class balance check: number of reviews per sentiment label.
df['sentiment'].value_counts()

neg    24765
pos    24694
Name: sentiment, dtype: int64

In [0]:
# Preview the first rows to confirm which columns were loaded.
df.head()

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg
4,5,Brass pictures movies is not a fitting word fo...,Filmes de fotos de latão não é uma palavra apr...,neg


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Portuguese stop-word list from NLTK, removed from the vocabulary below.
nltk.download('stopwords')
stops = nltk.corpus.stopwords.words('portuguese')

# Unigram-to-trigram TF-IDF features over the Portuguese review text.
vect = TfidfVectorizer(ngram_range=(1,3), use_idf=True,stop_words=stops)
# Alternative, more aggressively pruned configuration kept for reference:
#vect = TfidfVectorizer(ngram_range=(1,2), use_idf=True,stop_words=stops, sublinear_tf = True, max_df =.67, min_df=.015,lowercase=False)

# fit_transform is equivalent to fit() followed by transform() on the same
# corpus, but extracts the n-grams in a single pass instead of two.
# NOTE(review): the vocabulary and IDF weights are learned from every row,
# including the rows that the later train/test split holds out for testing.
text_vect = vect.fit_transform(df.text_pt)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Dimensions of the TF-IDF matrix produced by the vectorizer: (rows, columns).
text_vect.shape

(49459, 8638447)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    text_vect, df['sentiment'], test_size=0.2, random_state=42
)

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Baseline classifier: a decision tree trained on the TF-IDF features.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

y_prediction = tree.predict(X_test)

# f1_score takes (y_true, y_pred). With average='weighted' the per-class
# scores are weighted by the class support of the first argument, so the
# ground-truth labels must come first.
f1 = f1_score(y_test, y_prediction, average='weighted')

print(f1)

# Utilização do método LinearSVC

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Portuguese stop-word list from NLTK, removed from the vocabulary below.
nltk.download('stopwords')
stops = nltk.corpus.stopwords.words('portuguese')

# Split the raw text BEFORE vectorizing so that the vocabulary and IDF
# weights are learned from the training rows only; fitting the vectorizer on
# the whole corpus would leak information from the held-out rows.
text_train, text_test, y_train, y_test = train_test_split(
    df.text_pt,
    df.sentiment,
    test_size = 0.2,
    random_state = 42
)

vect = TfidfVectorizer(ngram_range=(1,3), use_idf=True,stop_words=stops)
# Alternative, more aggressively pruned configuration kept for reference:
#vect = TfidfVectorizer(ngram_range=(1,2), use_idf=True,stop_words=stops, sublinear_tf = True, max_df =.67, min_df=.015,lowercase=False)

# Learn the features on the training text, then project the test text onto
# that same vocabulary.
X_train = vect.fit_transform(text_train)
X_test = vect.transform(text_test)

# Training (seeded, like the decision-tree baseline, for reproducible results)
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)

# f1_score takes (y_true, y_pred). With average='weighted' the per-class
# scores are weighted by the class support of the first argument, so the
# ground-truth labels must come first.
f1 = f1_score(y_test, y_prediction, average='weighted')

print(f1)

# Utilização do método TextBlob e Vader de análise de sentimento





In [0]:
!pip install textblob vadersentiment
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# TextBlob

In [0]:
def _textblob_label(text):
    """Return 'pos' or 'neg' for one English review using TextBlob.

    A review is labelled 'pos' only when its polarity is above 0.2 and its
    subjectivity is above 0.1; every other review is labelled 'neg'.
    """
    sentiment = TextBlob(text).sentiment
    if sentiment.polarity > 0.2 and sentiment.subjectivity > 0.1:
        return 'pos'
    return 'neg'


# Build the whole column in one assignment instead of iterating with
# iterrows() and writing one cell at a time through df.loc, which is very
# slow on a frame of this size. The resulting labels are the same.
df['sentiment_TextBlob'] = [_textblob_label(text) for text in df['text_en']]


   

In [0]:
from sklearn.metrics import f1_score

# Confusion table: rows are the dataset labels, columns are the TextBlob
# labels, and each cell counts the reviews in that combination.
print(df.pivot_table(index = 'sentiment', columns = 'sentiment_TextBlob', values = 'text_en' , aggfunc='count'))

# f1_score takes (y_true, y_pred). With average='weighted' the per-class
# scores are weighted by the class support of the first argument, so the
# dataset labels must come first.
f1 = f1_score(df['sentiment'], df['sentiment_TextBlob'], average='weighted')

print(f1)

In [0]:
from sklearn.metrics import f1_score

analysis = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Return 'pos' or 'neg' for one English review using VADER.

    A review is labelled 'pos' when its 'pos' score exceeds its 'neg' score
    by more than 0.2; every other review is labelled 'neg'.
    """
    vs = analysis.polarity_scores(text)
    if vs['pos']-vs['neg'] > 0.2:
        return 'pos'
    return 'neg'


# Build the whole column in one assignment instead of iterating with
# iterrows() and writing one cell at a time through df.loc, which is very
# slow on a frame of this size. The resulting labels are the same.
df['sentiment_Vader'] = [_vader_label(text) for text in df['text_en']]

# Confusion table: rows are the dataset labels, columns are the VADER labels,
# and each cell counts the reviews in that combination.
print(df.pivot_table(index = 'sentiment', columns = 'sentiment_Vader', values = 'text_en' , aggfunc='count'))

# f1_score takes (y_true, y_pred). With average='weighted' the per-class
# scores are weighted by the class support of the first argument, so the
# dataset labels must come first.
f1 = f1_score(df['sentiment'], df['sentiment_Vader'], average='weighted')

print(f1)
      
      