In [1]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import gensim
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer
from gensim.corpora.dictionary import Dictionary



In [2]:
dfBase = pd.read_csv('datasets/movie_review.csv')
dfBase.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [None]:
df = shuffle(dfBase)
df = df.reset_index(drop = True)
df.head()

stemmer = RSLPStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def TreatText(data):
    #import pdb; pdb.set_trace()
    stops = set(stopwords.words("english"))  # melhora a performance convertendo num set
    data['text'] = [re.sub("[^a-zA-Z]", " ",data['text'][i]) for i in range(len(data))] #mantém apenas letras (há números, links, etc.)     
    data['text'] = [word_tokenize(data['text'][i].lower()) for i in range(len(data))] # caixa baixa
    data['text'] = [[lemmatize_stemming(w) for w in data['text'][i] if w not in stops]for i in range(len(data))]# lematize/stemming
    return(data['text'])

In [4]:
dfText = TreatText(df)
dfText.head()

0                         [sometim, wond, cens, think]
1                                      [loyal, friend]
2    [pric, tag, million, film, rememb, gigantic, f...
3    [result, pinkett, smith, first, opportunity, a...
4    [com, la, without, know, anyon, city, two, wee...
Name: text, dtype: object

In [5]:
dictionary = Dictionary(dfText)
bow_corpus = [dictionary.doc2bow(doc) for doc in dfText]

In [6]:
nume_topics = 2
num_passes = 7
num_workers = multiprocessing.cpu_count()
seed = 1713

In [7]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = nume_topics, 
                                   id2word = dictionary,                                    
                                   passes = num_passes,
                                   random_state = seed,
                                   workers = num_workers)

In [8]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.014*"film" + 0.009*"one" + 0.009*"charact" + 0.007*"lik" + 0.006*"act" + 0.005*"mak" + 0.005*"get" + 0.005*"tim" + 0.005*"good" + 0.005*"movi"


Topic: 1 
Words: 0.017*"film" + 0.014*"movi" + 0.008*"go" + 0.007*"one" + 0.007*"mak" + 0.006*"see" + 0.006*"even" + 0.006*"get" + 0.005*"tak" + 0.005*"know"




## As words além de não apresentarem em maioria correlação com sentimentos positivos/negativos, tem baixa correlação com os tópicos gerados. Como não está claro o que é positivo e negativos, vamos testar a accuracy selecionando um dos tópicos como positivo e testando contra o dataset.

In [9]:
dfText[0]

['sometim', 'wond', 'cens', 'think']

In [10]:
all_topics = lda_model[bow_corpus]
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()

In [11]:
dfResult = pd.DataFrame(all_topics_numpy).head(5).apply(lambda x: 1 if x[0]>x[1] else 0, axis=1)
dfResult.head()

0    0
1    0
2    0
3    1
4    1
dtype: int64

In [12]:
 acertos = sum(dfResult[0] == df['tag'].apply(lambda x: 1 if x == 'pos' else 0))

In [13]:
acertos / df.shape[0]

0.49108467243510506

## LDA com modelagem de 2 tópicos apresentou taxa de acerto de cerca de 50% no dataset, permitindo concluir que não é possível diferenciar entre reviews positivas e negativas por LDA no dataset, já que não apresenta assertividade maior que a seleção aleatória.