In [1]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import gensim
from gensim.models import Word2Vec
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer



In [2]:
dfBase = pd.read_csv('datasets/movie_review.csv')
dfBase.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [3]:
df = shuffle(dfBase)
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,5,cv503,11196,30,perhaps sci-fi spectaculars are just not an ac...,neg
1,4,cv428,12202,66,how does a talking decapitated head show that ...,neg
2,2,cv201,7421,7,she's the lone survivor this time .,neg
3,1,cv199,9629,42,it just doesn't have the emotional impact that...,pos
4,7,cv768,11751,13,the one real problem i had with it is the unre...,pos


In [4]:
stemmer = RSLPStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def TreatText(data):
    #import pdb; pdb.set_trace()
    stops = set(stopwords.words("english"))  # melhora a performance convertendo num set
    data['text'] = [re.sub("[^a-zA-Z]", " ",data['text'][i]) for i in range(len(data))] #mantém apenas letras (há números, links, etc.)     
    data['text'] = [word_tokenize(data['text'][i].lower()) for i in range(len(data))] # caixa baixa
    data['text'] = [[lemmatize_stemming(w) for w in data['text'][i] if w not in stops]for i in range(len(data))]# lematize/stemming/remove stopwords
    return(data['text'])

In [5]:
dfText = TreatText(df)
dfText.head()

0            [perhap, sci, fi, spectacul, act, medium]
1    [talk, decapitat, head, show, te, dispondent, ...
2                                   [lon, surviv, tim]
3          [emot, impact, suspect, mr, cameron, shoot]
4    [one, real, probl, unrealistic, natur, main, c...
Name: text, dtype: object

In [6]:
labels = np.array(df['tag'].apply(lambda x: 1 if x == 'pos' else 0)) 
labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [7]:
dim_vec = 300
min_count = 10
window = 4
num_workers = multiprocessing.cpu_count()
seed = tf.set_random_seed(1713)

In [8]:
def meanVector(model,phrase):
    vocab = model.wv.vocab
    phrase = " ".join(phrase)
    phrase = [x for x in word_tokenize(phrase) if x in vocab]
    #Quando não houver palavra o vector recebe 0 para todas as posições
    if phrase == []:
        vetor = [0.0]*dim_vec 
    else: 
        #Caso contrário, calculando a matriz da frase
        vetor = np.mean([model[word] for word in phrase],axis=0)
    return vetor

In [9]:
def createFeatures(base, modelo): 
    features = [meanVector(modelo,base[i])for i in range(len(base))]
    return features

In [10]:
def generateF1Score(dfText, modelo):
    dfeatures = createFeatures(dfText, modelo)
    X_train, X_test, y_train, y_test = train_test_split(dfeatures[0:len(dfeatures)], labels[0:len(labels)], test_size=0.3,random_state=109)
    clf = svm.SVC(kernel='linear') 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f_score = metrics.f1_score(y_test, y_pred, average='micro')
    print("Accuracy:",f_score)

In [11]:
modeloCBOW = Word2Vec(dfText,
                    min_count = min_count, 
                    size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 0) #sg = 0 -> CBOW e sg = 1 -> skipgram
print("Tamanho do vocabulário do Word2Vec CBOW: ", len(modeloCBOW.wv.vocab.keys()))

Tamanho do vocabulário do Word2Vec CBOW:  7283


In [12]:
generateF1Score(dfText, modeloCBOW)

  # Remove the CWD from sys.path while we load stuff.


Accuracy: 0.5627317676143387


In [13]:
modeloCSKIPGRAM = Word2Vec(dfText,
                    min_count = min_count, 
                    size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 1) #sg = 0 -> CBOW e sg = 1 -> skipgram
print("Tamanho do vocabulário do Word2Vec skipgram: ", len(modeloCSKIPGRAM.wv.vocab.keys()))

Tamanho do vocabulário do Word2Vec skipgram:  7283


In [14]:
generateF1Score(dfText, modeloCSKIPGRAM)

  # Remove the CWD from sys.path while we load stuff.


Accuracy: 0.5968788627935723


## O Modelo skipgram teve uma performance superior ao CBOW( 0.5627317676143387 Vs 0.5968788627935723)