In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn import linear_model as lm
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import spacy
import re
import nltk
from nltk import word_tokenize, download
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer


https://cienciaenegocios.com/processamento-de-linguagem-natural-nlp/
https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e

In [2]:
download('punkt')
download('stopwords')
download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Importando o Dataset

In [5]:
df_train = pd.read_csv("dataset/train.csv")
df_test = pd.read_csv("dataset/test.csv")

Unindo os dataset de treino e teste para facilitar a etapa de limpeza

In [6]:
df = pd.concat([df_train,df_test])

In [7]:
df

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg
...,...,...
24995,"For one thing, he produced this movie. It has ...",neg
24996,The title comes from an alteration an adolesce...,pos
24997,Christopher Nolan's first film is a 'no budget...,pos
24998,The story is shortly about the faith-lacking b...,neg


In [8]:
#apenas para testar o algoritmo por causa da memoria
df = df_train

In [9]:
def sentiment_categorization(df):
    df.loc[df.sentiment=='neg','sentiment_cate'] = 0
    df.loc[df.sentiment=='pos','sentiment_cate'] = 1
    return df

df = sentiment_categorization(df)

In [10]:
df.dtypes

text               object
sentiment          object
sentiment_cate    float64
dtype: object

In [11]:
df.sample(5)

Unnamed: 0,text,sentiment,sentiment_cate
24034,"Yes, this production is long (good news for Br...",pos,1.0
1055,"This is a great example of a good, dumb movie....",pos,1.0
20829,"There are two things that I like about Elvira,...",pos,1.0
11801,Marjorie (a splendid and riveting performance ...,pos,1.0
23270,"Horrible acting, horrible cast and cheap props...",neg,0.0


### Limpeza e Pré-Processamento

#### Normalização
O processo de normalização no Processamento de Linguagem Natural trata de colocar todas as palavras em caixa baixa, retirando os acentos ou cedilha. Isto faz com que os algoritmos de análise não tratem palavras iguais como sendo diferentes apenas por apresentarem a letra inicial como maiúscula, ou um erro de acentuação, por exemplo.

1. Transformar coluna 'text' em list para facilitar a manipulação

In [12]:
df_list = list(df['text'][:3])
df_list[0]

"Now, I won't deny that when I purchased this off eBay, I had high expectations. This was an incredible out-of-print work from the master of comedy that I so enjoy. However, I was soon to be disappointed. Apologies to those who enjoyed it, but I just found the Compleat Al to be very difficult to watch. I got a few smiles, sure, but the majority of the funny came from the music videos (which I've got on DVD) and the rest was basically filler. You could tell that this was not Al's greatest video achievement (that honor goes to UHF). Honestly, I doubt if this will ever make the jump to DVD, so if you're an ultra-hardcore Al fan and just HAVE to own everything, buy the tape off eBay. Just don't pay too much for it."

2. Remover caracteres desnecessários e transformar todos os caracteres para minusculo

In [13]:
def pre_processing(textos):
    caracteres_1 = re.compile("[.;:!\'?@,\"()\[\]]")
    caracteres_2 = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    caracteres_3 = re.compile('[^A-Za-z0-9\s]+')

    textos = [caracteres_1.sub("", texto) for texto in textos]
    textos = [caracteres_2.sub(" ", texto.lower()) for texto in textos]
    textos = [caracteres_3.sub("", texto) for texto in textos]
    
    return textos

df_processed = pre_processing(df_list)

In [14]:
df_processed[0]

'now i wont deny that when i purchased this off ebay i had high expectations this was an incredible out of print work from the master of comedy that i so enjoy however i was soon to be disappointed apologies to those who enjoyed it but i just found the compleat al to be very difficult to watch i got a few smiles sure but the majority of the funny came from the music videos which ive got on dvd and the rest was basically filler you could tell that this was not als greatest video achievement that honor goes to uhf honestly i doubt if this will ever make the jump to dvd so if youre an ultra hardcore al fan and just have to own everything buy the tape off ebay just dont pay too much for it'

#### Tokenização

In [15]:
#word_tokenize(df_processed[0])

In [16]:
df_tokens = [word_tokenize(text) for text in df_processed]

In [17]:
str(df_tokens[0])

"['now', 'i', 'wont', 'deny', 'that', 'when', 'i', 'purchased', 'this', 'off', 'ebay', 'i', 'had', 'high', 'expectations', 'this', 'was', 'an', 'incredible', 'out', 'of', 'print', 'work', 'from', 'the', 'master', 'of', 'comedy', 'that', 'i', 'so', 'enjoy', 'however', 'i', 'was', 'soon', 'to', 'be', 'disappointed', 'apologies', 'to', 'those', 'who', 'enjoyed', 'it', 'but', 'i', 'just', 'found', 'the', 'compleat', 'al', 'to', 'be', 'very', 'difficult', 'to', 'watch', 'i', 'got', 'a', 'few', 'smiles', 'sure', 'but', 'the', 'majority', 'of', 'the', 'funny', 'came', 'from', 'the', 'music', 'videos', 'which', 'ive', 'got', 'on', 'dvd', 'and', 'the', 'rest', 'was', 'basically', 'filler', 'you', 'could', 'tell', 'that', 'this', 'was', 'not', 'als', 'greatest', 'video', 'achievement', 'that', 'honor', 'goes', 'to', 'uhf', 'honestly', 'i', 'doubt', 'if', 'this', 'will', 'ever', 'make', 'the', 'jump', 'to', 'dvd', 'so', 'if', 'youre', 'an', 'ultra', 'hardcore', 'al', 'fan', 'and', 'just', 'have',

#### Stop words
As stop words são palavras que não adicionam sentido ao texto, pois são usadas para coesão e dar contexto, mas não fazem sentido específico quando olhadas individualmente. Portanto, o processo necessário em análises que usam Processamento de Linguagem Natural é eliminar as palavras vazias (stop words).

In [18]:
stop_words = stopwords.words('english')

In [19]:
df_without_sw = [[token for token in text if token not in stop_words]
                  for text in df_tokens]

In [20]:
print('Tamanho do texto com stop words: '+str(len(df_tokens[0])))
print('Tamanho do texto sem stop words: '+str(len(df_without_sw[0])))

Tamanho do texto com stop words: 140
Tamanho do texto sem stop words: 63


#### Lematização

In [21]:
lematizer = WordNetLemmatizer()
df_lematizer = [[lematizer.lemmatize(token) for token in text] for text in df_without_sw]

#### Stemização
O termo stemização vem do inglês stemming, e representa o processo de reduzir palavras flexionadas ou derivadas a sua base. Neste processo a tarefa é transformar palavras próximas em uma só, como ficar apenas com o radical dos verbos ou transformar todos os substantivos para o singular.

In [22]:
stemization = LancasterStemmer()
df_stemization = [[stemization.stem(token) for token in text] for text in df_lematizer]

Comparação das etapas de limpeza

In [23]:
str(df_without_sw[0])

"['wont', 'deny', 'purchased', 'ebay', 'high', 'expectations', 'incredible', 'print', 'work', 'master', 'comedy', 'enjoy', 'however', 'soon', 'disappointed', 'apologies', 'enjoyed', 'found', 'compleat', 'al', 'difficult', 'watch', 'got', 'smiles', 'sure', 'majority', 'funny', 'came', 'music', 'videos', 'ive', 'got', 'dvd', 'rest', 'basically', 'filler', 'could', 'tell', 'als', 'greatest', 'video', 'achievement', 'honor', 'goes', 'uhf', 'honestly', 'doubt', 'ever', 'make', 'jump', 'dvd', 'youre', 'ultra', 'hardcore', 'al', 'fan', 'everything', 'buy', 'tape', 'ebay', 'dont', 'pay', 'much']"

In [24]:
str(df_lematizer[0])

"['wont', 'deny', 'purchased', 'ebay', 'high', 'expectation', 'incredible', 'print', 'work', 'master', 'comedy', 'enjoy', 'however', 'soon', 'disappointed', 'apology', 'enjoyed', 'found', 'compleat', 'al', 'difficult', 'watch', 'got', 'smile', 'sure', 'majority', 'funny', 'came', 'music', 'video', 'ive', 'got', 'dvd', 'rest', 'basically', 'filler', 'could', 'tell', 'al', 'greatest', 'video', 'achievement', 'honor', 'go', 'uhf', 'honestly', 'doubt', 'ever', 'make', 'jump', 'dvd', 'youre', 'ultra', 'hardcore', 'al', 'fan', 'everything', 'buy', 'tape', 'ebay', 'dont', 'pay', 'much']"

In [25]:
str(df_stemization[0])

"['wont', 'deny', 'purchas', 'ebay', 'high', 'expect', 'incred', 'print', 'work', 'mast', 'comedy', 'enjoy', 'howev', 'soon', 'disappoint', 'apolog', 'enjoy', 'found', 'compl', 'al', 'difficult', 'watch', 'got', 'smil', 'sur', 'maj', 'funny', 'cam', 'mus', 'video', 'iv', 'got', 'dvd', 'rest', 'bas', 'fil', 'could', 'tel', 'al', 'greatest', 'video', 'achiev', 'hon', 'go', 'uhf', 'honest', 'doubt', 'ev', 'mak', 'jump', 'dvd', 'yo', 'ultr', 'hardc', 'al', 'fan', 'everyth', 'buy', 'tap', 'ebay', 'dont', 'pay', 'much']"

##### Função para realizar essas etapas em todo o dataset

In [40]:
def processamento_nltk(df):    
    text_processado = list()
    df_processed = pre_processing(df)
    df_processed = [word_tokenize(text) for text in df_processed]
    
    stop_words = stopwords.words('english')
    df_processed = [[token for token in text if token not in stop_words]
                  for text in df_processed]
    
    lematizer = WordNetLemmatizer()
    df_processed = [[lematizer.lemmatize(token) for token in text] for text in df_processed]
    
    stemization = LancasterStemmer()
    df_processed = [[stemization.stem(token) for token in text] for text in df_processed]
    
    for frase in df_processed:
        text_processado.append(' '.join(frase))
        
    return text_processado
    

In [41]:
#tokeniza e remove as stop words, salvando o lemma
def processamento_spacy(df):
    df_tokens = list()
    df = pre_processing(df)
    df_processado = list()
    nlp = spacy.load("en_core_web_sm")
    for text in df:
        tokens = list()
        doc = nlp(text)
        for token in doc:
            if(not(token.is_stop)):
                tokens.append(token.lemma_)
        df_tokens.append(tokens)
    
    for frase in df_tokens:
        df_processado.append(' '.join(frase))
    return df_processado

In [42]:
df_list = list(df['text'])
df['text_processed_nltk'] = processamento_nltk(df_list)

In [43]:
df['text_processed_spacy'] = processamento_spacy(df_list)

In [45]:
#df['text_processed'] = df_processado
df.to_csv('dataset/df_processado_train.csv')

In [44]:
df.sample(5)

Unnamed: 0,text,sentiment,sentiment_cate,text_processed_nltk,text_processed_spacy
5591,Some of those guys that watch films and compla...,pos,1.0,guy watch film complain liv forget someth dvd ...,guy watch film complain living forget dvd menu...
4396,The Cameraman's Revenge is an unusual short no...,pos,1.0,cameram reveng unus short subject mat adultery...,cameraman revenge unusual short subject matter...
12196,Kiera Nightly moved straight from the P&P set ...,pos,1.0,kier night mov straight pp set act movy could ...,kiera nightly move straight pp set action movi...
2479,It's been a looooonnnggg time since I saw this...,neg,0.0,looooonnngg tim sint saw comedy id forgot idio...,looooonnnggg time see comedy d forget idiotic ...
6967,A film that dramatized an understandable reluc...,pos,1.0,film dram understand reluct fac inevit com sec...,film dramatize understandable reluctance face ...


#### Divisão dos dados em treino e teste

In [3]:
df_train = pd.read_csv("dataset/df_processado_train.csv")
X = df_train['text_processed_nltk']
y = df_train['sentiment_cate']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42,stratify=y)

#### Vetorização - Bag of words

In [24]:
vetorizer = CountVectorizer()
X_train = vetorizer.fit_transform(X_train)
X_test = vetorizer.transform(X_test)

In [25]:
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [33]:
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

#### Classificação

In [37]:
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train.ravel())

LogisticRegression(max_iter=1000)

In [38]:
y_pred = model.predict(X_test)

In [39]:
y_test.ravel().shape

(5000,)

In [29]:
y_pred.shape

(5000,)

In [40]:
y_score = model.score(y_test.ravel(),y_pred)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 0. ... 0. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [25]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

#https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.YWm0phrMKMo

<50000x134129 sparse matrix of type '<class 'numpy.int64'>'
	with 6857368 stored elements in Compressed Sparse Row format>

#### Abreviações