In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn import linear_model as lm
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import spacy
import re
import nltk
from nltk import word_tokenize, download
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

https://cienciaenegocios.com/processamento-de-linguagem-natural-nlp/
https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e

In [2]:
download('punkt')
download('stopwords')
download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Importando o Dataset

In [3]:
df_train = pd.read_csv("dataset/train.csv")
df_test = pd.read_csv("dataset/test.csv")

Unindo os dataset de treino e teste para facilitar a etapa de limpeza

In [4]:
df = pd.concat([df_train,df_test])

In [5]:
df

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg
...,...,...
24995,"For one thing, he produced this movie. It has ...",neg
24996,The title comes from an alteration an adolesce...,pos
24997,Christopher Nolan's first film is a 'no budget...,pos
24998,The story is shortly about the faith-lacking b...,neg


In [6]:
df.loc[df.sentiment=='neg','sentiment_cate'] = 0
df.loc[df.sentiment=='pos','sentiment_cate'] = 1

In [7]:
df.dtypes

text               object
sentiment          object
sentiment_cate    float64
dtype: object

In [8]:
df.sentiment_cate = df.sentiment_cate.astype(int)

In [9]:
df.sample(5)

Unnamed: 0,text,sentiment,sentiment_cate
5703,The discussion has been held a thousand times....,neg,0
8472,"In New York, when the shy and lonely project m...",pos,1
18213,A visit by Hitler in Rome is the backdrop of t...,pos,1
22414,"In April of 1965, CBS broadcast the first of B...",pos,1
13921,"For the life of me, I cannot get why they woul...",neg,0


### Limpeza

#### Normalização
O processo de normalização no Processamento de Linguagem Natural trata de colocar todas as palavras em caixa baixa, retirando os acentos ou cedilha. Isto faz com que os algoritmos de análise não tratem palavras iguais como sendo diferentes apenas por apresentarem a letra inicial como maiúscula, ou um erro de acentuação, por exemplo.

1. Transformar coluna 'text' em list para facilitar a manipulação

In [10]:
df_list = list(df['text'])
df_list[3]



2. Remover caracteres desnecessários e transformar todos os caracteres para minusculo

In [11]:
caracteres_1 = re.compile("[.;:!\'?@,\"()\[\]]")
caracteres_2 = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
caracteres_3 = re.compile('[^A-Za-z0-9\s]+')

def pre_processing(textos):
    textos = [caracteres_1.sub("", texto) for texto in textos]
    textos = [caracteres_2.sub(" ", texto.lower()) for texto in textos]
    textos = [caracteres_3.sub("", texto) for texto in textos]
    
    return textos

df_processed = pre_processing(df_list)

In [12]:
df_processed[3]



#### Tokenização

In [13]:
#word_tokenize(df_processed[0])

In [14]:
df_tokens = [word_tokenize(text) for text in df_processed]

#### Stop words
As stop words são palavras que não adicionam sentido ao texto, pois são usadas para coesão e dar contexto, mas não fazem sentido específico quando olhadas individualmente. Portanto, o processo necessário em análises que usam Processamento de Linguagem Natural é eliminar as palavras vazias (stop words).

In [15]:
stop_words = stopwords.words('english')

In [16]:
df_without_sw = [[token for token in text if token not in stop_words]
                  for text in df_tokens]

In [17]:
print('Tamanho do texto com stop words: '+str(len(df_tokens[0])))
print('Tamanho do texto sem stop words: '+str(len(df_without_sw[0])))

Tamanho do texto com stop words: 140
Tamanho do texto sem stop words: 63


#### Lematização

In [18]:
lematizer = WordNetLemmatizer()
df_lematizer = [[lematizer.lemmatize(token) for token in text] for text in df_without_sw]

#### Stemização
O termo stemização vem do inglês stemming, e representa o processo de reduzir palavras flexionadas ou derivadas a sua base. Neste processo a tarefa é transformar palavras próximas em uma só, como ficar apenas com o radical dos verbos ou transformar todos os substantivos para o singular.

In [19]:
stemization = LancasterStemmer()
df_stemization = [[stemization.stem(token) for token in text] for text in df_lematizer]

In [20]:
str(df_without_sw[0])

"['wont', 'deny', 'purchased', 'ebay', 'high', 'expectations', 'incredible', 'print', 'work', 'master', 'comedy', 'enjoy', 'however', 'soon', 'disappointed', 'apologies', 'enjoyed', 'found', 'compleat', 'al', 'difficult', 'watch', 'got', 'smiles', 'sure', 'majority', 'funny', 'came', 'music', 'videos', 'ive', 'got', 'dvd', 'rest', 'basically', 'filler', 'could', 'tell', 'als', 'greatest', 'video', 'achievement', 'honor', 'goes', 'uhf', 'honestly', 'doubt', 'ever', 'make', 'jump', 'dvd', 'youre', 'ultra', 'hardcore', 'al', 'fan', 'everything', 'buy', 'tape', 'ebay', 'dont', 'pay', 'much']"

In [17]:
str(df_lematizer[0])

"['wont', 'deny', 'purchased', 'ebay', 'high', 'expectation', 'incredible', 'print', 'work', 'master', 'comedy', 'enjoy', 'however', 'soon', 'disappointed', 'apology', 'enjoyed', 'found', 'compleat', 'al', 'difficult', 'watch', 'got', 'smile', 'sure', 'majority', 'funny', 'came', 'music', 'video', 'ive', 'got', 'dvd', 'rest', 'basically', 'filler', 'could', 'tell', 'al', 'greatest', 'video', 'achievement', 'honor', 'go', 'uhf', 'honestly', 'doubt', 'ever', 'make', 'jump', 'dvd', 'youre', 'ultra', 'hardcore', 'al', 'fan', 'everything', 'buy', 'tape', 'ebay', 'dont', 'pay', 'much']"

In [18]:
str(df_stemization[0])

"['wont', 'deny', 'purchas', 'ebay', 'high', 'expect', 'incred', 'print', 'work', 'mast', 'comedy', 'enjoy', 'howev', 'soon', 'disappoint', 'apolog', 'enjoy', 'found', 'compl', 'al', 'difficult', 'watch', 'got', 'smil', 'sur', 'maj', 'funny', 'cam', 'mus', 'video', 'iv', 'got', 'dvd', 'rest', 'bas', 'fil', 'could', 'tel', 'al', 'greatest', 'video', 'achiev', 'hon', 'go', 'uhf', 'honest', 'doubt', 'ev', 'mak', 'jump', 'dvd', 'yo', 'ultr', 'hardc', 'al', 'fan', 'everyth', 'buy', 'tap', 'ebay', 'dont', 'pay', 'much']"

#### Vetorização - Bag of words

In [22]:
df_processado = list()
for frase in df_stemization:
    df_processado.append(' '.join(frase))

In [6]:
df_train = pd.read_csv("dataset/df_processado.csv")
X = df_train['text_processed'][:20000]
y = df_train['sentiment_cate'][:20000]

In [7]:
#df['text_processed'] = df_processado
#df.to_csv('dataset/df_processado.csv')

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vetorizer = CountVectorizer()
vetorizer = vetorizer.fit_transform(X)

#### Divisão dos dados em treino e teste

In [9]:
X_train, X_test, y_train, y_test = train_test_split(vetorizer.toarray(), y, test_size = 0.2, random_state = 42,stratify=y)

In [16]:
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

#### Classificação

In [10]:
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [14]:
y_pred = model.predict(X_test)


In [17]:
y_score = model.score(y_pred,y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1 0 0 ... 1 1 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [25]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

#https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.YWm0phrMKMo

<50000x134129 sparse matrix of type '<class 'numpy.int64'>'
	with 6857368 stored elements in Compressed Sparse Row format>