In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import spacy
import nltk
import string
import re
from unidecode import unidecode
from emoji import UNICODE_EMOJI

from tqdm import tqdm

# Set up the Portuguese NLTK resources.
# 'rslp' holds the rule files that RSLPStemmer loads in its constructor;
# without this download a fresh environment fails with a LookupError.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')
portuguese_stopwords = nltk.corpus.stopwords.words('portuguese')
rslps_stemmer = nltk.stem.RSLPStemmer()
snowball_stemmer = nltk.stem.snowball.SnowballStemmer('portuguese', ignore_stopwords=False)

# Load the pt-br spaCy model
nlp = spacy.load("pt_core_news_lg")

# Load the reviews dataset (semicolon-separated CSV)
data = pd.read_csv('B2W-Reviews01.csv', sep=';')

# Seed shared by every split call so the partitions are reproducible
seed = 0

# Fractions of the dataset assigned to train / validation / test
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# First cut: everything that is not train goes to a hold-out pool.
# Second cut: the hold-out pool is divided between validation and test.
holdout_fraction = 1-train_ratio
test_share_of_holdout = test_ratio/(test_ratio+validation_ratio)

# Label every row as train/valid/test once per task:
#   split_2 -> 2-class problem (target: recommend_to_a_friend)
#   split_5 -> 5-class problem (target: overall_rating)
for split_column, target_column in (('split_2', 'recommend_to_a_friend'), ('split_5', 'overall_rating')):
    data[split_column] = 'train'
    x_train, x_test, y_train, y_test = train_test_split(
        data, data[target_column].values, test_size=holdout_fraction, random_state=seed
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_test, y_test, test_size=test_share_of_holdout, random_state=seed
    )
    # Rows keep their original index through the split, so it can be used to write the labels back
    data.loc[x_val.index.values, split_column] = 'valid'
    data.loc[x_test.index.values, split_column] = 'test'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frederico.souza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\frederico.souza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Preview the loaded reviews together with the split_2 / split_5 labels assigned above
data

Unnamed: 0,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state,split_2,split_5
0,2018-01-01 00:11:28,d0fb1ca69422530334178f5c8624aa7a99da47907c44de...,132532965,Notebook Asus Vivobook Max X541NA-GO472T Intel...,,Informática,Notebook,Bom,4,Yes,Estou contente com a compra entrega rápida o ú...,1958.0,F,RJ,train,train
1,2018-01-01 00:13:48,014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...,22562178,Copo Acrílico Com Canudo 500ml Rocie,,Utilidades Domésticas,"Copos, Taças e Canecas","Preço imbatível, ótima qualidade",4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ...",1996.0,M,SC,train,train
2,2018-01-01 00:26:02,44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...,113022329,Panela de Pressão Elétrica Philips Walita Dail...,philips walita,Eletroportáteis,Panela Elétrica,ATENDE TODAS AS EXPECTATIVA.,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...,1984.0,M,SP,train,train
3,2018-01-01 00:35:54,ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...,113851581,Betoneira Columbus - Roma Brinquedos,roma jensen,Brinquedos,Veículos de Brinquedo,presente mais que desejado,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...,1985.0,F,SP,valid,valid
4,2018-01-01 01:00:28,7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...,131788803,"Smart TV LED 43"" LG 43UJ6525 Ultra HD 4K com C...",lg,TV e Home Theater,TV,"Sem duvidas, excelente",5,Yes,"A entrega foi no prazo, as americanas estão de...",1994.0,M,MG,train,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132368,2018-05-31 23:30:50,15f20e95ff44163f3175aaf67a5ae4a94d5030b409e521...,17962233,Carregador De Pilha Sony + 4 Pilhas Aa 2500mah,,Câmeras e Filmadoras,Acessórios para Câmeras e Filmadoras,Ótimo produto!,5,Yes,"Vale muito, estou usando no controle do Xbox e...",1988.0,M,RS,test,test
132369,2018-05-31 23:42:25,def7cf9028b0673ab8bca3b1d06e085461fafb88cd48d9...,132631701,Mop Giratório Fit + Refil Extra - At Home,,Utilidades Domésticas,Material de Limpeza,Sensacional,5,Yes,"Prático e barato, super indico o produto para ...",1979.0,F,SP,test,test
132370,2018-05-31 23:44:16,7bcbf542f5d7dd9a9a192a6805adba7a7a4c1ce3bf00df...,16095859,Fita Led 5m Rgb 3528 Siliconada Com 300 Leds C...,,Automotivo,Iluminação,Ótimo produto,4,Yes,Chegou antes do prazo previsto e corresponde a...,1979.0,F,PR,train,train
132371,2018-05-31 23:46:48,e6fb0b19277d01c2a300c7837a105f3c369377e92f9c19...,6774907,Etiquetas Jurídicas Vade Mecum - Marca Fácil,marca facil,Papelaria,Material de Escritório,O produto não é bom.,1,No,"Material fraco, poderia ser melhor. Ficou deve...",1991.0,M,RJ,train,train


In [2]:
# Checking target distribution among splits
def _split_label_distribution(frame, split_column, target_column, split_names=('train', 'valid', 'test')):
    """Build a table with the percentage of each target label inside every split.

    Args:
        frame: DataFrame holding both the split labels and the target.
        split_column: name of the column containing the split label of each row.
        target_column: name of the column containing the target label.
        split_names: split labels to report, one output column per entry.

    Returns:
        DataFrame with a 'label' column followed by one percentage column per split.
    """
    per_split = []
    for split_name in split_names:
        labels = frame.loc[frame[split_column] == split_name, target_column]
        # Divide by the number of rows in the split (not by the number of
        # counted labels), so rows with a missing target still weigh in and a
        # column may sum to slightly less than 100.
        percentages = 100.0 * labels.value_counts() / len(labels)
        # Name the Series explicitly: the name value_counts() gives its result
        # differs between pandas versions, so renaming by the old name is fragile.
        per_split.append(percentages.rename(split_name))

    table = pd.concat(per_split, axis=1)
    # Same reason: set the index name here instead of relying on reset_index()
    # producing a column called 'index'.
    table.index.name = 'label'
    return table.reset_index()


check_distribution_label_2 = _split_label_distribution(data, 'split_2', 'recommend_to_a_friend')
check_distribution_label_5 = _split_label_distribution(data, 'split_5', 'overall_rating')

display(check_distribution_label_2)
display(check_distribution_label_5)

Unnamed: 0,label,train,valid,test
0,Yes,72.797409,72.818614,72.80556
1,No,27.19126,27.166276,27.164224


Unnamed: 0,label,train,valid,test
0,5,36.148936,36.450857,36.6294
1,4,24.526431,24.038679,24.097296
2,1,20.674611,20.548463,20.811301
3,3,12.282574,12.714361,12.275268
4,2,6.367448,6.247639,6.186735


In [206]:
def nltk_tokenization(text, is_stemmed=False, remove_stopwords=False, remove_emojis=False):
    """Clean a raw review string and tokenize it with NLTK's Portuguese word tokenizer.

    The text goes through, in order: URL removal, number normalization (every
    number becomes "0"), replacement of punctuation/special characters by
    spaces, normalization of currency markers to "$", emoji handling and
    tokenization. Each token is then lowercased, passed through ``unidecode``
    and stripped.

    Args:
        text: string to process.
        is_stemmed: when true, each token is stemmed with the RSLP stemmer.
        remove_stopwords: when true, tokens found in ``portuguese_stopwords``
            are dropped (the comparison is done on accent-folded forms).
        remove_emojis: when true, emojis are replaced by a space; otherwise
            they are replaced by their name in ``UNICODE_EMOJI`` without colons.

    Returns:
        List of token strings, without empty strings or '[?]' entries.
    """
    text = re.sub(r"(http|www)\S+", "", text)  # remove URL
    # Normalize numbers to zero. Only '.' or ',' followed by more digits is
    # treated as part of the number; an unescaped '.' would also swallow any
    # character that happens to follow the digits (e.g. the "x" in "2x").
    text = re.sub(r"\d+(?:[.,]\d+)*", " 0 ", text)
    text = re.sub(r"[\⛤\¿\—\…\’\•\¡\°\º\´\!\"\#\%\&\\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~]{1,}", " ", text)  # remove special characters
    text = re.sub(r"(?i)r? ?\${1,}", " $ ", text)  # normalize $
    for emoji_char, emoji_name in UNICODE_EMOJI.items():
        if remove_emojis:
            replacement = ' '
        else:
            # Keep the emoji as a word: its name with the surrounding colons removed
            replacement = ' ' + emoji_name.replace(':', '') + ' '
        text = text.replace(emoji_char, replacement)

    tokens_list = nltk.tokenize.word_tokenize(text, language='portuguese', preserve_line=False)
    tokenized_text = []
    for token in tokens_list:
        if is_stemmed:
            token = rslps_stemmer.stem(token)
        tokenized_text.append(unidecode(token.lower()).strip())

    if remove_stopwords:
        # Tokens are already lowercased and accent-folded, so fold the stopword
        # list the same way; otherwise any accented stopword (e.g. "não") can
        # never match its folded token ("nao"). A set also makes lookups O(1).
        folded_stopwords = {unidecode(word.lower()) for word in portuguese_stopwords}
        tokenized_text = [token for token in tokenized_text if token not in folded_stopwords]

    # Drop empty tokens and '[?]' (presumably unidecode's marker for characters it cannot map)
    return [token for token in tokenized_text if token not in ('', '[?]')]

def spacy_tokenization(text, is_lemmatized=False, remove_stopwords=False, remove_emojis=False):
    """Clean a raw review string and tokenize it with the loaded spaCy pipeline.

    The text goes through, in order: URL removal, number normalization (every
    number becomes "0"), replacement of punctuation/special characters by
    spaces, normalization of currency markers to "$", emoji handling and
    tokenization by ``nlp`` (called with tagger, parser and ner disabled).
    Each token is then lowercased, passed through ``unidecode`` and stripped.

    Args:
        text: string to process.
        is_lemmatized: when true, the token's ``lemma_`` is used instead of its text.
        remove_stopwords: when true, tokens found in ``portuguese_stopwords``
            are dropped (the comparison is done on accent-folded forms).
        remove_emojis: when true, emojis are replaced by a space; otherwise
            they are replaced by their name in ``UNICODE_EMOJI`` without colons.

    Returns:
        List of token strings, without empty strings or '[?]' entries.
    """
    text = re.sub(r"(http|www)\S+", "", text)  # remove URL
    # Normalize numbers to zero. Only '.' or ',' followed by more digits is
    # treated as part of the number; an unescaped '.' would also swallow any
    # character that happens to follow the digits (e.g. the "x" in "2x").
    text = re.sub(r"\d+(?:[.,]\d+)*", " 0 ", text)
    text = re.sub(r"[\⛤\¿\—\…\’\•\¡\°\º\´\!\"\#\%\&\\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~]{1,}", " ", text)  # remove special characters
    text = re.sub(r"(?i)r? ?\${1,}", " $ ", text)  # normalize $
    for emoji_char, emoji_name in UNICODE_EMOJI.items():
        if remove_emojis:
            replacement = ' '
        else:
            # Keep the emoji as a word: its name with the surrounding colons removed
            replacement = ' ' + emoji_name.replace(':', '') + ' '
        text = text.replace(emoji_char, replacement)

    doc = nlp(text, disable=['tagger','parser','ner'])
    tokenized_text = []
    for token in doc:
        raw_token = token.lemma_ if is_lemmatized else token.text
        tokenized_text.append(unidecode(raw_token.lower()).strip())

    if remove_stopwords:
        # Tokens are already lowercased and accent-folded, so fold the stopword
        # list the same way; otherwise any accented stopword (e.g. "não") can
        # never match its folded token ("nao"). A set also makes lookups O(1).
        folded_stopwords = {unidecode(word.lower()) for word in portuguese_stopwords}
        tokenized_text = [token for token in tokenized_text if token not in folded_stopwords]

    # Drop empty tokens and '[?]' (presumably unidecode's marker for characters it cannot map)
    return [token for token in tokenized_text if token not in ('', '[?]')]
    
# i = 1
# print(data.review_text.values[i])
# print('\n')
# print(nltk_tokenization(data.review_text.values[i], is_stemmed=False, remove_stopwords=True, remove_emojis=False))
# print('\n')
# print(spacy_tokenization(data.review_text.values[i], is_lemmatized=False, remove_stopwords=True, remove_emojis=False))

In [214]:
# Sanity check: print every review for which the NLTK and spaCy pipelines
# produce different token lists (index, NLTK tokens, spaCy tokens).
for i, review in enumerate(tqdm(data.review_text.values)):
    nltk_tokens = nltk_tokenization(review, is_stemmed=False, remove_stopwords=True)
    spacy_tokens = spacy_tokenization(review, is_lemmatized=False, remove_stopwords=True)
    if nltk_tokens != spacy_tokens:
        print(i)
        print(nltk_tokens)
        print('\n')
        print(spacy_tokens)
        print('\n')

100%|██████████| 132373/132373 [06:49<00:00, 323.23it/s]


In [None]:
def custom_tokenizer(text, normalization_type=None, remove_stopwords=False, remove_emojis=False):
    """Clean a raw review string and tokenize it with spaCy, optionally lemmatizing or stemming.

    The text goes through, in order: URL removal, number normalization (every
    number becomes "0"), replacement of punctuation/special characters by
    spaces, normalization of currency markers to "$", emoji handling and
    tokenization by ``nlp`` (called with tagger, parser and ner disabled).
    Each token is then lowercased, passed through ``unidecode`` and stripped.

    Args:
        text: string to process.
        normalization_type: 'lemma' uses the spaCy ``lemma_`` of each token,
            'stem' applies the RSLP stemmer to the token text, any other
            value keeps the token text as is.
        remove_stopwords: when true, tokens found in ``portuguese_stopwords``
            are dropped (the comparison is done on accent-folded forms).
        remove_emojis: when true, emojis are replaced by a space; otherwise
            they are replaced by their name in ``UNICODE_EMOJI`` without colons.

    Returns:
        List of token strings, without empty strings or '[?]' entries.
    """
    text = re.sub(r"(http|www)\S+", "", text)  # remove URL
    # Normalize numbers to zero. Only '.' or ',' followed by more digits is
    # treated as part of the number; an unescaped '.' would also swallow any
    # character that happens to follow the digits (e.g. the "x" in "2x").
    text = re.sub(r"\d+(?:[.,]\d+)*", " 0 ", text)
    text = re.sub(r"[\⛤\¿\—\…\’\•\¡\°\º\´\!\"\#\%\&\\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~]{1,}", " ", text)  # remove special characters
    text = re.sub(r"(?i)r? ?\${1,}", " $ ", text)  # normalize $
    for emoji_char, emoji_name in UNICODE_EMOJI.items():
        if remove_emojis:
            replacement = ' '
        else:
            # Keep the emoji as a word: its name with the surrounding colons removed
            replacement = ' ' + emoji_name.replace(':', '') + ' '
        text = text.replace(emoji_char, replacement)

    doc = nlp(text, disable=['tagger','parser','ner'])
    tokenized_text = []
    for token in doc:
        if normalization_type == 'lemma':
            raw_token = token.lemma_
        elif normalization_type == 'stem':
            raw_token = rslps_stemmer.stem(token.text)
        else:
            raw_token = token.text
        tokenized_text.append(unidecode(raw_token.lower()).strip())

    if remove_stopwords:
        # Tokens are already lowercased and accent-folded, so fold the stopword
        # list the same way; otherwise any accented stopword (e.g. "não") can
        # never match its folded token ("nao"). A set also makes lookups O(1).
        folded_stopwords = {unidecode(word.lower()) for word in portuguese_stopwords}
        tokenized_text = [token for token in tokenized_text if token not in folded_stopwords]

    # Drop empty tokens and '[?]' (presumably unidecode's marker for characters it cannot map)
    return [token for token in tokenized_text if token not in ('', '[?]')]

In [211]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Materialize review texts and targets as arrays, one per split.
# The *_2 arrays follow the split_2 labels (target: recommend_to_a_friend),
# the *_5 arrays follow the split_5 labels (target: overall_rating).
X_train_2, X_valid_2, X_test_2 = (
    data.loc[data.split_2 == part, 'review_text'].values for part in ('train', 'valid', 'test')
)
y_train_2, y_valid_2, y_test_2 = (
    data.loc[data.split_2 == part, 'recommend_to_a_friend'].values for part in ('train', 'valid', 'test')
)

X_train_5, X_valid_5, X_test_5 = (
    data.loc[data.split_5 == part, 'review_text'].values for part in ('train', 'valid', 'test')
)
y_train_5, y_valid_5, y_test_5 = (
    data.loc[data.split_5 == part, 'overall_rating'].values for part in ('train', 'valid', 'test')
)