In [1]:
import sys
sys.path.append('../preprocessing/')

import pandas as pd
import numpy as np
from preprocess import *

# Load the raw B2W reviews dataset (path is relative to this notebook).
df = pd.read_csv('../datasets/B2W-Reviews01.csv')

# Uncomment to iterate quickly on a 1000-row sample instead of the full file:
#df = df.head(1000)

# Names of the preprocessing steps to run; the log below shows them applied in this order.
# The trailing comment lists steps that are currently disabled.
preprocessing_functions = ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords',
                           'remove_specific_phrases']#, 'perform_lemmatization', 'perform_stemming', 'teste']

# Run the steps over the 'review_text' column using Portuguese language settings.
# verbose=1 prints one "Preprocess --> <step>" progress block per step.
processed_df = preprocess_data(data=df, 
                               preprocessing_funcs=preprocessing_functions, 
                               language='portuguese', 
                               column='review_text', verbose=1)

# Side-by-side view of the raw text and the cleaned text stored in the 'prep' column.
display(processed_df[['review_text', 'prep']])

# Vectorize the preprocessed data with TF-IDF.
# NOTE(review): presumably returns the fitted vectorizer and the vectorized frame --
# confirm against preprocess.vectorization.
vector, processed_df2 = vectorization(data=processed_df, verbose=1, vectorizer='tf-idf')


Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords', 'remove_specific_phrases'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!

Preprocess --> remove_specific_phrases
Removing phrases from text...
Done!




Unnamed: 0,review_text,prep
0,Estou contente com a compra entrega rápida o ú...,contente compra entrega rapida unico problema ...
1,"Por apenas R$1994.20,eu consegui comprar esse ...",apenas r 1994 20 consegui comprar lindo copo a...
2,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...,supera agilidade praticidade outras panelas el...
3,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...,filho amou parece verdade tantos detalhes
4,"A entrega foi no prazo, as americanas estão de...",entrega prazo americanas parabens smart tv boa...
...,...,...
132368,"Vale muito, estou usando no controle do Xbox e...",vale usando controle xbox durou semana carga p...
132369,"Prático e barato, super indico o produto para ...",pratico barato super indico produto corre dia ...
132370,Chegou antes do prazo previsto e corresponde a...,chegou antes prazo previsto corresponde anuncio
132371,"Material fraco, poderia ser melhor. Ficou deve...",material fraco poderia melhor ficou devendo op...



Applying tf-idf vectorizer...
Done!




# Using a Pipeline

In [3]:
from NLPipeline import *

# Raw reviews used to fit the pipeline
df = pd.read_csv('../datasets/B2W-Reviews01.csv')

# Pipeline configuration: preprocessing step names and the vectorizer to train
preprocessing_funcs = [
    'transform_to_lowercase',
    'remove_special_characters',
    'remove_stopwords',
    'remove_specific_phrases',
]
vectorizer = 'word2vec'

# NOTE(review): no language is passed here, whereas the preprocess_data call in the
# first cell uses language='portuguese'. The preprocess-only output further down keeps
# "muito", which the Portuguese run removes -- confirm which stopword language the
# pipeline uses and whether NLP_helper accepts a language argument.

# Build the pipeline for the 'review_text' column and fit it on the reviews in one step
pipeline = NLP_helper(
    preprocessing_funcs=preprocessing_funcs,
    vectorizer=vectorizer,
    column='review_text',
).fit(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Show the fitted pipeline: its 'preprocess' and 'vectorize' steps and their parameters.
pipeline

Pipeline(steps=[('preprocess',
                 PreprocessDataTransformer(column='review_text',
                                           preprocessing_funcs=['transform_to_lowercase',
                                                                'remove_special_characters',
                                                                'remove_stopwords',
                                                                'remove_specific_phrases'])),
                ('vectorize',
                 VectorizationTransformer(vector=<gensim.models.word2vec.Word2Vec object at 0x7f6e319570a0>,
                                          vectorizer='word2vec'))])

## Testing with a phrase (string):

In [5]:
# A single string is accepted; the result is one row with feature columns 0-99.
pipeline.transform('Produto muito bom, gostei demais')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.115804,0.490362,0.044363,0.333982,0.75633,-0.071293,0.751959,2.095208,-0.98293,-0.813508,...,-0.419598,1.505069,0.356586,0.217078,1.358626,0.846513,-2.072424,0.107777,1.041514,0.163294


## Testing with a list:

In [6]:
# A list of strings is accepted too; the result has one row per input text, in order.
pipeline.transform(
    ['Produto muito bom, gostei demais', 
     'Odiei péssimo atendimento e oferta']
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.115804,0.490362,0.044363,0.333982,0.75633,-0.071293,0.751959,2.095208,-0.98293,-0.813508,...,-0.419598,1.505069,0.356586,0.217078,1.358626,0.846513,-2.072424,0.107777,1.041514,0.163294
1,0.934772,0.64163,-0.094007,0.076443,0.504954,-0.254416,0.142825,0.840631,-0.005801,-0.135815,...,-0.490219,-0.381241,0.269133,-0.196793,0.774624,0.818078,-0.439028,-0.596835,0.32747,0.382993


## Testing with a DataFrame:

In [7]:
# The same two reviews used in the list example, wrapped in a frame under
# 'review_text' -- the column the pipeline was built for.
sample_reviews = [
    'Produto muito bom, gostei demais',
    'Odiei péssimo atendimento e oferta',
]

# Note: this rebinds `df`, replacing the full dataset loaded earlier.
df = pd.DataFrame({'review_text': sample_reviews})

pipeline.transform(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.115804,0.490362,0.044363,0.333982,0.75633,-0.071293,0.751959,2.095208,-0.98293,-0.813508,...,-0.419598,1.505069,0.356586,0.217078,1.358626,0.846513,-2.072424,0.107777,1.041514,0.163294
1,0.934772,0.64163,-0.094007,0.076443,0.504954,-0.254416,0.142825,0.840631,-0.005801,-0.135815,...,-0.490219,-0.381241,0.269133,-0.196793,0.774624,0.818078,-0.439028,-0.596835,0.32747,0.382993


### Using the Pipeline steps separately:

In [14]:
# Run only the 'preprocess' step; the result keeps the raw text and adds the cleaned 'prep' column.
# NOTE(review): "muito" survives below even though remove_stopwords is enabled, while the
# Portuguese run in the first cell drops it -- check which stopword language this step uses.
only_preprocess = pipeline.named_steps['preprocess'].transform('Produto muito bom, gostei demais')
display(only_preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fernando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review_text,prep
0,"Produto muito bom, gostei demais",produto muito bom gostei demais


In [16]:
# Feed the preprocessed frame to the 'vectorize' step alone; the row matches what the
# full pipeline returned for the same phrase.
only_vectorize = pipeline.named_steps['vectorize'].transform(only_preprocess)
only_vectorize

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.115804,0.490362,0.044363,0.333982,0.75633,-0.071293,0.751959,2.095208,-0.98293,-0.813508,...,-0.419598,1.505069,0.356586,0.217078,1.358626,0.846513,-2.072424,0.107777,1.041514,0.163294


### Using vector-specific functions:
* Here, we call `most_similar` on the fitted Word2Vec model's word vectors (`.wv`) to list the words closest to a given term.

In [18]:
# The step's `vector` attribute holds the fitted gensim Word2Vec model (see the pipeline
# repr above), so despite its name `vector_pipeline` is a model, not a pipeline.
vector_pipeline = pipeline.named_steps['vectorize'].vector
# Query the model's word vectors for the terms closest to 'comprei'; the result is a
# list of (word, similarity score) pairs, best match first.
similar_words = vector_pipeline.wv.most_similar('comprei')
print(similar_words)

[('adquiri', 0.7210985422134399), ('compramos', 0.6766924262046814), ('encomendei', 0.595485270023346), ('compro', 0.5833603739738464), ('escolhi', 0.5779616832733154), ('pedi', 0.5587214827537537), ('dei', 0.5285062789916992), ('peguei', 0.518151581287384), ('comprado', 0.5003346800804138), ('usava', 0.4931623637676239)]
