In [1]:
# Standard libs
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# DataPrep
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import re
# import spacy
# nlp = spacy.load('en_core_web_sm')

# Modelling
from sklearn.model_selection import train_test_split
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB


# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Importing Dataset 

In [2]:
# Load the raw B2W reviews CSV (path is relative to the notebook's working directory).
# NOTE(review): the saved output of this cell echoes the read_csv line, which looks like
# the tail of a truncated pandas warning — re-run to confirm what it was.
df = pd.read_csv('../data/raw/B2W-Reviews01.csv')
# Preview the first five rows.
df.head(5)

  df = pd.read_csv('../data/raw/B2W-Reviews01.csv')


Unnamed: 0,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state
0,2018-01-01 00:11:28,d0fb1ca69422530334178f5c8624aa7a99da47907c44de...,132532965,Notebook Asus Vivobook Max X541NA-GO472T Intel...,,Informática,Notebook,Bom,4,Yes,Estou contente com a compra entrega rápida o ú...,1958.0,F,RJ
1,2018-01-01 00:13:48,014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...,22562178,Copo Acrílico Com Canudo 500ml Rocie,,Utilidades Domésticas,"Copos, Taças e Canecas","Preço imbatível, ótima qualidade",4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ...",1996.0,M,SC
2,2018-01-01 00:26:02,44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...,113022329,Panela de Pressão Elétrica Philips Walita Dail...,philips walita,Eletroportáteis,Panela Elétrica,ATENDE TODAS AS EXPECTATIVA.,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...,1984.0,M,SP
3,2018-01-01 00:35:54,ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...,113851581,Betoneira Columbus - Roma Brinquedos,roma jensen,Brinquedos,Veículos de Brinquedo,presente mais que desejado,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...,1985.0,F,SP
4,2018-01-01 01:00:28,7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...,131788803,"Smart TV LED 43"" LG 43UJ6525 Ultra HD 4K com C...",lg,TV e Home Theater,TV,"Sem duvidas, excelente",5,Yes,"A entrega foi no prazo, as americanas estão de...",1994.0,M,MG


## Pre-trained model: VADER

[Multilanguage sentiment analysis paper](https://homepages.dcc.ufmg.br/~fabricio/download/brasnam15-multi.pdf)

In [3]:
# Pre-processing
# Regex-based text-cleaning helpers

def apply_regex(corpus, regex):
    """Replace every match of ``regex`` with a single space in each text.

    Parameters
    ----------
    corpus : iterable of str
        Texts to clean.
    regex : str
        Pattern handed to ``re.sub``.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    cleaned = []
    for text in corpus:
        cleaned.append(re.sub(regex, ' ', text))
    return cleaned

def multiple_regex(corpus, regex_list):
    """Lowercase the texts, tag negations, then apply each cleaning regex in order.

    Parameters
    ----------
    corpus : iterable of str
        Raw texts. Any iterable is accepted (list, tuple, pandas Series, ...);
        every element must be a string.
    regex_list : iterable of str
        Patterns passed one at a time, in order, to ``apply_regex``.

    Returns
    -------
    list of str
        Cleaned texts, in input order.
    """
    # Lowercase with a plain comprehension so the function is not tied to pandas Series input
    corpus = [text.lower() for text in corpus]
    # Negation: collapse the spellings of "não" into one explicit token.
    # NOTE(review): the pattern has no word boundaries, so it also fires inside
    # longer words that contain "nao"/"não" — confirm that is acceptable.
    corpus = [re.sub('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' negação ', text) for text in corpus]
    # Basic regexes, applied sequentially (order matters: later patterns see earlier replacements)
    for regex in regex_list:
        corpus = apply_regex(corpus, regex)
    return corpus

def data_cleaning(df):
    """Build the cleaned label/text frame from the raw reviews.

    Steps, in order: drop exact duplicate rows, prepend the review title to the
    review text, drop the columns listed in ``cols``, drop rows with any missing
    value, reset the index, and normalise the text with ``multiple_regex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews. Must contain ``review_text`` and every column named in
        ``cols``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index holding the columns that were not
        dropped; ``review_text`` contains the cleaned, whitespace-stripped text.
    """
    cols = ['submission_date', 'reviewer_id', 'product_id', 'product_name', 'product_brand',
            'site_category_lv1', 'site_category_lv2', 'overall_rating', 'reviewer_birth_year',
            'reviewer_gender', 'reviewer_state', 'review_title']
    # URLs, mentions, hashtags, digits, non-word characters, then whitespace runs.
    regex_list = [r'www\S+', r'http\S+', r'@\S+', r'#\S+', r'[0-9]+', r'\W', r'\s+']

    # Every step returns a new frame, so the caller's `df` is never touched.
    cleaned = (
        df.drop_duplicates()
          .assign(review_text=lambda d: d['review_title'] + ' ' + d['review_text'])
          .drop(columns=cols)
          .dropna()  # a missing title or text makes the concatenation NaN, so the row is dropped here
          .reset_index(drop=True)
    )

    # The former r'[ \t]+$' pattern could not trim anything because every match is
    # replaced by ' '; strip the edges explicitly instead.
    cleaned['review_text'] = [
        text.strip() for text in multiple_regex(cleaned['review_text'], regex_list)
    ]

    return cleaned

In [4]:
%%time

# data_cleaning works on its own copy of the frame, so the raw `df` is passed as-is
# (an extra df.copy() here would only duplicate the whole dataset a second time).
df_vader = data_cleaning(df)
df_vader.head(5)

CPU times: user 3.99 s, sys: 107 ms, total: 4.09 s
Wall time: 4.09 s


Unnamed: 0,recommend_to_a_friend,review_text
0,Yes,bom estou contente com a compra entrega rápida...
1,Yes,preço imbatível ótima qualidade por apenas r e...
2,Yes,atende todas as expectativa supera em agilidad...
3,Yes,presente mais que desejado meu filho amou pare...
4,Yes,sem duvidas excelente a entrega foi no prazo a...


In [5]:
%%time

# Score every cleaned review with NLTK's VADER analyzer; each entry of `scores`
# is the dict returned by polarity_scores.
# NOTE(review): the saved output further down shows neu=1.0 / compound=0.0 for most of
# the displayed rows — presumably because the VADER lexicon is English while the
# reviews are Portuguese; confirm before relying on these scores.
sid = SentimentIntensityAnalyzer()
df_vader['scores'] = df_vader['review_text'].apply(sid.polarity_scores)

CPU times: user 21.8 s, sys: 21.3 ms, total: 21.8 s
Wall time: 21.8 s


In [6]:
# Pull the 'compound' value out of each VADER score dict.
df_vader['compound'] = [score_dict['compound'] for score_dict in df_vader['scores']]  # type: ignore
# A non-negative compound score (including a neutral 0.0) is read as a recommendation.
is_non_negative = df_vader['compound'] >= 0
df_vader['comp_score'] = is_non_negative.map({True: 'Yes', False: 'No'})  # type: ignore
df_vader.head(5)  # type: ignore

Unnamed: 0,recommend_to_a_friend,review_text,scores,compound,comp_score
0,Yes,bom estou contente com a compra entrega rápida...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Yes
1,Yes,preço imbatível ótima qualidade por apenas r e...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Yes
2,Yes,atende todas as expectativa supera em agilidad...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Yes
3,Yes,presente mais que desejado meu filho amou pare...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Yes
4,Yes,sem duvidas excelente a entrega foi no prazo a...,"{'neg': 0.063, 'neu': 0.86, 'pos': 0.077, 'com...",0.128,Yes


In [7]:
def evaluate(y_test, predictions):
    """Print accuracy, the classification report and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('accuracy: ', metrics.accuracy_score(y_test, predictions))
    # Each remaining report is preceded by the same separator line.
    for report in (metrics.classification_report, metrics.confusion_matrix):
        print('\n', '------------------', '\n')
        print(report(y_test, predictions))
          
# Compare the VADER-derived labels with the recommendation flag over the whole cleaned frame.
evaluate(y_test=df_vader['recommend_to_a_friend'], predictions=df_vader['comp_score']) # type: ignore

accuracy:  0.7191948867809536

 ------------------ 

              precision    recall  f1-score   support

          No       0.43      0.23      0.30     33328
         Yes       0.77      0.89      0.82     94654

    accuracy                           0.72    127982
   macro avg       0.60      0.56      0.56    127982
weighted avg       0.68      0.72      0.69    127982


 ------------------ 

[[ 7739 25589]
 [10349 84305]]


## ML classification models
### Base Model

In [8]:
# Create Feature and Label sets
# data_cleaning works on its own copy of the frame, so no extra df.copy() is needed here.
df_ml = data_cleaning(df)
X = df_ml['review_text']
y = df_ml['recommend_to_a_friend']

In [9]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for label, split in (('Training Data Size:', X_train), ('Testing Data Size: ', X_test)):
    print(label, len(split))

Training Data Size: 102385
Testing Data Size:  25597


In [10]:
pt_stopwords = stopwords.words('portuguese')

# Both vectorizers share one configuration, so it is written once.
vectorizer_kwargs = dict(lowercase=True, min_df=2, max_df=0.95, ngram_range=(1, 3),
                         stop_words=pt_stopwords)
count_vect = CountVectorizer(**vectorizer_kwargs)
tfidf_vect = TfidfVectorizer(**vectorizer_kwargs)

# The vectorizers only read the training texts, so X_train is passed directly
# instead of through throw-away copies.
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_count = count_vect.fit_transform(X_train)

In [11]:
# Baseline: linear SVM trained on the TF-IDF features (fit returns the estimator itself).
clf = LinearSVC().fit(X_train_tfidf, y_train)
# Vectorize the held-out texts with the already-fitted TF-IDF vocabulary, then predict.
X_test_tfidf = tfidf_vect.transform(X_test)
predictions = clf.predict(X_test_tfidf)

In [12]:
# Report the baseline LinearSVC metrics on the held-out split.
evaluate(y_test, predictions)

accuracy:  0.9348361136070633

 ------------------ 

              precision    recall  f1-score   support

          No       0.88      0.87      0.87      6599
         Yes       0.96      0.96      0.96     18998

    accuracy                           0.93     25597
   macro avg       0.92      0.91      0.91     25597
weighted avg       0.93      0.93      0.93     25597


 ------------------ 

[[ 5748   851]
 [  817 18181]]


### Grid Search

In [13]:
%%time

# Two-step pipeline; both steps are placeholders that the grid below swaps out wholesale.
pipe = Pipeline(steps=[
        ('token', TfidfVectorizer()),
        ('estimator', LinearSVC())])

# Every vectorizer is paired with every classifier (2 x 5 = 10 candidates).
# Estimators that draw random numbers are seeded so the CV scores are repeatable.
params_grid = [{'token': [TfidfVectorizer(lowercase=True, min_df=2, max_df=0.95, ngram_range=(1,3), stop_words=pt_stopwords),
                          CountVectorizer(lowercase=True, min_df=2, max_df=0.95, ngram_range=(1,3), stop_words=pt_stopwords) ],
                'estimator':[LinearSVC(random_state=42),
                             RandomForestClassifier(random_state=42),
                             MultinomialNB(),
                             LogisticRegression(),
                             MLPClassifier(random_state=42)],
                },
                ]

# Model selection is driven by the F1 score of the "Yes" class.
f1_scorer = metrics.make_scorer(metrics.f1_score, pos_label="Yes")
grid = GridSearchCV(pipe, params_grid, cv=5, verbose=1, scoring = f1_scorer)
grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated value of the scorer passed above,
# i.e. an F1 score — not accuracy and not recall — so name and label it as such.
best_f1 = grid.best_score_
best_parameters = grid.best_params_
print("Best F1 (Yes): {:.2f} %".format(best_f1*100))
print("Best Parameters:", best_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [None]:
# classification_report returns a pre-formatted multi-line string; print it so the table
# renders properly instead of showing the escaped repr of a bare expression.
print(metrics.classification_report(y_test, grid.best_estimator_.predict(X_test))) # type: ignore

## (TO-DO) Create Pipelines

In [79]:
class DataCleaning(BaseEstimator, TransformerMixin):
    """Stateless transformer that merges review title and body and normalises the text.

    Expects a DataFrame with ``review_title`` and ``review_text`` columns and
    returns a new DataFrame holding only the cleaned ``review_text`` column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``review_title`` and ``review_text``.

        Returns
        -------
        pandas.DataFrame
            Frame with a fresh 0..n-1 index and a cleaned ``review_text`` column
            (``review_title`` is removed).
        """
        # Work on a copy so the caller's frame is never modified.
        X = X.copy()
        # Prepend the title to the body so both contribute to the text features.
        X['review_text'] = X['review_title'] + ' ' + X['review_text']
        # NOTE(review): dropping duplicates changes the number of rows, so the output
        # no longer lines up row-for-row with a separately held `y` — revisit before
        # placing this step in front of an estimator.
        X = X.drop_duplicates().drop(columns=['review_title']).reset_index(drop=True)

        text = X['review_text'].str.lower()
        # Surround the token with spaces so it never fuses with neighbouring words
        # (the ' n ' alternative consumes the spaces around it).
        text = text.str.replace('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' negação ', regex=True)
        # URLs, mentions, hashtags, digits, non-word characters, then whitespace runs.
        regex_list = [r'www\S+', r'http\S+', r'@\S+', r'#\S+', r'[0-9]+', r'\W', r'\s+']
        for regex in regex_list:
            text = text.str.replace(regex, ' ', regex=True)
        # Replacing r'[ \t]+$' with ' ' could never trim anything; strip the edges explicitly.
        X['review_text'] = text.str.strip()
        return X

In [80]:
text_vars = ['review_text', 'review_title']

text_pipeline = Pipeline([('data_cleaning', DataCleaning())])

# Only the text columns are routed through the cleaning step; everything else is dropped.
data_pipeline = ColumnTransformer(
    [
    ('text', text_pipeline, text_vars),
    ],
    remainder='drop'
)

# Feed the raw frame: selecting columns by name needs a DataFrame that still has both
# text columns, whereas `X` from the modelling section is the already-cleaned
# review_text Series (no review_title), so this cell could not run on a fresh kernel.
text_transformed = data_pipeline.fit_transform(df)
text_transformed

array([['bom estou contente com a compra entrega rápida o único problema com as americanas é se houver troca ou devolução do produto o consumidor tem problemas com espera '],
       ['preço imbatível ótima qualidade por apenas r eu consegui comprar esse lindo copo de acrílico '],
       ['atende todas as expectativa supera em agilidade e praticidade outras panelas elétricas costumo usar outra panela para cozimento de arroz japonesa mas leva muito tempo minutos nessa panela é muito mais rápido exatamente minutos eu recomendo '],
       ...,
       ['ótimo produto chegou antes do prazo previsto e corresponde ao anúncio'],
       ['o produto negação é bom material fraco poderia ser melhor ficou devendo na minha opinião '],
       ['produto negação entregue comprei esse produto quando chegou estava com avaria devolvi já vai fazer meses negação me enviaram o produto e nem o estorno ']],
      dtype=object)

In [None]:
# Linear SVC:

pt_stopwords = stopwords.words('portuguese')

# TF-IDF features (max_df=0.75 in this variant) feeding a linear SVM, wrapped as one estimator.
tfidf_step = TfidfVectorizer(min_df=2, max_df=0.75, ngram_range=(1,3), stop_words=pt_stopwords)
text_clf_lsvc = Pipeline(steps=[('tfidf', tfidf_step), ('clf', LinearSVC())])

text_clf_lsvc.fit(X_train, y_train)

In [None]:
# Parameters of a pipeline step are addressed as '<step>__<param>' (double underscore).
# LinearSVC exposes C but has no `kernel` or `gamma` (those belong to SVC), so only C
# is tuned; the former single-underscore keys were rejected as invalid parameters.
parameters = {'clf__C': [0.25, 0.5, 0.75, 1]}
grid_search = GridSearchCV(estimator = text_clf_lsvc,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = 2)
grid_search.fit(X_train, y_train)
# best_score_ is the mean cross-validated accuracy of the best C value.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [None]:
# Form a prediction set
# NOTE(review): this predicts with text_clf_lsvc as fitted in its own cell, not with
# grid_search.best_estimator_ — confirm that the un-tuned pipeline is what is intended.
predictions = text_clf_lsvc.predict(X_test)