In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [177]:
df = pd.read_csv('data/olist_order_reviews_dataset.csv')

In [178]:
df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [179]:
df.isnull().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       88285
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

In [180]:
df.isnull().count()

review_id                  100000
order_id                   100000
review_score               100000
review_comment_title       100000
review_comment_message     100000
review_creation_date       100000
review_answer_timestamp    100000
dtype: int64

# Using Reviews to Predict Score

Problem: Given the review text, try and predict what the user rated for the product. 

We will use the text from 
1. `review_comment_title`
2. `review_comment_message`

To predict `review_score`

We will not take into account any entry that does not have at least 1 of the columns mentioned above

Methods:
- Naive Bayes
- Logistic Regression
- LSTM

In [181]:
df = df[~df['review_comment_message'].isnull() | ~df['review_comment_title'].isnull()]

In [223]:
df = df[['review_comment_title', 'review_comment_message', 'review_score']].reset_index(drop=True)

In [224]:
df = df.fillna('')

In [225]:
df.head()

Unnamed: 0,review_comment_title,review_comment_message,review_score
0,,Recebi bem antes do prazo estipulado.,5
1,,Parabéns lojas lannister adorei comprar pela I...,5
2,recomendo,aparelho eficiente. no site a marca do aparelh...,4
3,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",4
4,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",5


# Pre-processing of Text

1. We combine the 2 columns together
2. We remove any punctuation, newlines, excess whitespace, stopwords
3. Use TF-IDF to convert words to vectors

https://towardsdatascience.com/review-rating-prediction-a-combined-approach-538c617c495c

https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908

https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

https://towardsdatascience.com/a-beginners-guide-to-sentiment-analysis-in-python-95e354ea84f6

In [226]:
df['review'] = df['review_comment_title'] + ' ' + df['review_comment_message']

In [227]:
len(df)

43482

In [228]:
# Remove stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def process_text(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    text = text.lower().strip()
#     text = [word for word in text.split() if word not in stopwords.words('portuguese')]
    return text

[nltk_data] Downloading package stopwords to \\ds8.student.main.ntu.ed
[nltk_data]     u.sg\hwl2$\student\tohh0023\Application
[nltk_data]     Data\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [188]:
# Split
df['review'] = df['review'].apply(lambda x: process_text(x))

In [229]:
df['review']

0                    Recebi bem antes do prazo estipulado.
1         Parabéns lojas lannister adorei comprar pela ...
2        recomendo aparelho eficiente. no site a marca ...
3          Mas um pouco ,travando...pelo valor ta Boa.\r\n
4        Super recomendo Vendedor confiável, produto ok...
                               ...                        
43477     Entregou dentro do prazo. O produto chegou em...
43478     O produto não foi enviado com NF, não existe ...
43479     Excelente mochila, entrega super rápida. Supe...
43480     Solicitei a compra de uma capa de retrovisor ...
43481     meu produto chegou e ja tenho que devolver, p...
Name: review, Length: 43482, dtype: object

In [230]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

In [231]:
X = pd.DataFrame()
y = pd.DataFrame()

# Pre-processing of Score

We will scale between 0 and 1

In [232]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
y = scaler.fit_transform(y)

ValueError: at least one array or dtype is required

In [233]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['review_score'], test_size=0.3)

In [234]:
train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

In [235]:
train_matrix

<30437x13853 sparse matrix of type '<class 'numpy.int64'>'
	with 332405 stored elements in Compressed Sparse Row format>

In [236]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [237]:
lr.fit(train_matrix, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [238]:
lr.score(test_matrix, y_test)

0.6840935224223841

In [242]:
y_test[22347]

5

In [253]:
df[df['review_score'] == 4]

Unnamed: 0,review_comment_title,review_comment_message,review_score,review
2,recomendo,aparelho eficiente. no site a marca do aparelh...,4,recomendo aparelho eficiente. no site a marca ...
3,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",4,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
13,Muito bom.,Recebi exatamente o que esperava. As demais en...,4,Muito bom. Recebi exatamente o que esperava. A...
28,,"Ótima loja para parceria: rápidíssima, produto...",4,"Ótima loja para parceria: rápidíssima, produt..."
50,,"Estava faltando apenas um produto, eu recebi h...",4,"Estava faltando apenas um produto, eu recebi ..."
...,...,...,...,...
43434,Muito bom,Poderia ser excelente se o suporte não tivesse...,4,Muito bom Poderia ser excelente se o suporte n...
43436,,"Chegou um pouco amassada, mas nada de mais, e ...",4,"Chegou um pouco amassada, mas nada de mais, e..."
43445,,Produto de muito boa qualidade!,4,Produto de muito boa qualidade!
43469,👍,Aprovado!,4,👍 Aprovado!


In [247]:
df.iloc[6]

review_comment_title              Não chegou meu produto 
review_comment_message                            Péssimo
review_score                                            1
review                    Não chegou meu produto  Péssimo
Name: 6, dtype: object

In [254]:
lr.predict(vectorizer.transform([df.iloc[2]['review']]))

array([5], dtype=int64)