# Analysis of Olist reviews 

Analyse the Olist reviews to understand what could be the causes of the bad review scores.

In [129]:
# import modules 

import functools
import string

import numpy as np
import pandas as pd
import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from olistpackage.data import Olist

In [None]:
# magic commands

%load_ext autoreload
%autoreload 2

In [131]:
# Instantiate the Olist class 
# whose main purpose is to download the Olist csv files.

olist = Olist()

In [132]:
# Download Olist csv files in root/data/csv folder.
# If all the csv files already exist, do nothing.
# Uses the Kaggle Official API for downloading the Olist csv files.
# Kaggle Official API credentials are asked by opendatasets.download().
# Sign in to https://kaggle.com/, then click on your profile picture
# on the top right and select "My Account" from the menu.
# Scroll down to the "API" section and click "Create New API Token".
# This will download a file kaggle.json with the following contents:
# {"username":"YOUR_KAGGLE_USERNAME","key":"YOUR_KAGGLE_KEY"}

olist.download_data()

The Olist csv files are already downloaded.


In [133]:
# get the Olist csv files data into a dictionary of dataframes
data = olist.get_data()

In [134]:
# Attach each review to its order (inner join on order_id)
df = pd.merge(
    data["order_reviews"],
    data["orders"],
    how="inner",
    on="order_id",
)

In [135]:
df.shape

(99224, 14)

In [136]:
df["review_comment_message"].count()

40977

In [137]:
df.columns

Index(['review_id', 'order_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp', 'customer_id', 'order_status',
       'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date'],
      dtype='object')

In [138]:
df.dtypes

review_id                        object
order_id                         object
review_score                      int64
review_comment_title             object
review_comment_message           object
review_creation_date             object
review_answer_timestamp          object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

In [139]:
# Per-column non-null counts for the reviews that have no comment message.
# isna() states the intent directly (the previous `col != col` trick relied on
# NaN being unequal to itself and would not catch None).
df[df["review_comment_message"].isna()].count()

review_id                        58247
order_id                         58247
review_score                     58247
review_comment_title              1729
review_comment_message               0
review_creation_date             58247
review_answer_timestamp          58247
customer_id                      58247
order_status                     58247
order_purchase_timestamp         58247
order_approved_at                58193
order_delivered_carrier_date     57688
order_delivered_customer_date    57259
order_estimated_delivery_date    58247
dtype: int64

In [140]:
# Build one text field per review: "<title> <message>",
# with a missing title or message replaced by an empty string
df["review_title_and_message"] = (
    df["review_comment_title"].fillna("")
    + " "
    + df["review_comment_message"].fillna("")
)

In [141]:
# Customers can review an order before receiving it, so keep only the
# reviews created on or after the customer delivery date.
# Both columns are object (string) dtype at this point, so the comparison is
# a string comparison.
# NOTE(review): this is chronological only if both columns use the same
# fixed-width timestamp format — confirm against the csv files.
# .copy() makes the filtered frame independent of the merged one, so that
# adding columns to it later does not raise SettingWithCopyWarning.

df = df.query("review_creation_date >= order_delivered_customer_date").copy()
df.shape

(88039, 15)

In [142]:
# Cleaning the reviews

def remove_punctuation(text=""):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Removed characters are dropped, not replaced, so words separated only by
    punctuation end up joined together.
    """
    kept = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept.append(symbol)
    return "".join(kept)

def lower_case(text=""):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_number(text=""):
    """Return ``text`` without the characters for which ``str.isdigit()`` is true."""
    return "".join(filter(lambda symbol: not symbol.isdigit(), text))

def remove_accents_diacritics(text=""):
    """Return ``text`` after passing it through ``unidecode.unidecode``.

    In this notebook it is used to replace accented letters by their plain
    ASCII counterparts (e.g. "parabéns" -> "parabens").
    """
    ascii_text = unidecode.unidecode(text)
    return ascii_text

def tokenize(text="", language="portuguese"):
    """Split ``text`` into a list of word tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.
        language: Name of the NLTK model used by ``word_tokenize`` for its
            sentence-splitting step. Defaults to "portuguese", the language
            already used for stop-word removal in this notebook (NLTK's own
            default is "english").

    Returns:
        The list of tokens, in their original order.
    """
    return word_tokenize(text, language=language)
    

@functools.lru_cache(maxsize=None)
def _stopword_set(language):
    """Load the NLTK stop-word list for ``language`` once and cache it as a frozenset."""
    return frozenset(stopwords.words(language))


def stop_words(text="", language="portuguese"):
    """Drop stop words from a sequence of tokens and join the rest with spaces.

    The stop-word list is loaded once per language and cached, instead of
    being re-read from the corpus for every single token.

    Args:
        text: Iterable of word tokens (despite the name, not a raw string).
        language: Name of the NLTK stop-word list to use.

    Returns:
        A single string with the remaining tokens separated by one space;
        an empty string when no token remains.

    NOTE(review): tokens are compared verbatim with the NLTK list, so tokens
    whose accents were already stripped presumably no longer match accented
    stop words — confirm whether that is intended.
    """
    blocked = _stopword_set(language)
    return " ".join(word for word in text if word not in blocked)

# Cleaning steps, applied in this order. tokenize turns the string into a
# list of tokens and stop_words joins the remaining tokens back into a string.
# NOTE(review): accents are stripped before stop_words runs, so accented
# stop words presumably can no longer match (words such as "nao" and "ja"
# survive in the cleaned reviews) — confirm that this order is intended.
funcs = [
    remove_punctuation,
    lower_case,
    remove_number,
    remove_accents_diacritics,
    tokenize,
    stop_words,
]


def clean(text=""):
    """Run ``text`` through every step of ``funcs`` in order and return the result."""
    cleaned = text
    for step in funcs:
        cleaned = step(cleaned)
    return cleaned

In [143]:
# Clean every combined title + message, one review at a time
df["clean_review"] = df["review_title_and_message"].map(clean)

In [144]:
df["clean_review"]

0                                                         
1                                                         
2                                                         
3                        recebi bem antes prazo estipulado
4        parabens lojas lannister adorei comprar intern...
                               ...                        
99219                                                     
99220                                                     
99221    excelente mochila entrega super rapida super r...
99222                                                     
99223    produto chegou ja devolver pois defeito nao se...
Name: clean_review, Length: 88039, dtype: object

In [145]:
# Keep only the rows whose cleaned review is not the empty string
df = df.loc[df["clean_review"].ne("")]
df.shape

(35983, 16)

In [107]:
# Check the unique values of reviews score
df['review_score'].unique()

array([4, 5, 1, 3, 2])

In [146]:
# review score distribution
round(df["review_score"].value_counts(normalize = True), 2)

5    0.57
4    0.16
1    0.13
3    0.09
2    0.05
Name: review_score, dtype: float64

More than 25% of the orders with a review have a score of 3 or below.
Let's focus on those orders' reviews.

In [149]:
# Keep only the reviews with a score of 3 or less
df = df.loc[df["review_score"].le(3)]

In [150]:
df.shape

(9599, 16)