## Let's use Pandas and Plotly to start exploring the dataset

In [16]:
import pandas as pd
import plotly.express as px

# Read the training reviews CSV into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
df_train = pd.read_csv("../data/arm-english-train.csv")
# A bare name as the last expression renders the frame (pandas truncates long frames).
df_train

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,en_0964290,product_en_0740675,reviewer_en_0342986,1,Arrived broken. Manufacturer defect. Two of th...,I'll spend twice the amount of time boxing up ...,en,furniture
1,en_0690095,product_en_0440378,reviewer_en_0133349,1,the cabinet dot were all detached from backing...,Not use able,en,home_improvement
2,en_0311558,product_en_0399702,reviewer_en_0152034,1,I received my first order of this product and ...,The product is junk.,en,home
3,en_0044972,product_en_0444063,reviewer_en_0656967,1,This product is a piece of shit. Do not buy. D...,Fucking waste of money,en,wireless
4,en_0784379,product_en_0139353,reviewer_en_0757638,1,went through 3 in one day doesn't fit correct ...,bubble,en,pc
...,...,...,...,...,...,...,...,...
199995,en_0046316,product_en_0980158,reviewer_en_0629807,5,"Cute slippers, my MIL loved them.",Nice and fit as advertised,en,shoes
199996,en_0956024,product_en_0954574,reviewer_en_0459072,5,My 6 year old likes this and keeps him engaged...,good to keep the kids engaged,en,toy
199997,en_0589358,product_en_0402982,reviewer_en_0199163,5,Replaced my battery with it. Works like new.,This works,en,wireless
199998,en_0970602,product_en_0873374,reviewer_en_0590563,5,"I like them, holding up well.",Well made.,en,industrial_supplies


In [40]:
# Known categorical columns are usually best stored with the 'category' dtype.
# There are some common pitfalls to keep in mind, though ...
# (see https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a)
# The dict form of `astype` converts only the listed column and leaves the rest as-is.
df_train = df_train.astype({"product_category": "category"})

**Part 1:** Descriptive statistics

In [27]:
# Print a per-column overview (non-null count and dtype) plus the frame's memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   review_id         200000 non-null  object  
 1   product_id        200000 non-null  object  
 2   reviewer_id       200000 non-null  object  
 3   stars             200000 non-null  int64   
 4   review_body       200000 non-null  object  
 5   review_title      199977 non-null  object  
 6   language          200000 non-null  object  
 7   product_category  200000 non-null  category
dtypes: category(1), int64(1), object(6)
memory usage: 10.9+ MB


In [28]:
# Summary statistics for every column: include="all" adds count/unique/top/freq for
# the non-numeric columns next to the usual numeric summary (mean, std, quantiles).
df_train.describe(include="all")

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
count,200000,200000,200000,200000.0,200000,199977,200000,200000
unique,200000,185541,196745,,199426,137403,1,31
top,en_0964290,product_en_0184780,reviewer_en_0857640,,Smaller than expected,Three Stars,en,home
freq,1,6,5,,29,4169,200000,17679
mean,,,,3.0,,,,
std,,,,1.414217,,,,
min,,,,1.0,,,,
25%,,,,2.0,,,,
50%,,,,3.0,,,,
75%,,,,4.0,,,,


**Part 2:** Text fields

In [30]:
# There's quite a bit of repetition in the review title. Let's inspect ...
# value_counts() lists each distinct title with its number of occurrences, most
# frequent first; missing titles are excluded by default.
df_train["review_title"].value_counts()

Three Stars                                                                     4169
Four Stars                                                                      3932
Five Stars                                                                      3918
Two Stars                                                                       3730
One Star                                                                        3435
                                                                                ... 
Do NOT order from uk.                                                              1
At least two sizes too small                                                       1
Provides a good workout, but the handle straps scrape and scratch your arms.       1
I wish I would have read the reviews before purchasing ...                         1
Very good for my village lights are bright I love it                               1
Name: review_title, Length: 137403, dtype: int64

In [33]:
# Show the five most frequent review bodies (value_counts sorts by frequency, descending) ...
df_train["review_body"].value_counts().head(5)

Smaller than expected     29
Works as advertised.      15
Exactly what I wanted     14
Exactly as described.     14
Smaller than expected.    10
Name: review_body, dtype: int64

In [39]:
# Quick check: is there any correlation between the rating and the length (in characters)
# of the review title/body? The length columns are added to a temporary copy only;
# `df_train` itself is left unchanged.
(
    df_train
    .assign(
        review_title_length=lambda frame: frame["review_title"].str.len(),
        review_body_length=lambda frame: frame["review_body"].str.len(),
    )
    # Keep just the numeric columns so that corr() gets a purely numeric frame.
    .select_dtypes(include="number")
    .corr()
)

Unnamed: 0,stars,review_title_length,review_body_length
stars,1.0,-0.045136,-0.038379
review_title_length,-0.045136,1.0,0.27892
review_body_length,-0.038379,0.27892,1.0


**Part 3:** Product category occurrences

In [3]:
# Number of reviews in each product category, most frequent first.
df_train["product_category"].value_counts()

home                        17679
apparel                     15951
wireless                    15717
other                       13418
beauty                      12091
drugstore                   11730
kitchen                     10382
toy                          8745
sports                       8277
automotive                   7506
lawn_and_garden              7327
home_improvement             7136
pet_products                 7082
digital_ebook_purchase       6749
pc                           6401
electronics                  6186
office_product               5521
shoes                        5197
grocery                      4730
book                         3756
baby_product                 3150
furniture                    2984
jewelry                      2747
camera                       2139
industrial_supplies          1994
digital_video_download       1364
luggage                      1328
musical_instruments          1102
video_games                   775
watch         

In [2]:
# Bar chart of the number of reviews in each product category.
# The column names are set explicitly instead of relying on the defaults produced by
# value_counts() + reset_index(): those defaults changed in pandas 2.0 (from
# "index"/"product_category" to "product_category"/"count"), so hard-coding the old
# names breaks on newer pandas versions.
px.bar(
    df_train["product_category"]
    .value_counts()
    .rename_axis("product_category")
    .reset_index(name="count"),
    x="product_category",
    y="count",
    labels={"product_category": "Product Category", "count": "Counts"},
    title="Number of reviews per product category",
)

**Part 4:** Star rating occurrences

In [4]:
# Number of reviews for each star rating.
df_train["stars"].value_counts()

1    40000
2    40000
3    40000
4    40000
5    40000
Name: stars, dtype: int64

In [5]:
# Bar chart of the number of reviews for each star rating.
# The column names are set explicitly instead of relying on the defaults produced by
# value_counts() + reset_index(): those defaults changed in pandas 2.0 (from
# "index"/"stars" to "stars"/"count"), so hard-coding the old names breaks on newer
# pandas versions.
px.bar(
    df_train["stars"]
    .value_counts()
    .rename_axis("stars")
    .reset_index(name="count"),
    x="stars",
    y="count",
    labels={"stars": "Star Rating", "count": "Counts"},
    title="Number of reviews per star rating",
)

**Part 5:** Average star ratings per product category

In [12]:
# Mean star rating for each product category, sorted from lowest to highest.
# `observed=True` restricts the result to categories that actually occur in the data;
# passing it explicitly also avoids the FutureWarning newer pandas versions raise when
# grouping on a categorical column without it. The argument has no effect if the
# column is not categorical.
df_train.groupby("product_category", observed=True)["stars"].mean().sort_values()

product_category
wireless                    2.726729
video_games                 2.781935
grocery                     2.801903
digital_video_download      2.851906
personal_care_appliances    2.853333
lawn_and_garden             2.866521
electronics                 2.891044
beauty                      2.902407
pc                          2.904234
pet_products                2.931799
watch                       2.939553
furniture                   2.960791
automotive                  2.966294
drugstore                   2.970503
toy                         2.971298
jewelry                     2.995996
industrial_supplies         3.000000
camera                      3.002338
apparel                     3.015736
home_improvement            3.023122
other                       3.024817
home                        3.047684
office_product              3.051983
kitchen                     3.061260
musical_instruments         3.083485
shoes                       3.091976
baby_product         

In [13]:
# Bar chart of the mean star rating per product category, sorted from lowest to highest.
# `observed=True` restricts the groups to categories that actually occur in the data;
# passing it explicitly also avoids the FutureWarning newer pandas versions raise when
# grouping on a categorical column without it.
px.bar(
    df_train.groupby("product_category", observed=True)["stars"]
    .mean()
    .sort_values()
    .to_frame()
    .reset_index(),
    x="product_category",
    y="stars",
    labels={"product_category": "Product Category", "stars": "Average Rating"},
    title="Average star rating per product category",
)