# EDA of Review Data

In [1]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def get_basic_info(review_dataset, meta_dataset):
    '''@param review_dataset, meta_dataset: type 'datasets.arrow_dataset.Dataset'''
    print(f"Tot number of reviews", len(review_dataset))
    print("Columns in metadata:", meta_dataset.column_names)

    categories_col = meta_dataset['categories']
    all_categories = [category for sublist in categories_col for category in sublist]
    unique_categories_set = set(all_categories)
    print("Number of categories:", len(unique_categories_set))
    print("Unique categories:", unique_categories_set)

In [23]:
cellphone_reviews_all = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Cell_Phones_and_Accessories", split="full", trust_remote_code=True)
cellphone_meta_all = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Cell_Phones_and_Accessories", split="full", trust_remote_code=True)

print("Dataset type:", type(cellphone_reviews_all), type(cellphone_meta_all))
get_basic_info(cellphone_reviews_all, cellphone_meta_all)

Dataset type: <class 'datasets.arrow_dataset.Dataset'> <class 'datasets.arrow_dataset.Dataset'>
Tot number of reviews 20812945
Columns in metadata: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author']
Number of categories: 159
Unique categories: {'Single Ear Bluetooth Headsets', 'Windows Phone 7', 'iPhone 7 Cases, Accessories & Bluetooth Headphones', 'Brave Frontier Phone Cases', 'Lens Protectors', 'Joysticks', 'Samsung Galaxy S6 Cases', 'iPhone 7 Wallet Cases', 'Virtual Reality (VR) Headsets', 'Sleeves', 'Samsung Galaxy S 4 Cases', 'Alexa Features: Entertainment', 'Photo & Video Accessories', 'Crossbody & Lanyard Cases', 'Smartwatch Cables & Chargers', 'Cloud Drive Promotion', 'SIM Card Tools & Accessories', 'Deals in Unlocked Cell Phones', 'Carrier Branded Wireless Trade-In', 'iPhone 7+ Cases', 'Smartwatch Cases', 'Screen Expanders

In [8]:
handmade_reviews_all = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Handmade_Products",  split="full", trust_remote_code=True)
handmade_meta_all = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Handmade_Products", split="full", trust_remote_code=True)

get_basic_info(handmade_reviews_all, handmade_meta_all)

Tot number of reviews 664162
Columns in metadata: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author']
Number of categories: 646
Unique categories: {'Travel Accessories', 'Deodorants & Antiperspirants', 'Balms & Moisturizers', 'Aisle Runners', 'Lotions', 'Christmas Tree Skirts', 'Bowls', 'Ring Cushions', 'Sleeves', 'Beauty & Grooming', 'Portraits', 'Styling Products', 'Photo Albums & Frames', 'Clocks', 'Southwest FBA', 'Home Décor', 'Card Cases', 'Furniture', 'Bathroom Accessories', 'Planters & Boxes', 'Toys & Games', 'Jewelry Sets', 'Home Fragrance', 'Northeast FBA', 'Cell Phone Accessories', 'Slippers', 'Music Boxes', 'Belts, Holsters & Slings', 'Baskets', 'Totes', 'Earmuffs', 'Pacifier Accessories', 'Bras', 'Patriotic Picks', 'Musical Toy Instruments', 'Cleaning Tools', 'Handmade Products', 'New York', 'Ring Boxes', 'Hunting & Sho

In [11]:
reviews_cases = pd.read_csv('Basic_Cases_reviews.csv')
reviews_cases.head()

Unnamed: 0.1,Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,0,4.0,No white background! It’s clear!,I bought this bc I thought it had the nice whi...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B08L6L3X1S,B08L6L3X1S,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1612044451196,0,True
1,1,4.0,Decent,Lasted about 9 months then the lock button bro...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B07XRDHDNQ,B07XRDHDNQ,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1590470082910,0,True
2,2,5.0,It is a great value & protects the phone from ...,My son loves his camo case! It is a great valu...,[],B00W3C3ISY,B00W3C3ISY,AGXVBIUFLFGMVLATYXHJYL4A5Q7Q,1461331211000,0,True
3,3,5.0,Great value!,Some people have given this less favorable rev...,[],B01JTI2K3S,B08FLGTLBQ,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1530231753869,0,True
4,4,4.0,Great fit and love the color,Great fit and love the color. Just enough grip...,[],B01LVXS6C0,B01LVXS6C0,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1519316898712,0,True


In [21]:
def get_most_reviewed_prods(reviews_df):
    print(f"Number of reviews: {reviews_df.shape[0]}")
    review_counts = reviews_df.groupby(['parent_asin']).agg({'text':'count'}).sort_values(by='text', ascending=False)
    review_counts = review_counts.reset_index()
    review_counts = review_counts.rename({'text':'numb_of_revs'}, axis='columns')
    print(f"Number of unique reviews: {review_counts.shape[0]}")
    k = 10
    print(f'Top {k} most reviewed:\n', review_counts.head(k),'\n')
    print(review_counts['numb_of_revs'].describe())

In [22]:
# Most reviewed products + their review counts for Basic_Cases
get_most_reviewed_prods(reviews_cases)

Number of reviews: 2646
Number of unique reviews: 2398
Top 10 most reviewed:
   parent_asin  numb_of_revs
0  B096Z4H1CP             8
1  B0BVLY92D8             8
2  B074822GN4             6
3  B0B65275R7             5
4  B07H9LHX7M             5
5  B00Z7SH7GK             4
6  B00MIPS386             4
7  B08KQ1YP6K             4
8  B0BYZCWKZL             4
9  B07HCVLMY9             4 

count    2398.000000
mean        1.103003
std         0.432107
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: numb_of_revs, dtype: float64


In [19]:
# Convert handmade_reviews_all to pandas dataframe
handmade_reviews_df = handmade_reviews_all.to_pandas()
handmade_reviews_df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Beautiful colors,I bought one for myself and one for my grandda...,[],B08GPJ1MSN,B08GPJ1MSN,AF7OANMNHQJC3PD4HRPX2FATECPA,1621607495111,1,True
1,5.0,You simply must order order more than one!,I’ve ordered three bows so far. Have not been ...,[],B084TWHS7W,B084TWHS7W,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,1587762946965,0,True
2,5.0,Great,As pictured. Used a frame from the dollar stor...,[],B07V3NRQC4,B07V3NRQC4,AEYORY2AVPMCPDV57CE337YU5LXA,1591448951297,0,True
3,5.0,Well made and so beautiful,"This is beyond beautiful. So shiny, the size ...",[],B071ZMDK26,B071ZMDK26,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1559438079784,2,True
4,5.0,Smells just like the real thing!,Oh wow what a pleasant surprise! This smells g...,[],B01MPVZ4YP,B01MPVZ4YP,AGCPAPUHXYA3EEIL2KGSQTGO5HRA,1546906331674,1,True


In [20]:
get_most_reviewed_prods(handmade_reviews_df)

Number of reviews: 664162
Number of unique reviews: 164728
Top 10 most reviewed:
   parent_asin  numb_of_revs
0  B09GK2JJDZ          2498
1  B07RC9FWLN          1305
2  B01G29HQ8G          1088
3  B01ASDYQQC          1044
4  B01N5SVHUU           972
5  B083Q2N1KQ           913
6  B07NP52Y1F           896
7  B015NSJ11W           887
8  B081732LNJ           766
9  B015HVACEA           734 

count    164728.000000
mean          4.031871
std          16.397329
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max        2498.000000
Name: numb_of_revs, dtype: float64
