## Load dataset

In [1]:
from IPython.display import display

import csv
import gzip

path = "../large_datasets/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"

## Columns converted from their raw TSV strings while loading
int_fields = ['helpful_votes', 'star_rating', 'total_votes']
flag_fields = ['verified_purchase', 'vine']

## Load dataset
dataset = []

## The `with` block closes the gzip handle as soon as loading finishes (or fails),
## instead of leaving it open for the lifetime of the kernel.
## newline='' lets the csv module do its own line-ending handling, as its docs ask.
with gzip.open(path, 'rt', newline='') as f:
    reader = csv.reader(f, delimiter='\t')

    ## Load header
    header = next(reader)

    for line in reader:
        ## One dict per review, keyed by the header's column names
        d = dict(zip(header, line))
        for field in int_fields:
            d[field] = int(d[field])
        for field in flag_fields:
            ## 'Y' becomes True; any other value becomes False
            d[field] = d[field] == 'Y'

        dataset.append(d)

display(dataset[0])

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31'}

## Simple stats

In [2]:
## Average rating
## Gather the star rating of every review, then take the arithmetic mean.
ratings = []
for review in dataset:
    ratings.append(review['star_rating'])

avg_rating = sum(ratings) / len(ratings)

avg_rating

4.731333018677096

In [3]:
## Rating distribution
## Seed the keys 1 through 5 with zero so all five appear in the result;
## being a plain dict, any other rating value raises KeyError.
ratingCounts = dict.fromkeys(range(1, 6), 0)

for review in dataset:
    stars = review['star_rating']
    ratingCounts[stars] = ratingCounts[stars] + 1

display(ratingCounts)

{1: 4766, 2: 1560, 3: 3147, 4: 9808, 5: 129029}

In [4]:
## Rating distribution - default dict
from collections import defaultdict

## A missing key starts at int() == 0, so no rating value has to be pre-seeded.
ratingCounts = defaultdict(int)

for stars in (review['star_rating'] for review in dataset):
    ratingCounts[stars] += 1

display(ratingCounts)

defaultdict(int, {5: 129029, 1: 4766, 4: 9808, 2: 1560, 3: 3147})

In [5]:
## Verified count
## Tally the reviews by the value of their 'verified_purchase' field.
verified_counts = defaultdict(int)

for review in dataset:
    is_verified = review['verified_purchase']
    verified_counts[is_verified] += 1

display(verified_counts)

defaultdict(int, {True: 135289, False: 13021})

In [6]:
## Most popular product
## Number of reviews recorded under each product_id.
product_counts = defaultdict(int)

for review in dataset:
    product_counts[review['product_id']] += 1

## (count, product_id) pairs in ascending order, so the most-reviewed products come last.
counts = sorted(
    (n_reviews, product) for product, n_reviews in product_counts.items()
)
counts[-10:]

[(2038, 'B004KNWWO0'),
 (2173, 'B0066AZGD4'),
 (2630, 'BT00DDC7CE'),
 (2643, 'B004LLIKY2'),
 (3407, 'BT00DDC7BK'),
 (3440, 'BT00CTOUNS'),
 (4283, 'B00IX1I3G6'),
 (5034, 'BT00DDVMVQ'),
 (6037, 'B00A48G0D4'),
 (28705, 'B004LLIKVU')]

In [7]:
## Top rated products
## Group the star ratings of every review under its product_id.
ratings_per_product = defaultdict(list)

for review in dataset:
    ratings_per_product[review['product_id']].append(review['star_rating'])

## Mean star rating of each product.
avg_rating_per_product = {
    product: sum(stars) / len(stars)
    for product, stars in ratings_per_product.items()
}

#### Filter only product > 50 reviews
## (average, product_id) pairs in ascending order, so the best-rated products come last.
top_rated = sorted(
    (mean_stars, product)
    for product, mean_stars in avg_rating_per_product.items()
    if len(ratings_per_product[product]) > 50
)
top_rated[-10:]

[(4.918918918918919, 'B004KNWX94'),
 (4.919354838709677, 'B00CRQ496G'),
 (4.923076923076923, 'B00PMLDNBA'),
 (4.931034482758621, 'B00CT77E60'),
 (4.936842105263158, 'B004KNWX76'),
 (4.9423076923076925, 'B00SNMPQYC'),
 (4.944444444444445, 'B007V6EWKK'),
 (4.947368421052632, 'B004LLIL5K'),
 (4.955882352941177, 'B00H5BNKYA'),
 (4.966101694915254, 'B00P8N49M4')]