In [3]:
# Personalized Machine Learning, Chapter 4: "Users who bought X also bought Y"
# Code: https://cseweb.ucsd.edu/~jmcauley/pml/code/chap4.html
# All reviews: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
import gzip
from collections import defaultdict
from tqdm import tqdm

In [4]:
# Parse the gzipped, tab-separated review dump into `dataset`: a list with one
# dict per review, keyed by the column names taken from the file's first line.
dataset = []
# The context manager closes the gzip handle as soon as parsing finishes (or
# fails) instead of leaving it open for the lifetime of the kernel.
with gzip.open('../data/amazon_reviews/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz', 'rt', encoding="utf8") as f:
    # The first line holds the tab-separated column names.
    header = f.readline()
    header = header.strip().split('\t')
    for line in tqdm(f):
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        # These three columns are read as text; store them as integers.
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)

# Report how many reviews were loaded and display the first record.
print(len(dataset))
dataset[0]

904765it [00:25, 34967.63it/s]

904765





{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [14]:
# Lookup tables filled in a single pass over the reviews.
usersPerItem = defaultdict(set)  # product_id -> set of customer_ids that reviewed it
itemsPerUser = defaultdict(set)  # customer_id -> set of product_ids they reviewed
itemNames = {}                   # product_id -> product_title (last review seen wins)
ratingDict = {}                  # (customer_id, product_id) -> star_rating

for review in dataset:
    user = review['customer_id']
    item = review['product_id']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)
    itemNames[item] = review['product_title']
    ratingDict[(user, item)] = review['star_rating']

In [18]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Args:
        s1: A set (anything providing `.intersection` and `.union`).
        s2: The collection to compare against `s1`.

    Returns:
        The size of the intersection divided by the size of the union, or 0
        when the union is empty.
    """
    overlap = len(s1.intersection(s2))
    total = len(s1.union(s2))
    # Guard against dividing by zero when both inputs are empty.
    return overlap / total if total else 0

def mostSimilar(i, N):
    """Return the N items whose reviewer sets are most similar to item i's.

    Similarity is the Jaccard index between the set of users recorded for `i`
    in `usersPerItem` and the set recorded for every other item.

    Args:
        i: Item id (a key of `usersPerItem`) to find neighbours for.
        N: Maximum number of results to return.

    Returns:
        A list of up to N `(similarity, item name)` tuples, highest similarity
        first; ties are ordered by item id, descending.

    Raises:
        KeyError: If one of the returned items has no entry in `itemNames`.
    """
    # Use .get() rather than indexing: `usersPerItem` is a defaultdict, so
    # usersPerItem[i] would silently insert an empty entry for an unknown id,
    # and that phantom item would then appear as a candidate in later queries.
    users = usersPerItem.get(i, set())
    similarities = []
    for i2 in tqdm(usersPerItem):
        if i2 == i:
            continue  # never compare the query item with itself
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim, i2))
    # Tuples sort by similarity first, then by item id.
    similarities.sort(reverse=True)
    return [(sim, itemNames[i2]) for sim, i2 in similarities[:N]]

In [20]:
# Display the product title recorded for this product_id.
itemNames['B0006VMBHI']

'AudioQuest LP record clean brush'

In [19]:
# Show the 5 items whose reviewer sets have the highest Jaccard similarity to this product's.
mostSimilar('B0006VMBHI', 5)

100%|███████████████████████████████████████████████████████████████████████| 123328/123328 [00:01<00:00, 62383.85it/s]


[(0.028446389496717725, 'Shure SFG-2 Stylus Tracking Force Gauge'),
 (0.01694915254237288,
  'Shure M97xE High-Performance Magnetic Phono Cartridge'),
 (0.015065913370998116, 'ART Pro Audio DJPRE II Phono Turntable Preamplifier'),
 (0.014204545454545454,
  'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester'),
 (0.008955223880597015,
  'Audio Technica AT120E/T Standard Mount Phono Cartridge')]