In [1]:
# Personalized Machine Learning Chapter 4, Users Who Bought X also Bought Y
     # https://cseweb.ucsd.edu/~jmcauley/pml/code/chap4.html
# All reviews: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
import gzip
from collections import defaultdict
from tqdm import tqdm

In [2]:
f = gzip.open('../data/amazon_reviews/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz', 'rt', encoding="utf8")
header = f.readline()
header = header.strip().split('\t')
dataset = []
for line in tqdm(f):
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)
    
print(len(dataset))
dataset[0]

904765it [00:10, 85080.60it/s] 

904765





{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [3]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in dataset:
    user, item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = d['star_rating']
    itemNames[item] = d['product_title']

In [4]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in tqdm(usersPerItem):
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return [(s[0], s[1], itemNames[s[1]]) for s in similarities[:N]]

In [56]:
itemNames['B0006VMBHI']

'AudioQuest LP record clean brush'

In [57]:
mostSimilar('B0006VMBHI', 5)

100%|██████████████████████████████████████████████████████████████████████| 123328/123328 [00:00<00:00, 149585.22it/s]


[(0.028446389496717725,
  'B00006I5SD',
  'Shure SFG-2 Stylus Tracking Force Gauge'),
 (0.01694915254237288,
  'B00006I5SB',
  'Shure M97xE High-Performance Magnetic Phono Cartridge'),
 (0.015065913370998116,
  'B000AJR482',
  'ART Pro Audio DJPRE II Phono Turntable Preamplifier'),
 (0.014204545454545454,
  'B00E7MVP3S',
  'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester'),
 (0.008955223880597015,
  'B001255YL2',
  'Audio Technica AT120E/T Standard Mount Phono Cartridge')]

In [32]:
IND = product2ind['B0006VMBHI']
IND

90488

In [49]:
from scipy.sparse import csr_matrix
import numpy as np

In [6]:
users, products = set(), set()
for d in dataset:
    users.add(d['customer_id'])
    products.add(d['product_id'])

user2ind = dict(zip(users, range(len(users))))
product2ind = dict(zip(products, range(len(products))))
ind2product = {v:k for k,v in product2ind.items()}

In [7]:
rows, cols, vals = [], [], []
for d in dataset:
    rows.append(product2ind[d['product_id']])
    cols.append(user2ind[d['customer_id']])
    vals.append(1)

In [95]:
matrix = csr_matrix((vals,(rows,cols)),shape=(len(product2ind), len(user2ind)))
matrix

<123328x573149 sparse matrix of type '<class 'numpy.intc'>'
	with 904647 stored elements in Compressed Sparse Row format>

In [100]:
# Could take these products out, since they don't have any customer overlap with any other products
(np.asarray(matrix.sum(1)).flatten()==1).sum()

56692

In [94]:
intersection = (matrix.dot(matrix.T)).tolil()
intersection

<123328x123328 sparse matrix of type '<class 'numpy.intc'>'
	with 3304520 stored elements in List of Lists format>

In [82]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [86]:
product_counts[inds] + product_counts.T - intersection[inds]

matrix([[188,   3,   3, ...,   3,  17,   2],
        [193,   8,   8, ...,   8,  22,   7],
        [189,   4,   4, ...,   4,  18,   3],
        ...,
        [221,  36,  36, ...,  36,  50,  35],
        [189,   4,   4, ...,   4,  18,   3],
        [192,   7,   7, ...,   7,  21,   6]])

In [None]:
sims = []
product_counts = matrix.sum(1)
for inds in tqdm(chunks(range(intersection.shape[0]), 100), 
                 total=int(intersection.shape[0]/100)):
    # intersection[inds] /= product_counts[inds] + product_counts.T - intersection[inds]
    sims.append(intersection[inds] / (product_counts[inds] + product_counts.T - intersection[inds]))

In [None]:
np.concatenate(sims).shape

In [68]:
product_counts = matrix.sum(1)
for product_row in tqdm(intersection, total=intersection.shape[0]):
    product_row -= product_row

In [52]:
intersection[:10] = intersection[:10] / (matrix[:10].sum(1) + matrix.sum(1).T - intersection[:10])

In [33]:
# union = matrix.sum(1) + matrix.sum(1).T - intersection
union = matrix[IND].sum() + matrix.sum(1).T - intersection[IND]
union

matrix([[492, 307, 307, ..., 307, 321, 306]])

In [34]:
result = intersection[IND] / union
result

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [35]:
sims = np.asarray(result).flatten()
top_inds = np.argsort(-sims)[:10]
top_sims = sims[top_inds]

list(zip(top_sims, [ind2product[i] for i in top_inds]))

[(1.0, 'B0006VMBHI'),
 (0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7')]

In [62]:
matrix = csr_matrix((vals,(rows,cols)),shape=(len(product2ind), len(user2ind)))

In [63]:
(matrix.nnz/(matrix.shape[0]*matrix.shape[1]))*100

0.001279823026391015

In [64]:
intersection = matrix[IND].dot(matrix.T)
intersection.todense()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [65]:
union = matrix[IND].sum()+matrix.sum(1).T-intersection
union

matrix([[492, 307, 307, ..., 307, 321, 306]])

In [66]:
sims = np.asarray(intersection/union).flatten()
sims

array([0., 0., 0., ..., 0., 0., 0.])

In [67]:
top_inds = np.argsort(-sims)[:10]
top_sims = sims[top_inds]

list(zip(top_sims, [ind2product[i] for i in top_inds]))

[(1.0, 'B0006VMBHI'),
 (0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7')]