In [5]:
import gzip
from collections import defaultdict
import random
import numpy
import scipy.optimize

In [6]:
# Location of the gzipped Amazon "Musical Instruments" reviews TSV.
# NOTE: this is an absolute, machine-specific path; edit it to point at your own copy.
path = (
    "/Users/cnogueira/Documents/Development/Notebooks/study/week2/"
    "amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
)

In [36]:
# Open the compressed TSV as UTF-8 text. The handle is kept in `f` because a
# later cell iterates over the remaining lines.
f = gzip.open(path, mode='rt', encoding="utf8")
# The first line holds the column names; reading it here advances `f` past it.
header = f.readline()

In [10]:
# Turn the raw header line into a list of column names.
# The isinstance guard makes this cell safe to re-run: once `header` is already
# a list, calling .strip() on it would raise AttributeError.
if isinstance(header, str):
    header = header.strip().split('\t')

In [13]:
# Parse every remaining line of the open file `f` into a dict keyed by the
# column names in `header`, converting the three numeric columns to int.
# int() raises ValueError (or the lookup raises KeyError) on a malformed row.
dataset = []
try:
    for line in f:
        # Remove only the line terminator. A bare strip() would also eat
        # trailing tabs, so rows whose last columns are empty would come out
        # short and lose those keys when zipped with the header.
        fields = line.rstrip('\n').split('\t')
        d = dict(zip(header, fields))
        for numeric_column in ('star_rating', 'helpful_votes', 'total_votes'):
            d[numeric_column] = int(d[numeric_column])
        dataset.append(d)
finally:
    # Nothing after this point reads from the file; release the handle.
    f.close()

### I am going to see which item IDs were consumed by each user ID
- So these are the only fields I care about in the dataset (`customer_id` and `product_id`)

In [21]:
# Build three lookup tables from the reviews:
#   usersPerItem: product_id  -> set of customer_ids who reviewed that product (U_i)
#   itemsPerUser: customer_id -> set of product_ids that customer reviewed     (I_u)
#   itemNames:    product_id  -> product title (the last title seen wins)
usersPerItem = defaultdict(set)  # U_i
itemsPerUser = defaultdict(set)  # I_u
itemNames = {}
for review in dataset:
    customer = review['customer_id']
    product = review['product_id']
    itemsPerUser[customer].add(product)
    usersPerItem[product].add(customer)
    itemNames[product] = review['product_title']

## Jaccard Similarity
- I want a recommendation function that returns items similar to a candidate item i. The strategy is:
     - Find the set of users who purchased i
     - Iterate over every item other than i
     - For each of those items, compute its similarity with i (and store it)
     - Sort those items by Jaccard similarity, highest first
     - Return the most similar ones
     
- Problem with this algorithm:
    - Every recommendation iterates over all items in the dataset (the second step above)
        - Most items share no users with i, so their Jaccard similarity is 0, and we can figure this out in advance.
        - (!) Instead, we iterate only over the items purchased by at least one of the users who purchased i: the candidate set of items

In [23]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Parameters
    ----------
    s1, s2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. When both sets are empty the ratio is undefined;
        0.0 is returned instead of raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty: no shared evidence, so treat as "not similar".
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom


In [27]:
# recommendation system itself
def mostSimilar(i, n=10):
    """Return the `n` items most similar to item `i` by Jaccard similarity.

    Every other key of `usersPerItem` is scored by the Jaccard similarity
    between its user set and the user set of `i`.

    Parameters
    ----------
    i : hashable
        Identifier of the query item.
    n : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of (float, item)
        (similarity, item id) pairs sorted in descending order; ties in
        similarity are ordered by item id, also descending.
    """
    # Use .get() rather than usersPerItem[i]: indexing a defaultdict inserts an
    # empty entry for an unknown item, which would then show up as a candidate
    # in every later query.
    users = usersPerItem.get(i, set())  # all users that purchased item i
    similarities = [
        (Jaccard(users, otherUsers), i2)
        for i2, otherUsers in usersPerItem.items()
        if i2 != i
    ]
    similarities.sort(reverse=True)
    return similarities[:n]

In [32]:
# recommendation system itself (more efficient)
# An item nobody bought has no candidates, so the result for it is an empty list.
def mostSimilarEfficient(i, n=10):
    """Return the `n` items most similar to item `i`, scoring only plausible candidates.

    Only items that share at least one user with `i` can have a non-zero
    Jaccard similarity, so the candidate set is restricted to the items of the
    users found in `usersPerItem` for `i`.

    Parameters
    ----------
    i : hashable
        Identifier of the query item.
    n : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of (float, item)
        (similarity, item id) pairs sorted in descending order; ties in
        similarity are ordered by item id, also descending. Empty when `i`
        has no users.
    """
    # .get() avoids inserting an empty entry into the defaultdict for an unknown item.
    users = usersPerItem.get(i, set())

    # Collect every item touched by any of those users. update() grows the set
    # in place instead of rebuilding it with union() on every iteration.
    candidateItems = set()
    for u in users:
        candidateItems.update(itemsPerUser.get(u, ()))
    candidateItems.discard(i)  # never recommend the query item itself

    similarities = [
        (Jaccard(users, usersPerItem.get(i2, set())), i2)
        for i2 in candidateItems
    ]
    similarities.sort(reverse=True)
    return similarities[:n]

In [33]:
# Use the product from the third parsed review (index 2) as the example query item.
query = dataset[2]['product_id']

In [34]:
# Show the (similarity, product_id) pairs for the items most similar to the query.
mostSimilar(query)

[(0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7'),
 (0.007656967840735069, 'B000H2BC4E')]

In [35]:
# Title of the query product, for reading the recommendations in context.
itemNames[query]

'AudioQuest LP record clean brush'

In [31]:
# Translate each recommended product id into its title; the similarity score is dropped.
[itemNames[similar_item] for _, similar_item in mostSimilar(query)]

['Shure SFG-2 Stylus Tracking Force Gauge',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'ART Pro Audio DJPRE II Phono Turntable Preamplifier',
 'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester',
 'Audio Technica AT120E/T Standard Mount Phono Cartridge',
 'Technics: 45 Adaptor for Technics 1200 (SFWE010)',
 'GruvGlide GRUVGLIDE DJ Package',
 'STANTON MAGNETICS Record Cleaner Kit',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'Behringer PP400 Ultra Compact Phono Preamplifier']