### Recommending

In [None]:
# Sample ratings: user name -> {artist name -> rating}.
# Not every user has rated every artist.
users = {
    "Angelica": {
        "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
        "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
        "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
        "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
        "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
        "The Strokes": 4.0, "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
        "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
        "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
        "Slightly Stoopid": 2.5, "The Strokes": 3.0,
    },
}

In [None]:
users["Veronica"]

In [None]:
users["Hailey"]

In [None]:
def manhattan(rating1, rating2):
    """Computes the Manhattan distance between two users' ratings.

    Both rating1 and rating2 are dictionaries of the form
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}. Only keys present in
    both dictionaries contribute to the distance.

    Returns the sum of the absolute rating differences over the shared
    keys, or -1 when the two dictionaries share no key at all.
    """
    distance = 0
    commonRatings = False

    for key in rating1:
        if key in rating2:
            distance += abs(rating1[key] - rating2[key])
            commonRatings = True

    # Decide only after every key has been visited; returning from inside
    # the loop would stop at the first key of rating1.
    if commonRatings:
        return distance
    return -1  # Indicates no ratings in common

In [None]:
manhattan(users['Hailey'], users['Veronica']) 

In [None]:
manhattan(users['Hailey'], users['Jordyn'])

In [None]:
def compute_nearest_neighbor(username, users):
    """
    creates a sorted list of users based on their distance to username

    users maps each user name to that user's ratings dictionary. The
    result is a list of (distance, user) tuples covering every user other
    than username, closest first, where distance is whatever manhattan()
    returns for the pair. Entries with a negative distance are placed
    after all real distances. The list is empty when username is the only
    user.
    """
    distances = []
    for user in users:
        if user != username:
            distance = manhattan(users[user], users[username])
            distances.append((distance, user))
    # Sort once, after every neighbor has been collected -- closest first.
    # manhattan() returns -1 to signal "no ratings in common"; a plain sort
    # would rank that sentinel as the smallest (closest) distance, so
    # negative distances are pushed to the end instead.
    distances.sort(key=lambda pair: (pair[0] < 0, pair[0], pair[1]))
    return distances

In [None]:
compute_nearest_neighbor('Hailey', users)

In [None]:
def recommend(username, users):
    """
    Give list of recommendations

    Takes the first entry returned by compute_nearest_neighbor() as the
    nearest neighbor and returns a list of (artist, rating) tuples for
    every artist that neighbor rated but username did not, ordered by the
    neighbor's rating, highest first. Returns an empty list when there is
    no neighbor or nothing new to recommend.
    """
    # first find nearest neighbor
    neighbors = compute_nearest_neighbor(username, users)
    if not neighbors:
        return []
    nearest = neighbors[0][1]

    # now find bands neighbor rated that user didn't
    neighborRatings = users[nearest]
    userRatings = users[username]
    recommendations = [(artist, rating)
                       for artist, rating in neighborRatings.items()
                       if artist not in userRatings]

    # Sort and return once, after all candidates have been gathered.
    recommendations.sort(key=lambda artistTuple: artistTuple[1],
                         reverse=True)
    return recommendations

In [None]:
recommend('Hailey', users)

### Pearson Correlation Coefficient

In [None]:
import math

def pearson(rating1, rating2):
    """Pearson correlation coefficient of two users' ratings.

    rating1 and rating2 are dictionaries mapping an item to a numeric
    rating. Only items present in both dictionaries are used, and the
    coefficient is computed in a single pass from running sums.

    Returns a value between -1 and 1 (up to rounding), or 0 when the
    dictionaries share no item or when either user's shared ratings have
    zero variance.
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x ** 2
            sum_y2 += y ** 2

    # No shared items: the divisions by n below would fail.
    if n == 0:
        return 0

    # Force true division even for all-integer ratings on Python 2.
    n = float(n)

    # Rounding error can leave these a hair below zero when all shared
    # ratings are equal; clamp so math.sqrt does not raise.
    spread_x = max(sum_x2 - (sum_x ** 2) / n, 0.0)
    spread_y = max(sum_y2 - (sum_y ** 2) / n, 0.0)

    # now compute denominator
    denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator

In [None]:
pearson(users['Angelica'], users['Bill']) 

In [None]:
pearson(users['Angelica'], users['Hailey']) 

In [None]:
pearson(users['Angelica'], users['Jordyn']) 

### Cosine Similarity & K-nearest neighbor

In [1]:
import pickle
path = '/class/datasets/'
used_tag = 'halloween'

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
# The with-block closes the file handle as soon as the data has been read.
with open(path + '%s_ig.p' % used_tag, 'rb') as pickle_file:
    all_media = pickle.load(pickle_file)

In [2]:
# Flat list of every tag name found on the media, duplicates included.
# Media objects that have no `tags` attribute are skipped.
all_tags_nonunique = []
for m in all_media:
    if not hasattr(m, 'tags'):
        continue
    all_tags_nonunique.extend([t.name for t in m.tags])

# Distinct tag names.
all_tags = set(all_tags_nonunique)

In [3]:
len(all_tags)

25159

In [4]:
all_tags = list(all_tags)
all_tags[:10]

['fawn',
 'terrificpicscolor',
 'mustachio',
 'zombieschool',
 'spiders',
 'vscominsk',
 'hanging',
 'woody',
 'videogram',
 'bradloree']

In [5]:
# Distinct usernames found on the media, as a list so positions can be indexed.
all_users = list({m.user.username for m in all_media})

In [6]:
len(all_users)

8530

In [7]:
all_users[:10]

['twinvogue',
 'yutazu',
 'ruthatchley1428',
 'dallasliketexas',
 'spooky_minecraft__pokemon_',
 'yhasydneyharbour',
 'ckr_y16',
 'melly_lara',
 'maddecourcey',
 'michaelsoy89']

In [8]:
from collections import Counter

c = Counter(all_tags_nonunique)

In [9]:
c.most_common(10)

[('halloween', 9920),
 ('pumpkin', 792),
 ('october', 686),
 ('makeup', 662),
 ('fall', 542),
 ('scary', 541),
 ('costume', 508),
 ('horror', 453),
 ('love', 450),
 ('art', 420)]

In [None]:
t = 'costume'
all_tags.index(t)

In [None]:
# One zero-filled count vector per user, with one slot per entry of all_tags.
# Each user gets an independent list.
users_to_tags = {u: [0] * len(all_tags) for u in all_users}

In [None]:
# Look up each tag's position once. list.index() is a linear scan, and the
# original repeated it for every tag occurrence on every media item.
tag_index = {tag: i for i, tag in enumerate(all_tags)}

# Count, per user, how often each tag appears on that user's media.
# NOTE: this increments in place, so re-running the cell without first
# re-initialising users_to_tags counts everything a second time.
for m in all_media:
    if hasattr(m, 'tags'):
        for t in m.tags:
            cur_index = tag_index[t.name]
            users_to_tags[m.user.username][cur_index] += 1

In [None]:
len(users_to_tags['spooky_minecraft__pokemon_'])

In [None]:
import numpy as np
np.sum(np.array(users_to_tags['spooky_minecraft__pokemon_']))

In [None]:
np.array([x for x in users_to_tags['spooky_minecraft__pokemon_'] if x > 0])

In [None]:
[x for x in users_to_tags['maddecourcey'] if x > 0]

In [None]:
import numpy.linalg as LA
import numpy as np

def cx(a, b):
    """Cosine similarity of vectors a and b, rounded to 3 decimal places.

    a and b are equal-length sequences of numbers. Returns 0.0 when either
    vector has norm 0: the ratio would otherwise be 0/0 = NaN, and NaN
    scores break the ordering used by max() and sort() on score lists.
    """
    norm_product = LA.norm(a) * LA.norm(b)
    if norm_product == 0:
        return 0.0
    return round(np.inner(a, b) / norm_product, 3)

In [None]:
v1 = users_to_tags['spooky_minecraft__pokemon_']
v2 = users_to_tags['maddecourcey']

cos = cx(v1,v2)
cos

In [None]:
# get top matches
k = 5
person = 'spooky_minecraft__pokemon_'

# Pair every other user with their cx similarity to `person`.
scores = []
for other in all_users:
    if other == person:
        continue
    scores.append((cx(users_to_tags[person], users_to_tags[other]), other))

In [None]:
max(scores)

In [None]:
# Highest similarity first, then show the top k.
scores.sort(reverse=True)
# print() with parentheses runs under both Python 2 and Python 3.
print(scores[:k])

### Item-based (tag)

In [None]:
# One zero-filled count vector per tag, with one slot per entry of all_users.
# Each tag gets an independent list.
tags_to_users = {t: [0] * len(all_users) for t in all_tags}

In [None]:
# Look up each user's position once. list.index() is a linear scan, and the
# original repeated it for every tag on every media item.
user_index = {username: i for i, username in enumerate(all_users)}

# Count, per tag, how often each user applied it.
# NOTE: this increments in place, so re-running the cell without first
# re-initialising tags_to_users counts everything a second time.
for m in all_media:
    if hasattr(m, 'tags'):
        # The user's slot is the same for every tag on this media item.
        cur_index = user_index[m.user.username]
        for t in m.tags:
            tags_to_users[t.name][cur_index] += 1

In [None]:
len(tags_to_users['horror'])

In [None]:
# get top matches: the k tags whose per-user count vectors are most similar
# (by cx) to the vector of `tag`
k = 5
tag = 'horror'
scores = [(cx(tags_to_users[tag], tags_to_users[other]), other)
          for other in all_tags if other != tag]

scores.sort(reverse=True)
# print() with parentheses runs under both Python 2 and Python 3.
print(scores[:k])

In [None]:
# get top matches: the k tags whose per-user count vectors are most similar
# (by cx) to the vector of `tag`
k = 5
tag = 'vampire'
scores = [(cx(tags_to_users[tag], tags_to_users[other]), other)
          for other in all_tags if other != tag]

scores.sort(reverse=True)
# print() with parentheses runs under both Python 2 and Python 3.
print(scores[:k])

In [None]:
# get top matches: the k tags whose per-user count vectors are most similar
# (by cx) to the vector of `tag`
k = 5
tag = 'spider'
scores = [(cx(tags_to_users[tag], tags_to_users[other]), other)
          for other in all_tags if other != tag]

scores.sort(reverse=True)
# print() with parentheses runs under both Python 2 and Python 3.
print(scores[:k])

### Assignment

### Jaccard Distance

Jaccard distance measures how dissimilar two vectors are (each vector being a single row of the user/item matrix above): it is one minus their relative overlap.

In [None]:
from IPython.core.display import Image 
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that directory exists. It also rebinds `path`, which an earlier cell set to
# the datasets directory.
path = '/Users/giladlotan/Documents/ITP Course/code/Week7/'
# Render jaccard.png from that directory as the cell output.
Image(filename=path+'jaccard.png')

The overlap (the intersection) is scaled by the overall set size (the union). Let us look at an example using two sets, X6 and X7, from our example data set:

x=[1001111010] and y=[0111111111]

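The intersection has size $|x \cap y| = 5$

The union has size $|x \cup y| = 10$

$\mathrm{Jacc}(x, y) = 1 - \frac{|x \cap y|}{|x \cup y|} = 1 - \frac{5}{10} = 0.5$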