In [44]:

# """
# Class 15: Recommendation Engines
# Content based and Collaborative based filtering
# Jaccard Similarity
# Modified KNN Algorithm
# """


In [45]:
########################################
## Collaborative-Based User Filtering ##
########################################

In [46]:
import pandas as pd
import collections

In [47]:
#read in brands data
user_brands = pd.read_csv('../data/user_brand.csv')

In [48]:
#look at count of stores
user_brands.Store.value_counts()

Target                        1866
Old Navy                      1200
Home Depot                    1186
Kohl's                        1157
Banana Republic                932
Nordstrom                      904
Gap                            860
Crate & Barrel                 816
Express                        785
KitchenAid                     700
J.Crew                         569
Container Store                564
Steve Madden                   539
Guess                          509
Cuisinart                      506
Nine West                      489
Calvin Klein                   476
Levi's                         472
Converse                       456
New Balance                    438
BCBGMAXAZRIA                   429
Restoration Hardware           410
Lacoste                        405
Kenneth Cole                   366
Kate Spade                     354
Puma                           350
Melissa & Doug                 335
DKNY                           328
Last Call by Neiman 

In [49]:
# Series of user IDs, note the duplicates
user_ids = user_brands.ID

In [50]:
user_ids.head()

0    80002
1    80002
2    80010
3    80010
4    80010
Name: ID, dtype: int64

In [51]:
len(user_ids)

23804

In [52]:
# EXERCISE 
# create a dictionary called brandsfor that takes in a user id and outputs a list of the brands they like


# key == a user ID, and the value is a 
# value == list of stores that the user "likes"
# example brandsfor[83065] == ["Kohl's", 'Target']













In [53]:
# ANSWER

brandsfor = dict(user_brands.groupby('ID').Store.apply(lambda x:list(x)))

In [54]:
brandsfor[83065]

["Kohl's", 'Target']

In [55]:
# try it out. User 83065 likes Kohl's and Target


In [56]:
# User 82983 likes many more!
brandsfor[82983]

['Hanky Panky',
 'Betsey Johnson',
 'Converse',
 'Steve Madden',
 'Old Navy',
 'Target',
 'Nordstrom']

In [57]:
########################
## Jaccard Similarity ##
########################

In [58]:
#
#The Jaccard Similarity allows us to compare two sets
#If we regard people as merely being a set of brands they prefer
#the Jaccard Similarity allows us to compare people
#
#Example. the jaccard similarty between user 82983 and 83065 is .125
#            because
#             brandsfor['83065'] == ["Kohl's", 'Target']
#             brandsfor['82983'] == ['Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom']

# the intersection of these two sets is just set("Target")
# the union of the two sets is set(['Target', 'Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom'])
# so the len(intersection) / len(union) = 1 / 8 == .125

# EXERCISE: what is the Jaccard Similarity 
#           between user 82956 and user 82963?
# 

In [59]:
# ANSWER HERE

combinedList = brandsfor[82956] + brandsfor[82963]
combinedSet = set(combinedList)
overlap = [brand for brand, count in collections.Counter((combinedList)).items() if count > 1]

print(len(overlap)/len(combined))

# 0.3333333333333333

0.3333333333333333


In [60]:
'''
EXERCISE: Complete the jaccard method below.
           It should take in a list of brands, and output the 
           jaccard similarity between them

 This should work with anything in the set, for example
 jaccard([1,2,3], [2,3,4,5,6])  == .3333333

 HINT: set1 & set2 is the intersection
       set1 | set2 is the union

'''

'\nEXERCISE: Complete the jaccard method below.\n           It should take in a list of brands, and output the \n           jaccard similarity between them\n\n This should work with anything in the set, for example\n jaccard([1,2,3], [2,3,4,5,6])  == .3333333\n\n HINT: set1 & set2 is the intersection\n       set1 | set2 is the union\n\n'

In [61]:
def jaccard(first, second):
  first = set(first)
  second = set(second)
  # the line below should be changed
  # ANSWER HEREEEEEE
    

    
    
    
    
    
    

In [62]:
def jaccard(first, second):
  first = set(first)
  second = set(second)
  return len(first & second) / float(len(first | second))

In [63]:
# try it out!
brandsfor[83065] # brands for user 83065
brandsfor[82983] # brands for user 82983
jaccard(brandsfor[83065], brandsfor[82983])
jaccard(brandsfor[82956], brandsfor[82963])

0.3333333333333333

In [64]:
#######################
### Our Recommender ###
#######################


In [65]:
# '''
# Our recommender will be a modified KNN collaborative algorithm.
# Input: A given user's brands that they like
# Output: A set (no repeats) of brand recommendations based on
#         similar users preferences

# 1. When a user's brands are given to us, we will calculate the input user's
# jaccard similarity with every person in our brandsfor dictionary

# 2. We will pick the K most similar users and recommend
# the brands that they like that the given user doesn't know about

# EXAMPLE:
# Given User likes ['Target', 'Old Navy', 'Banana Republic', 'H&M']
# Outputs: ['Forever 21', 'Gap', 'Steve Madden']
# '''

In [66]:
given_user = ['Target', 'Old Navy', 'Banana Republic', 'H&M']

In [67]:
#similarty between user 83065 and given user
brandsfor[83065]
jaccard(brandsfor[83065], given_user) 
# should be 0.2

0.2

In [68]:
# '''
# EXERCISE
#     Find the similarty between given_user and ALL of our users
#     output should be a dictionary where
#     the key is a user id and the value is the jaccard similarity
# {...
#  {81920: 0.07142857142857142,
#  81921: 0.25,
#  90114: 0.18181818181818182,
#  81923: 0.25,
#  ...}
# '''

In [87]:
# ANSWER HEREEEEEE
similarities = {}

for userid in user_ids:
    similarities[userid] = jaccard(brandsfor[userid], given_user)
    
print(similarities[81920])
print(similarities)
# This worked, and I wrote it! Huzzah!









0.07142857142857142
{81920: 0.07142857142857142, 81921: 0.25, 90114: 0.18181818181818182, 81923: 0.25, 90118: 0.07142857142857142, 81927: 0.2857142857142857, 91655: 0.14285714285714285, 90121: 0.2222222222222222, 81932: 0.0, 81933: 0.15384615384615385, 90126: 0.0, 90128: 0.2, 81937: 0.0, 81938: 0.08333333333333333, 81942: 0.16666666666666666, 90135: 0.0, 81944: 0.0, 81948: 0.0, 81949: 0.0, 81950: 0.13333333333333333, 81951: 0.125, 81952: 0.0, 81956: 0.0, 90149: 0.09523809523809523, 81958: 0.14285714285714285, 90152: 0.25, 81961: 0.0, 81962: 0.08333333333333333, 90155: 0.2, 81965: 0.0, 81966: 0.1, 81972: 0.25, 90165: 0.0, 81976: 0.18181818181818182, 90169: 0.05555555555555555, 90170: 0.2222222222222222, 81982: 0.2857142857142857, 90175: 0.0, 81985: 0.07142857142857142, 90179: 0.5, 90180: 0.0, 81989: 0.2857142857142857, 90183: 0.0, 81992: 0.2, 90185: 0.15, 81994: 0.0, 81996: 0.05555555555555555, 90125: 0.3333333333333333, 82000: 0.14285714285714285, 89527: 0.16666666666666666, 82003: 0.1

In [77]:
# ANSWER
similarities = {}
for user_id, brands in brandsfor.iteritems():
    similarities[user_id] = jaccard(given_user, brands)
similarities[81920]

AttributeError: 'dict' object has no attribute 'iteritems'

In [78]:
K = 5 #number of similar users to look at


In [82]:
# Now for the top K most similar users, let's aggregate the brands they like.
# I sort by the jaccard similarity so most similar users are first
# I use the sorted method, but because I'm sorting dictionaries
# I specify the "key" as the value of the dictionary
# the key is what the list should sort on
# so the most similar users end up being on top
# ANSWER HEREEEEEE

for key in similarities:
    

similarities.sorted







IndentationError: expected an indented block (<ipython-input-82-a7ec25a09c73>, line 12)

In [88]:
# list of K similar users' IDs
most_similar_users = sorted(similarities.items(), key = lambda x: x[1], reverse=True)[:K]
most_similar_users

[(81012, 0.75), (82970, 0.6), (91362, 0.6), (84807, 0.6), (88549, 0.6)]

In [89]:
# let's see what some of the most similar users likes
brandsfor[most_similar_users[0][0]]

['Banana Republic', 'Old Navy', 'Target']

In [93]:
brandsfor[most_similar_users[3][0]]

['Steve Madden', 'Banana Republic', 'Old Navy', 'Target']

In [97]:
brandsfor[most_similar_users[0][0]]

['Banana Republic', 'Old Navy', 'Target']

In [109]:
# Aggregate all brands liked by the K most similar users into a single set
brands_to_recommend = list()
for user in most_similar_users:
    print(user[0])
    # ANSWER HEREEEEEE
    brands_to_recommend.append(brandsfor[user][0])
    
brands_to_recommend
    
    
    
    
    
    

81012


KeyError: (81012, 0.75)

In [106]:
# Aggregate all brands liked by the K most similar users into a single set
# be sure to not include brands the the given user is already known to like1
brands_to_recommend = []
for user in most_similar_users:
    print(user[0])
    for brand in brandsfor[user[0]]:
        if brand not in given_user:
            brands_to_recommend.append(brand)

81012
82970
91362
84807
88549


In [107]:
brands_to_recommend
# UH OH WE HAVE DUPLICATES. Banana Republic, Old Navy, Target are all repeats.
# let's caculate a "score" of recommendation
# We will define the score as being the number of times
# a brand appears within the first K users

['Gap', 'Gap', 'Steve Madden', 'Forever 21']

In [108]:
from collections import Counter
dict(Counter(brands_to_recommend))

{'Forever 21': 1, 'Gap': 2, 'Steve Madden': 1}

In [None]:
#################################
#### Collaborative Item based ###
#################################

In [None]:
# '''
# We can also define a similary between items using jaccard similarity.
# We can say that the similarity between two items is the jaccard similarity
# between the sets of people who like the two brands.

# Example: similarity of Gap to Target is:
# '''

In [110]:
# filter users by liking Gap
gap_lovers = set(user_brands['Gap' == user_brands.Store].ID)
old_navy_lovers = set(user_brands['Old Navy' == user_brands.Store].ID)


In [111]:
# similarty between Gap and Old Navy
jaccard(gap_lovers, old_navy_lovers)

0.35437212360289283

In [112]:
guess_lovers = set(user_brands['Guess' == user_brands.Store].ID)
# similarty between Gap andGuess
jaccard(guess_lovers, gap_lovers)

0.21257750221434898

In [113]:
calvin_lovers = set(user_brands['Calvin Klein' == user_brands.Store].ID)
# similarty between Gap and Calvin Klein
jaccard(calvin_lovers, gap_lovers)

0.2068654019873532