# Initialization

In [None]:
# Mount Google Drive in the Colab runtime and make the project's data folder the
# working directory, so the relative CSV file names used below resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/MSBA/BA890/data

Mounted at /gdrive
/gdrive/My Drive/MSBA/BA890/data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor


In [None]:
# List the files available in the current (data) directory
! ls

EDA_recipe_interactions.csv  PP_recipes.csv
ingr_map.pkl		     PP_users.csv
interactions_test.csv	     RAW_interactions.csv
interactions_train.csv	     RAW_recipes.csv
interactions_validation.csv  recipe_interactions.csv


In [None]:
# Load the recipes table, the full interactions table and the training split of the interactions
df_recipes, df_interact, train_interact = (
    pd.read_csv(csv_name)
    for csv_name in ('RAW_recipes.csv', 'RAW_interactions.csv', 'interactions_train.csv')
)

In [None]:
# Columns, dtypes and non-null counts of the recipes table
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [None]:
# Columns, dtypes and non-null counts of the full interactions table
df_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [None]:
# Columns, dtypes and non-null counts of the training interactions
train_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698901 entries, 0 to 698900
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    698901 non-null  int64  
 1   recipe_id  698901 non-null  int64  
 2   date       698901 non-null  object 
 3   rating     698901 non-null  float64
 4   u          698901 non-null  int64  
 5   i          698901 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 32.0+ MB


In [None]:
# Attach each recipe's name to the training interactions (to see which recipes are most popular).
# The recipes table's `id` key duplicates `recipe_id`, so it is dropped after the join.
df = (
    train_interact
    .merge(df_recipes[['name', 'id']], left_on='recipe_id', right_on='id')
    .drop(columns='id')
)

In [None]:
# Preview the interactions with recipe names attached
df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i,name
0,2046,4684,2000-02-25,5.0,22095,44367,flank steak with lime chipotle sauce
1,12882,4684,2002-03-13,5.0,10399,44367,flank steak with lime chipotle sauce
2,37758,4684,2002-06-17,5.0,4954,44367,flank steak with lime chipotle sauce
3,37636,4684,2003-05-08,5.0,831,44367,flank steak with lime chipotle sauce
4,54697,4684,2003-06-30,5.0,1147,44367,flank steak with lime chipotle sauce


# Collaborative Filtering

This notebook will go into collaborative filtering (CF). CF essentially uses the historical ratings of a user and all other users to provide recommendations.

Due to memory (RAM) limitations, we will subset the dataset, keeping the 100 users with the most interactions and the 2,000 recipes with the most ratings.

In [None]:
# The 100 users with the most interactions (ranked by number of recipe_id entries per user)
top_users = (
    train_interact.groupby('user_id')['recipe_id']
    .count()
    .sort_values(ascending=False)
    .head(100)
    .index
)
# The 2000 recipes with the most ratings (ranked by number of user_id entries per recipe)
top_recipes = (
    train_interact.groupby('recipe_id')['user_id']
    .count()
    .sort_values(ascending=False)
    .head(2000)
    .index
)

In [None]:
# Keep only interactions whose user AND recipe are both in the top sets, then renumber the rows from 0
train_interact2 = (
    train_interact
    .loc[train_interact['user_id'].isin(top_users) & train_interact['recipe_id'].isin(top_recipes)]
    .reset_index(drop=True)
)

In [None]:
# Preview the reduced interaction set
train_interact2.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,4470,834,2001-08-21,5.0,100,1915
1,6357,11365,2001-09-04,5.0,220,54684
2,6357,11642,2001-09-14,5.0,220,103767
3,9869,2886,2001-09-17,5.0,506,106975
4,4470,11427,2001-10-01,0.0,100,32135


## User-based Collaborative Filtering

User-based CF computes the similarity of rating behavior across users to find which users are most similar to one another. If User A likes a scrambled eggs recipe and an apple pie recipe, and User B likes the same scrambled eggs recipe, then Users A and B are considered similar, so User B is likely to also like the apple pie recipe.

In [None]:
# Which users in the subset have rated the banana bread recipe (recipe_id 2886)?
train_interact2.loc[train_interact2.recipe_id == 2886]

# we will be using user 452940 in later examples

Unnamed: 0,user_id,recipe_id,date,rating,u,i
3,9869,2886,2001-09-17,5.0,506,106975
6101,452940,2886,2008-09-22,5.0,347,106975
6817,560491,2886,2009-03-20,5.0,249,106975
8684,383346,2886,2010-12-07,5.0,193,106975
8933,482376,2886,2011-04-12,4.0,247,106975
10040,8688,2886,2012-11-03,5.0,235,106975


In [None]:
# Reshape the interactions into a user x recipe matrix of ratings.
# NOTE(review): pivot requires each (user_id, recipe_id) pair to appear at most once - confirm for this data.
user_ratings_table = train_interact2.pivot(index='user_id', columns='recipe_id', values='rating')

In [None]:
# Row of user 452940, restricted to the recipes this user has actually rated
user_ratings_table.loc[user_ratings_table.index == 452940].dropna(axis='columns')

# it looks like the table was transformed correctly with user 452940's preferences being captured

recipe_id,2886,10837,18816,23439,23686,24796,34233,37625,38953,39900,47515,50844,53594,55556,55672,57685,57790,61816,66241,73062,73825,77854,107712,113299,114907,116849,118477,119124,125633,128566,128952,128956,129345,151177,158634,166973,181870,185342,215911,222188,234672,239398,248350,258508,262140,278221,283073,290136,294481,309422,360528
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
452940,5.0,4.0,5.0,1.0,3.0,5.0,0.0,5.0,5.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,3.0,5.0,2.0,5.0,5.0,4.0,5.0,2.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0


In [None]:
# Preview the top-left corner of the ratings matrix (first 5 users x first 15 recipes)
user_ratings_table.iloc[:5,:15]

recipe_id,246,432,519,536,607,632,749,834,860,916,1035,1356,2072,2496,2498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4470,,,0.0,,,,,5.0,,,,5.0,,,
5060,,,,,,,,,,,,,,,
6357,,,,,,,,,,,,,,,
8688,,,,,,,,,,,,,,5.0,
9869,,,,,,,,,,,,,,,


In [None]:
## Mean-center each user's ratings so that missing ratings can be filled with a neutral value
# Per-user mean over the recipes that user rated (missing ratings are skipped by mean)
avg_ratings = user_ratings_table.mean(axis='columns')

# Subtract every user's own mean: 0 now stands for "this user's average rating"
user_ratings_table_centered = user_ratings_table.subtract(avg_ratings, axis='index')

# Missing ratings become 0, i.e. they are treated as an average rating for that user
user_ratings_table_normed = user_ratings_table_centered.fillna(value=0)

In [None]:
# Same corner of the matrix after centering and zero-filling
user_ratings_table_normed.iloc[:5,:15]

recipe_id,246,432,519,536,607,632,749,834,860,916,1035,1356,2072,2496,2498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4470,0.0,0.0,-4.690141,0.0,0.0,0.0,0.0,0.309859,0.0,0.0,0.0,0.309859,0.0,0.0,0.0
5060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089494,0.0
9869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### K-Nearest Neighbors (Users)

In [None]:
# Pairwise cosine similarity between the users' centered, zero-filled rating vectors.
# (KNeighborsRegressor is already imported in the imports cell; the unused
# 5-neighbour model that used to be created here has been removed.)
similarities = cosine_similarity(user_ratings_table_normed)

# Label rows and columns with user ids so similarities can be looked up by user
user_cosine_similarity_df = pd.DataFrame(
    similarities,
    index=user_ratings_table_normed.index,
    columns=user_ratings_table_normed.index,
)
user_cosine_similarity_df.head()

user_id,4470,5060,6357,8688,9869,13483,17803,28177,29196,29782,37449,37636,37779,39835,41578,47559,47892,50969,52282,53932,56003,58104,61660,67656,80353,88099,89831,95743,101823,104295,107135,107583,124249,125388,126440,128473,130819,131126,133174,136997,...,226863,227978,242729,266635,280271,286566,296809,305531,315565,323186,324390,369715,382071,383346,386585,400708,422893,424680,428885,452355,452940,461834,464080,482376,482933,486725,498271,527607,537937,542159,560491,573325,593927,599450,653438,679953,844554,895132,1072593,1179225
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
4470,1.0,0.004631,0.002444,0.004053,0.003308,-0.003219,-0.002941,-0.00022,-0.008068,-0.011561,0.002996,-0.0501,-0.009196,0.001514,0.001114,-0.019959,-0.006357,-0.00749,-0.003905,0.049473,0.001717,0.053455,-0.004971,-0.001151,-0.005306,-0.008347,0.004645,0.000851,0.03612,-0.008235,0.0,-0.019968,-0.003945,-0.000409,-0.011598,-0.001248,0.002224,-0.006143,-0.020168,0.01443,...,0.004093,0.003931,0.004608,0.003844,-0.020589,-0.011535,0.003727,-0.020273,0.04,-0.007897,0.000821,0.058031,-0.002715,-0.007419,-0.01569,-0.008979,0.000102,0.001308,-0.003572,-0.010299,-0.028449,-0.003536,0.002858,0.020461,-0.020565,0.003861,0.102189,0.004812,0.004183,-0.001441,-0.007077,0.002649,0.002394,0.001579,0.0,0.0,0.056739,0.001665,0.105965,-0.00284
5060,0.004631,1.0,0.001913,0.003223,-0.01001,-0.012591,-0.006831,0.006058,-0.006145,-0.011408,-0.001037,-0.001175,0.002498,0.253318,0.000581,0.001401,0.0,-0.005952,-0.012785,0.060422,-0.023846,0.014057,-0.004116,-0.02106,-0.020659,-0.029872,-0.002201,0.004056,0.002067,0.052216,0.0,-0.013874,-0.010966,0.072459,-0.02126,-0.006241,-0.005484,0.007783,0.001731,0.001137,...,-0.018607,-0.029288,0.001804,-0.019814,-0.021035,0.027768,0.0,-0.004584,-0.014999,0.000875,0.002572,-0.010935,0.0,-0.007348,0.080736,-0.008501,0.003201,0.001366,-0.008988,0.002711,0.008274,0.004461,-0.003488,-0.01169,-0.008368,-0.024839,-0.0065,0.005023,0.0,0.04881,-0.005149,0.004148,0.005867,0.0,0.0,0.0,-0.014912,-0.011567,-0.010657,-0.004783
6357,0.002444,0.001913,1.0,0.001227,0.001934,0.001593,0.002382,0.001172,-0.02989,0.001681,0.002045,-0.000944,0.002575,0.0,0.00045,0.00125,0.00308,-0.000756,0.00774,0.0,-0.004297,0.003879,0.000926,0.001968,0.002084,0.001149,0.003029,-0.039271,-0.005426,0.003836,0.0,0.001209,0.001912,-0.021451,0.054778,0.001757,0.001077,-0.021059,-0.008831,0.002349,...,0.002275,-0.022536,0.002791,-0.007982,0.0,0.003434,0.001505,0.0,0.0,-0.017255,0.0,0.001108,-0.001096,-0.016195,0.001799,0.004642,0.002476,-0.025354,0.0,-0.000773,0.0,0.002487,0.0,-0.014998,0.0,0.003965,0.002063,0.000971,0.001267,0.004074,0.001732,0.001604,-0.001639,0.001913,0.0,0.0,-0.003416,0.0,0.000389,-0.005304
8688,0.004053,0.003223,0.001227,1.0,-0.00463,-0.008792,0.004938,-0.011329,-0.005778,-0.000199,0.000184,-0.012139,0.014639,-0.004339,-0.000592,-0.003443,0.005109,-0.008826,0.00321,0.013024,-0.070424,0.002584,0.003841,0.006296,-0.013258,0.017417,0.027645,0.001526,-0.001053,0.017094,0.0,0.001003,-0.000862,-0.002749,-0.023555,-0.00349,0.00134,0.013397,-0.006514,-0.006321,...,0.044236,-0.009609,0.002315,0.033586,0.002304,0.029178,0.00025,-0.000206,0.003982,0.001122,0.004126,0.017305,-0.000455,0.029522,4.7e-05,-0.026079,0.001129,0.001314,-0.011991,0.012565,0.002441,0.004335,-0.006728,-0.006233,0.002175,0.007336,0.034168,-0.00564,-0.001142,0.000965,0.010906,0.001996,0.002405,-0.005691,0.0,0.001066,-0.007744,0.000836,-0.01469,-0.012977
9869,0.003308,-0.01001,0.001934,-0.00463,1.0,0.033623,-0.010032,0.004084,-0.010196,-0.011927,-0.003308,-0.009163,-0.023382,-0.045608,0.001959,0.002293,0.004026,-0.023806,-0.00637,-0.018544,-0.033835,-0.007273,-0.013145,0.089133,0.004541,0.004806,-0.01946,-0.018706,0.004703,-0.001003,0.0,0.003161,0.004997,0.038368,-0.007711,-0.000398,0.002347,0.005467,-0.01904,0.031152,...,-0.008093,0.069785,0.003648,-0.002898,0.000223,0.014964,0.111421,-0.001067,0.003719,-0.00577,-0.017835,-0.014134,-0.002865,-0.005721,-0.017401,-0.008337,-0.018956,0.001381,-0.000556,0.021329,-0.013335,0.001886,0.004652,0.001406,0.002285,0.01134,-0.006651,0.000847,0.001104,-0.001522,0.087322,-0.011883,0.003955,0.006668,0.0,0.00448,-0.025192,0.0,0.000412,-0.005386


In [None]:
# looking at 15 most similar users to user 452940
user_similarity_series = user_cosine_similarity_df.loc[452940]

# Remove the user's own entry by label instead of assuming it sorts first:
# a user whose centered rating vector is all zeros has a self-similarity of 0, not 1,
# and ties at 1.0 could also push the user's own entry out of first place.
ordered_similarities = user_similarity_series.drop(452940).sort_values(ascending=False)

nearest_neighbors = ordered_similarities.head(15).index

print(nearest_neighbors)

Int64Index([  56003,  133174,  560491,  573325, 1072593,  136997,  199848,
             679953,  386585,  145352,   37449,  424680, 1179225,  498271,
              50969],
           dtype='int64', name='user_id')


In [None]:
# Original (un-centered) ratings of the nearest neighbours, one row per neighbour
neighbor_ratings = user_ratings_table.reindex(nearest_neighbors)
# Mean rating for recipe 2886; neighbours without a rating for it are NaN and are skipped by mean()
neighbor_ratings[2886].mean()

# the neighbours of user 452940 who rated the banana bread recipe gave it an average of 5

5.0

In [None]:
# Each neighbour's rating of recipe 2886 (NaN = that neighbour has not rated it)
neighbor_ratings.loc[:,2886]

user_id
56003      NaN
133174     NaN
560491     5.0
573325     NaN
1072593    NaN
136997     NaN
199848     NaN
679953     NaN
386585     NaN
145352     NaN
37449      NaN
424680     NaN
1179225    NaN
498271     NaN
50969      NaN
Name: 2886, dtype: float64

#### Using scikit-learn

In [None]:
# Get the target data from user_ratings_table
other_users_y = user_ratings_table_normed[2886]

# Get the data for only those that have rated the recipe
other_users_x = user_ratings_table_normed[other_users_y.notnull()]

# Remove those that have not rated the recipe from the target
other_users_y.dropna(inplace=True)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model with 10 NNs
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=10)

# Fit the model and predict the target user
user_knn.fit(other_users_x, other_users_y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='cosine',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                    weights='uniform')

In [None]:
# predict on random user the rating they would give the banana bread recipe based on how others rated the banana bread
np.random.seed(1)
random_user = np.random.choice(user_ratings_table_normed.index)

target_user_x = user_ratings_table_normed.loc[[random_user]]
user_user_pred = user_knn.predict(target_user_x)
# predict() returns a 1-element array: take the element explicitly (float() on a 1-d array
# is deprecated in NumPy) and add the user's mean rating back to de-normalize
predicted_rating = float(user_user_pred[0] + avg_ratings.loc[random_user])

print(f'User {random_user} rates our banana bread recipe a {round(predicted_rating)}') # added avg_ratings to de-normalize

User 131126 rates our banana bread recipe a 5


Based on this model, which uses the collective rating patterns of all the reviewers, this random user would likely rate our banana bread recipe a 5.



**What are the implications of this model?**

To start, this model can provide recommendations to a user based on their preferences and the preferences of users who are similar to them. We could potentially input a catalog of recipes they have never seen before and predict ratings for each of those recipes. This is similar to how Spotify recommends new songs you might like based on your listening history and the listening history of those similar to you.

However, a problem does arise with this method. Let's see how 10 other users would rate our banana bread recipe:

In [None]:
# Repeat the prediction for 10 randomly drawn users
for _ in range(10):
    random_user = np.random.choice(user_ratings_table_normed.index)

    target_user_x = user_ratings_table_normed.loc[[random_user]]
    user_user_pred = user_knn.predict(target_user_x)
    # predict() returns a 1-element array: take the element explicitly (float() on a 1-d array
    # is deprecated in NumPy) and add the user's mean rating back to de-normalize
    predicted_rating = float(user_user_pred[0] + avg_ratings.loc[random_user])

    print(f'User {random_user} rates our banana bread recipe a {round(predicted_rating)}') # added avg_ratings to de-normalize

User 37779 rates our banana bread recipe a 5
User 382071 rates our banana bread recipe a 5
User 29782 rates our banana bread recipe a 5
User 400708 rates our banana bread recipe a 5
User 13483 rates our banana bread recipe a 5
User 452355 rates our banana bread recipe a 5
User 280271 rates our banana bread recipe a 5
User 47892 rates our banana bread recipe a 4
User 5060 rates our banana bread recipe a 5
User 422893 rates our banana bread recipe a 4


Based on this model, most users would give the banana bread recipe a rating of 5 (with 2 out of the 10 providing a rating of 4). 

Since our model uses historical data of how others reviewed the banana bread (which is often highly rated), this might just lead to our model recognizing what is popular instead of tailored recommendations, eventually creating a feedback loop for banana bread.

-----

Let's see how we could implement this model for a user across multiple recipes by transforming it into a function.

In [None]:
def user_KNN_CF(recipe_list, user_id):
    '''
    Returns the top 10 recipes from recipe list for a given user

    For every recipe, a cosine-distance KNN regressor is fit on the users who
    actually rated that recipe and is used to predict the given user's
    mean-centered rating of it.

    Parameters
    ----------
    recipe_list : iterable of recipe ids; each must be a column of the
        module-level ratings tables.
    user_id : id of the user to recommend for; must be in the index of
        user_ratings_table_normed.

    Returns
    -------
    DataFrame with columns recipe_id, rating and name, sorted by rating
    (highest first), at most 10 rows. The rating is the predicted
    mean-centered rating; it is not de-normalized because only the ranking
    matters here.
    '''
    # The target user's feature vector is the same for every recipe
    target_user_x = user_ratings_table_normed.loc[[user_id]]

    scored_recipes = []
    predicted_ratings = []

    for recipe in recipe_list:
        # Centered ratings keep NaN for "not rated". The zero-filled table has no NaNs,
        # so it cannot be used to tell who rated the recipe.
        recipe_ratings = user_ratings_table_centered[recipe]
        rated_mask = recipe_ratings.notnull()
        n_raters = int(rated_mask.sum())
        if n_raters == 0:
            # nothing to learn from; leave this recipe out of the ranking
            continue

        # Features and target for only those users that have rated the recipe
        other_users_x = user_ratings_table_normed[rated_mask]
        other_users_y = recipe_ratings[rated_mask]

        # Up to 10 NNs, capped at the number of raters (predict() fails otherwise)
        user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=min(10, n_raters))
        user_knn.fit(other_users_x, other_users_y)

        # predict() returns a 1-element array; index it instead of calling float() on the array
        user_user_pred = user_knn.predict(target_user_x)
        scored_recipes.append(recipe)
        predicted_ratings.append(float(user_user_pred[0]))

    # Build the frame column-wise so recipe_id keeps its integer dtype
    tmp = pd.DataFrame({'recipe_id': scored_recipes, 'rating': predicted_ratings})
    tmp = pd.merge(tmp, df_recipes[['name','id']], left_on = 'recipe_id', right_on='id')
    del tmp['id']
    return tmp.sort_values('rating', ascending=False).head(10)


In [None]:
# Sanity check: sampling 500 recipe ids without replacement yields 500 distinct ids
len(np.unique(np.random.choice(train_interact2.recipe_id.unique(), 500, replace=False)))

500

In [None]:
# get random list of 500 recipes
np.random.seed(890)
recipes = list(np.random.choice(train_interact2.recipe_id.unique(), 500, replace=False))

# select random user
random_user = np.random.choice(user_ratings_table_normed.index)

# the function is defined as user_KNN_CF (calling user_KNN_CB raises NameError on a fresh kernel)
user_KNN_CF(recipes, random_user)

# note that these ratings are not denormalized since the actual 5-scale rating doesn't matter

Unnamed: 0,recipe_id,rating,name
360,26389.0,0.115527,mean s basque potatoes
223,18816.0,0.098833,cabbage for those who dislike cabbage
389,63860.0,0.079779,breakfast in a mug
370,87782.0,0.077208,greek potatoes oven roasted and delicious
43,22782.0,0.075644,jo mama s world famous spaghetti
9,34110.0,0.075644,zucchini ribbons with basil butter
214,138317.0,0.07542,mexican hot cocoa
306,284500.0,0.07525,stir fry mushrooms and bell peppers
297,5031.0,0.074416,emeril s essence
45,56103.0,0.071679,roasted brussels sprouts


Using this model on this random user with a random list of 500 recipes leads to the above top 10 recommendations. This information could be shared on their home page under "Recipes you might like" or used in Food.com's newsletters to entice the user to return.

## Item-based Collaborative Filtering

Item-based CF uses a similar principle as UB-CF but instead of basing the model on the similarities of users, IB-CF uses the similarities between items.

One way to do this is to look at recipe pairings–we compute recipe similarities based on the ratings of users (think content-based recommendations but using the ratings as the metric as opposed to the recipe characteristics).

For example, if both User A and B like the same scrambled eggs and apple pie recipes, it's likely that these two recipes are similar to one another. If then User C likes the scrambled eggs recipe, then it's likely he will like the apple pie recipe.

In [None]:
# Flip the matrix so that each row is a recipe and each column a user
items_ratings_table_normed = user_ratings_table_normed.transpose()
items_ratings_table_normed.head(5)

user_id,4470,5060,6357,8688,9869,13483,17803,28177,29196,29782,37449,37636,37779,39835,41578,47559,47892,50969,52282,53932,56003,58104,61660,67656,80353,88099,89831,95743,101823,104295,107135,107583,124249,125388,126440,128473,130819,131126,133174,136997,...,226863,227978,242729,266635,280271,286566,296809,305531,315565,323186,324390,369715,382071,383346,386585,400708,422893,424680,428885,452355,452940,461834,464080,482376,482933,486725,498271,527607,537937,542159,560491,573325,593927,599450,653438,679953,844554,895132,1072593,1179225
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.574074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519,-4.690141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.301887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536,0.0,0.0,0.0,0.0,0.0,0.0,0.257576,0.0,0.0,0.0,0.0,0.0,0.0,-1.7375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.29878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.858289,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Pairwise cosine similarity between recipes, based on their centered user ratings
similarities = cosine_similarity(items_ratings_table_normed)

# Label both axes with recipe ids so a recipe's similarities can be looked up by id
cosine_similarity_df = pd.DataFrame(
    data=similarities,
    index=items_ratings_table_normed.index,
    columns=items_ratings_table_normed.index,
)
cosine_similarity_df.iloc[:7, :7]

recipe_id,246,432,519,536,607,632,749
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
246,1.0,0.0,0.097476,0.0,0.085073,0.267494,0.0
432,0.0,1.0,0.0,0.0,0.0,0.0,0.0
519,0.097476,0.0,1.0,0.0,0.009472,0.029783,0.0
536,0.0,0.0,0.0,1.0,0.0,0.0,0.0
607,0.085073,0.0,0.009472,0.0,1.0,0.075674,-0.11885
632,0.267494,0.0,0.029783,0.0,0.075674,1.0,0.132491
749,0.0,0.0,0.0,0.0,-0.11885,0.132491,1.0


Let's find the similarity values for a specific recipe:

In [None]:
# Draw one recipe id at random (fixed random_state, so the pick is repeatable)
cosine_similarity_df.sample(1, random_state=1).index

Int64Index([40879], dtype='int64', name='recipe_id')

In [None]:
# Similarity of the banana bread recipe (2886) to every recipe in the table
cosine_similarity_series = cosine_similarity_df.loc[2886, :]

# Rank from most to least similar; the recipe itself comes out on top with 1.0
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

ordered_similarities.head(5)

recipe_id
2886      1.000000
283073    0.817530
107712    0.670346
37625     0.659538
61816     0.658794
Name: 2886, dtype: float64

In [None]:
# Summary statistics of recipe 2886's similarity to every recipe in the table (itself included)
cosine_similarity_df.loc[2886].describe()

count    1935.000000
mean        0.001242
std         0.112485
min        -0.685597
25%         0.000000
50%         0.000000
75%         0.001613
max         1.000000
Name: 2886, dtype: float64

Looks like our trusty 'best banana bread' recipe does have a good number of ratings (as expected, since it's the most-rated recipe in the dataset).

In [None]:
# Similarity of the banana bread recipe (2886) to every recipe in the table
cosine_similarity_series = cosine_similarity_df.loc[2886, :]

# Rank from most to least similar
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# The 10 largest distinct similarity values
ordered_similarities.unique()[:10]

array([1.        , 0.81752951, 0.67034642, 0.65953845, 0.65879449,
       0.65057478, 0.64365532, 0.64006868, 0.63523505, 0.62353582])

It looks like the collaborative filtering of items for our banana bread recipe leads to the following top 10 recommendations!

What this means is that those who liked the banana bread recipe are likely to enjoy the following 10 recipes also, based on the historical ratings of other users.

In [None]:
# Look up recipe names for the ranked similarities and show positions 1-10.
# NOTE(review): position 0 is presumably the banana bread recipe itself (similarity 1.0),
# which is why the slice starts at 1 - confirm the merge keeps the ranked order.
pd.merge(ordered_similarities.to_frame(), df_recipes[['name','id']],left_on=ordered_similarities.index,right_on='id').iloc[1:11]

Unnamed: 0,2886,name,id
1,0.81753,strawberry sweetheart streusel muffins,283073
2,0.670346,alton brown s baked macaroni and cheese,107712
3,0.659538,new york crumb cake,37625
4,0.658794,rosemary garlic focaccia,61816
5,0.650575,broccoli chicken dijon south beach diet,120351
6,0.643655,breakfast bagel sandwiches oamc,120519
7,0.640069,thick and chewy chocolate chip cookies,118487
8,0.635235,incredible oven fried chicken,11642
9,0.623536,snickerdoodle french toast,294481
10,0.598438,addictive chicken tenders one taste and you r...,69990


It's important to note that previous iterations of this method have failed to provide recommendations for certain recipes. The recipes that we could not provide recommendations for were usually those with very few or no ratings. Since collaborative filtering uses user ratings to find recommendations, recipes with few or no ratings are harder to find recommendations for. This phenomenon is known as the ***cold-start problem***.

# Deploying KNN Collaborative Filtering into a function

In [None]:
def user_KNN_CF(recipe_list, user_id):
    '''
    Returns the top 10 recipes from recipe list for a given user

    For every recipe, a cosine-distance KNN regressor is fit on the users who
    actually rated that recipe and is used to predict the given user's
    mean-centered rating of it.

    Parameters
    ----------
    recipe_list : iterable of recipe ids; each must be a column of the
        module-level ratings tables.
    user_id : id of the user to recommend for; must be in the index of
        user_ratings_table_normed.

    Returns
    -------
    DataFrame with columns recipe_id, rating and name, sorted by rating
    (highest first), at most 10 rows. The rating is the predicted
    mean-centered rating; it is not de-normalized because only the ranking
    matters here.
    '''
    # The target user's feature vector is the same for every recipe
    target_user_x = user_ratings_table_normed.loc[[user_id]]

    scored_recipes = []
    predicted_ratings = []

    for recipe in recipe_list:
        # Centered ratings keep NaN for "not rated". The zero-filled table has no NaNs,
        # so it cannot be used to tell who rated the recipe.
        recipe_ratings = user_ratings_table_centered[recipe]
        rated_mask = recipe_ratings.notnull()
        n_raters = int(rated_mask.sum())
        if n_raters == 0:
            # nothing to learn from; leave this recipe out of the ranking
            continue

        # Features and target for only those users that have rated the recipe
        other_users_x = user_ratings_table_normed[rated_mask]
        other_users_y = recipe_ratings[rated_mask]

        # Up to 10 NNs, capped at the number of raters (predict() fails otherwise)
        user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=min(10, n_raters))
        user_knn.fit(other_users_x, other_users_y)

        # predict() returns a 1-element array; index it instead of calling float() on the array
        user_user_pred = user_knn.predict(target_user_x)
        scored_recipes.append(recipe)
        predicted_ratings.append(float(user_user_pred[0]))

    # Build the frame column-wise so recipe_id keeps its integer dtype
    tmp = pd.DataFrame({'recipe_id': scored_recipes, 'rating': predicted_ratings})
    tmp = pd.merge(tmp, df_recipes[['name','id']], left_on = 'recipe_id', right_on='id')
    del tmp['id']
    return tmp.sort_values('rating', ascending=False).head(10)

In [None]:
# get random list of 500 recipes
np.random.seed(890)
recipes = list(np.random.choice(train_interact2.recipe_id.unique(), 500, replace=False))

# select random user
random_user = np.random.choice(user_ratings_table_normed.index)

# the function is defined as user_KNN_CF (calling user_KNN_CB raises NameError on a fresh kernel)
user_KNN_CF(recipes, random_user)

Unnamed: 0,recipe_id,rating,name
360,26389.0,0.115527,mean s basque potatoes
223,18816.0,0.098833,cabbage for those who dislike cabbage
389,63860.0,0.079779,breakfast in a mug
370,87782.0,0.077208,greek potatoes oven roasted and delicious
43,22782.0,0.075644,jo mama s world famous spaghetti
9,34110.0,0.075644,zucchini ribbons with basil butter
214,138317.0,0.07542,mexican hot cocoa
306,284500.0,0.07525,stir fry mushrooms and bell peppers
297,5031.0,0.074416,emeril s essence
45,56103.0,0.071679,roasted brussels sprouts
