# Initializiation

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/MSBA/BA890/data

Mounted at /gdrive
/gdrive/My Drive/MSBA/BA890/data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor


In [3]:
! ls

ingr_map.pkl		interactions_validation.csv  RAW_interactions.csv
interactions_test.csv	PP_recipes.csv		     RAW_recipes.csv
interactions_train.csv	PP_users.csv


In [4]:
df_recipes = pd.read_csv('RAW_recipes.csv')
df_interact = pd.read_csv('RAW_interactions.csv')
train_interact = pd.read_csv('interactions_train.csv')

In [5]:
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [6]:
df_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [7]:
train_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698901 entries, 0 to 698900
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    698901 non-null  int64  
 1   recipe_id  698901 non-null  int64  
 2   date       698901 non-null  object 
 3   rating     698901 non-null  float64
 4   u          698901 non-null  int64  
 5   i          698901 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 32.0+ MB


In [8]:
# join recipes on interactions to see which recipes are most popular
df = pd.merge(train_interact, df_recipes[['name','id']], left_on='recipe_id', right_on='id')
del df['id']

In [9]:
df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i,name
0,2046,4684,2000-02-25,5.0,22095,44367,flank steak with lime chipotle sauce
1,12882,4684,2002-03-13,5.0,10399,44367,flank steak with lime chipotle sauce
2,37758,4684,2002-06-17,5.0,4954,44367,flank steak with lime chipotle sauce
3,37636,4684,2003-05-08,5.0,831,44367,flank steak with lime chipotle sauce
4,54697,4684,2003-06-30,5.0,1147,44367,flank steak with lime chipotle sauce


# Collaborative Filtering

In [10]:
train_interact.shape

(698901, 6)

## User-based Collaborative Filtering

In [11]:
tmp = train_interact.sample(20000, random_state = 1)
tmp.loc[tmp.user_id == 229524]

Unnamed: 0,user_id,recipe_id,date,rating,u,i
157221,229524,181822,2006-08-16,4.0,279,86311
242912,229524,118362,2007-08-18,5.0,279,74778
257100,229524,61962,2007-10-07,5.0,279,120534
312330,229524,68846,2008-03-31,5.0,279,54216
626380,229524,153642,2012-06-03,5.0,279,145246
271373,229524,18914,2007-11-25,5.0,279,130507
219645,229524,25885,2007-05-26,2.0,279,56425


In [12]:
# resample 10,000 from training data due to memory limitations
tmp = train_interact.sample(20000, random_state = 1).reset_index()
user_ratings_table = tmp.pivot(index='user_id', columns='recipe_id', values='rating')

In [13]:
user_ratings_table[user_ratings_table.index == 229524].dropna(axis=1)

recipe_id,18914,25885,61962,68846,118362,153642,181822
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
229524,5.0,2.0,5.0,5.0,5.0,5.0,4.0


In [14]:
# print(user_ratings_table.shape)
user_ratings_table.iloc[:5,:15]

recipe_id,49,58,62,66,70,82,93,108,142,156,240,266,290,298,301
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1533,,,,,,,,,,,,,,,
1535,,,,,,,,,,,,,,,
1634,,,,,,,,,,,,,,,
1891,,,,,,,,,,,,,,,
1962,,,,,,,,,,,,,,,


In [15]:
## We need to normalize the ratings to deal with nulls
# Get the average rating for each user 
avg_ratings = user_ratings_table.mean(axis=1)

# Center each users ratings around 0
user_ratings_table_centered = user_ratings_table.sub(avg_ratings, axis=0)

# Fill in the missing data with 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

In [16]:
user_ratings_table_normed.iloc[:5,:15]

recipe_id,49,58,62,66,70,82,93,108,142,156,240,266,290,298,301
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### K-Nearest Neighbors (Users)

In [17]:
from sklearn.neighbors import KNeighborsRegressor

user_knn = KNeighborsRegressor(n_neighbors=5, metric='cosine')
similarities = cosine_similarity(user_ratings_table_normed)

user_cosine_similarity_df = pd.DataFrame(similarities, index = user_ratings_table_normed.index, columns = user_ratings_table_normed.index)
user_cosine_similarity_df.head()

user_id,1533,1535,1634,1891,1962,2178,2310,2312,2586,2595,2999,3111,3205,3288,4291,4439,4470,4740,4862,5060,5523,5672,6164,6258,6357,6512,6550,6651,6702,6923,7108,7308,7676,7802,8239,8377,8526,8527,8580,8606,...,2000555010,2000561332,2000594508,2000605688,2000607685,2000624233,2000689653,2000742562,2000758272,2000943999,2000964508,2000992191,2001040295,2001047423,2001102678,2001157535,2001224044,2001236401,2001241765,2001246372,2001278358,2001297534,2001329932,2001356926,2001359614,2001362355,2001365799,2001402443,2001415211,2001436530,2001472396,2001513060,2001527511,2001550810,2001595439,2001630864,2001704911,2001943610,2002015205,2002312797
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1535,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1962,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
user_similarity_series = user_cosine_similarity_df.loc[1533]

ordered_similarities = user_similarity_series.sort_values(ascending=False)

nearest_neighbors = ordered_similarities[1:6].index

print(nearest_neighbors)

Int64Index([198612, 197456, 197746, 197971, 198059], dtype='int64', name='user_id')


## Item-based Collaborative Filtering

In [19]:
items_ratings_table_normed = user_ratings_table_normed.T
items_ratings_table_normed.iloc[10721,0]

0.0

In [20]:
train_interact.loc[train_interact.user_id == 1533]

Unnamed: 0,user_id,recipe_id,date,rating,u,i
2215,1533,17338,2002-02-19,4.0,4913,41141
3981,1533,24375,2002-04-23,5.0,4913,127816
4274,1533,10721,2002-05-02,5.0,4913,100687
4406,1533,23891,2002-05-06,5.0,4913,120740
4890,1533,24136,2002-05-20,5.0,4913,5300
...,...,...,...,...,...,...
89751,1533,96621,2005-05-25,5.0,4913,7524
89877,1533,116223,2005-05-27,5.0,4913,83882
89882,1533,59135,2005-05-27,5.0,4913,11926
89909,1533,39499,2005-05-27,5.0,4913,129777


In [None]:
# Getting cosine similarities
similarities = cosine_similarity(items_ratings_table_normed)

# Wrap the similarities in a DataFrame
cosine_similarity_df = pd.DataFrame(similarities, index=items_ratings_table_normed.index, columns=items_ratings_table_normed.index)
cosine_similarity_df.iloc[:10,:10]

Let's find the similarity values for a specific recipe:

In [None]:
cosine_similarity_df.sample(1, random_state=1).index

In [None]:
# Find the similarity values for a specific recipe
cosine_similarity_series = cosine_similarity_df.loc[325480]

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

ordered_similarities.head()

In [None]:
cosine_similarity_df.loc[325480].describe()

Oof! Based on this method, there doesn't seem to be any recipes that are similar, thus people who like this certain recipe can't be given any recommendations. This is likely because this recipe hasn't been reviewed, which is also known as a ***cold start problem***.

Let's try again with another recipe:

In [None]:
cosine_similarity_df.loc[2886].describe()

Looks like our trusty 'best banana bread' recipe does have ratings (as expected since its the most rated recipe on the dataset).

In [None]:
# Find the similarity values for the banana bread recipe
cosine_similarity_series = cosine_similarity_df.loc[2886]

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

ordered_similarities.unique()

It looks like the collaborative filtering of items for our banana bread recipe lead to the following top 10 recommendations!

What this means is that those who liked the banana bread recipe are likely to enjoy the following 10 recipes also, based on the historical ratings of other users.

In [None]:
pd.merge(ordered_similarities.to_frame(), df_recipes[['name','id']],left_on=ordered_similarities.index,right_on='id').iloc[1:11]

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(metric='cosine', n_neighbors=10)

cosine_similarity_df.loc[2886]
ordered_similarities = cosine_similarity_df.loc[2886].sort_values(ascending=False)

nearest_neighbors = ordered_similarities[1:11].index

# Extract the ratings of the neighbors
neighbor_ratings = user_ratings_table.reindex(nearest_neighbors)

# Calculate the mean rating given by the users nearest neighbors
print(neighbor_ratings['Apollo 13 (1995)'].mean())

## Discussion: Item-based vs. User-based CF