# Initializiation

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/MSBA/BA890/data

Mounted at /gdrive
/gdrive/My Drive/MSBA/BA890/data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor


In [3]:
! ls

ingr_map.pkl		interactions_validation.csv  RAW_interactions.csv
interactions_test.csv	PP_recipes.csv		     RAW_recipes.csv
interactions_train.csv	PP_users.csv


In [4]:
df_recipes = pd.read_csv('RAW_recipes.csv')
df_interact = pd.read_csv('RAW_interactions.csv')
train_interact = pd.read_csv('interactions_train.csv')

In [5]:
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [6]:
df_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [7]:
train_interact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698901 entries, 0 to 698900
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    698901 non-null  int64  
 1   recipe_id  698901 non-null  int64  
 2   date       698901 non-null  object 
 3   rating     698901 non-null  float64
 4   u          698901 non-null  int64  
 5   i          698901 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 32.0+ MB


In [8]:
# join recipes on interactions to see which recipes are most popular
df = pd.merge(train_interact, df_recipes[['name','id']], left_on='recipe_id', right_on='id')
del df['id']

In [9]:
df.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i,name
0,2046,4684,2000-02-25,5.0,22095,44367,flank steak with lime chipotle sauce
1,12882,4684,2002-03-13,5.0,10399,44367,flank steak with lime chipotle sauce
2,37758,4684,2002-06-17,5.0,4954,44367,flank steak with lime chipotle sauce
3,37636,4684,2003-05-08,5.0,831,44367,flank steak with lime chipotle sauce
4,54697,4684,2003-06-30,5.0,1147,44367,flank steak with lime chipotle sauce


# Content-Based Recommendations

Content-based recommendations use the similarities of the recipes to provide recommendations. For example, if a user liked cooking an apple pie, they might also like blueberry pies on the account that they are both fruit-based pies. Another example could be that if someone liked cooking an italian dish like spaghetti bolognese, they might also like cooking lasagna.

The trouble with our current data is that there is no straightforward feature that could act as a comparison (e.g. *cuisine*). Therefore, we might need to do some NLP feature engineering.

Let's first try by using the name of the recipes.

In [10]:
# use sample of 5000 to minimize memory requirements
df_recipes2 = df_recipes.sample(5000, random_state=1)
df_recipes2.reset_index(drop=True, inplace=True)

### Using Count Vectorizer and Jaccard

In [11]:
# use Count Vectorizer to get count words in list of names
from sklearn.feature_extraction.text import CountVectorizer
V = CountVectorizer(stop_words = 'english') # initialize vectorizer with common stopwords

X = V.fit_transform(df_recipes2.name)
# V.get_feature_names()
# vectorizer.get_stop_words()

In [12]:
count_vect_df = pd.DataFrame(X.todense(), columns=V.get_feature_names())
count_vect_df

Unnamed: 0,007,10,101,11,12,123,140,15,1968,1998,1pt,1st,2008,21,24,2bleu,30,35,36,3pts,4th,4u,50,52,57,5fix,7points,a1,aarsi,abm,abricots,absolutely,acadia,acapulco,according,achiote,acorn,adams,adaptation,adapted,...,yellow,yemani,yemen,yemeni,yemista,yes,yigandes,yo,yoghurt,yogurt,york,yosemite,yucatan,yucca,yule,yum,yummiest,yummy,za,zabaglione,zalm,zard,zealand,zest,zesty,zinger,zings,zingy,zippy,ziti,zitronenvinaigrette,zo,zrigschntzlets,zucchini,zuppa,zurbian,zurich,zurie,zwina,zwt
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# set name of recipe as index 
count_vect_df = count_vect_df.set_index(df_recipes2.name)

In [14]:
from scipy.spatial.distance import pdist, squareform

# calculating jaccard distances for each recipe
jaccard_distances = pdist(count_vect_df.values, metric='jaccard')
print(jaccard_distances)

[1. 1. 1. ... 1. 1. 1.]


In [15]:
# Convert the distances to a square matrix
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Wrap the array in a pandas DataFrame
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=count_vect_df.index, columns=count_vect_df.index)

In [16]:
jaccard_similarity_df.sample(3, random_state=1)

name,tomato stuffed roasted red peppers,almond cake from albufeira portugal,bean burritos simple for camping,potato gorgonzola gratin,orange glazed barbecued chicken,ghost pepper fresh salsa,musalaydaar baingun,giada de laurentiis fettuccine alfredo,zesty cajun skirt steak,penne with rustic lentil sauce,lemon parmigiano artichoke casserole,overnight cinnamon swirl cranberry french toast,southwest rub,shallow fry buttermilk pecan chicken,awesome bbq pulled pork no smoker or barbecue needed,raspberry poke cake,zippy steak,chutney glazed chicken,taco chicken wings,rachael ray s tilapia with tomatillo sauce,herbed balsamic chicken breasts,twice baked potatoes,swiss cheesy cauliflower,pressure cooker golden mushroom beef stew,cherry filled crescent rolls,corned beef dinner in the crock pot,lennie and donna s souper rice curry,deep fried shrimp cakes taud man goong,chocolate gingerbread cookies,spanish spice rubbed rib eye with sherry vinegar steak sauce,emeril s con queso,artichoke and crab dip,pumpkin curry,stuffed chicken breasts with brie basil and sun dried tomato,crab tampico,rosemary garlic seasoning,strawberry tofu mousse,amish brown sugar pie,broccoli with lemon almond butter,buster brown cake with rich vanilla icing,...,cornmeal pound cake,buffalo chicken fritos pie,border paella,black forest chocolate mousse shots,beer bacon potato soup aka man soup,mushroom bread pudding,daddy s barley casserole,aloha salmon,melt and mix chocolate chunk mud cake,spicy ranch chicken wings,cream of chanterelle soup,apricot pork medallions,tequila lime marinade,lancaster county pa tapioca recipe,marinated beef tenderloin sandwiches,british yogurt cheese,sweet potato black bean salad,moscow bobsled shot drink you decide,vegetable breakfast pizza,cruzan cherry pop,easy greek style string green beans,awesome spinach quiche,lazy stuffed cabbage casserole,yum yum bars,butter tart muffins,roast butternut squash with lemon and mustard,homemade eggnog,grandma s city chicken,ravioli with prosciutto roma tomato and sage,pollis house dressing,cinnamon date cake,power cookies,chili roasted salmon,garlic and herb bread france,peach and papaya salsa,gourmet magazine s asian cucumber ribbon salad,macadamia toffee chip cookies,peel and eat shrimp with spicy cocktail sauce,tapenade in 5 minutes,easy pan con tomato
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
nana s chicken seafood gumbo courtesy the neelys,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.125,0.125,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
apricot and orange cream cream aux abricots et oranges,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
easy healthy weeknight tuna tomato skillet,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714


Looking at sample of the recipes, it seems that we can find similarities already. 

'**nana's chicken seafood gumbo courtesy the neelys**' is similar to '**orange glazed barbecued chicken**' likely due to presence of the word 'chicken'

Let's see what else might be recommended if we cooked orange glazed barbecue chicken:

In [17]:
jaccard_similarity_df['orange glazed barbecued chicken'].sort_values(ascending=False).to_frame().head(10)

Unnamed: 0_level_0,orange glazed barbecued chicken
name,Unnamed: 1_level_1
orange glazed barbecued chicken,1.0
orange glazed breaded chicken,0.6
simply yummy orange glazed chicken wings,0.428571
marinated barbecued chicken,0.4
orange piccata chicken,0.4
chutney glazed chicken,0.4
soy glazed chicken thighs,0.333333
junior league barbecued chicken,0.333333
chinese zesty orange chicken,0.333333
pork with orange glazed onions,0.333333


For **orange glazed barbecued chicken**, this content-based recommender using the names of the recipes would recommend the following top 3 options:
1. orange glazed breaded chicken
2. simply yummy orange glazed chicken wings
3. marinated barbecued chicken

### Using TF-IDF and Cosine similarity

Now let's try using a different vectorizer and similarity metric. 

Without getting into the math, let's breakdown the main differenc between the vectorizers: unlike count vectorizer, TF-IDF takes into account how frequent the word appears in the bag of words. For example, if the word 'best' appears often in the name of the recipe, less weight is given to the vector score as opposed to somethine less frequent like 'glazed')

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
V = TfidfVectorizer(stop_words = 'english') # initialize vectorizer with common stopwords

X = V.fit_transform(df_recipes2.name)

In [19]:
tfidf_df = pd.DataFrame(X.todense(), columns=V.get_feature_names())
tfidf_df

Unnamed: 0,007,10,101,11,12,123,140,15,1968,1998,1pt,1st,2008,21,24,2bleu,30,35,36,3pts,4th,4u,50,52,57,5fix,7points,a1,aarsi,abm,abricots,absolutely,acadia,acapulco,according,achiote,acorn,adams,adaptation,adapted,...,yellow,yemani,yemen,yemeni,yemista,yes,yigandes,yo,yoghurt,yogurt,york,yosemite,yucatan,yucca,yule,yum,yummiest,yummy,za,zabaglione,zalm,zard,zealand,zest,zesty,zinger,zings,zingy,zippy,ziti,zitronenvinaigrette,zo,zrigschntzlets,zucchini,zuppa,zurbian,zurich,zurie,zwina,zwt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# set name of recipe as index 
tfidf_df = tfidf_df.set_index(df_recipes2.name)

In [21]:
from scipy.spatial.distance import pdist, squareform

# calculating jaccard distances for each recipe
cosine_distances = pdist(tfidf_df.values, metric='cosine')
print(cosine_distances)

[1. 1. 1. ... 1. 1. 1.]


In [22]:
# Convert the distances to a square matrix
cosine_similarity_array = 1 - squareform(cosine_distances)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=count_vect_df.index, columns=count_vect_df.index)

In [23]:
cosine_similarity_df.sample(3, random_state=1)

name,tomato stuffed roasted red peppers,almond cake from albufeira portugal,bean burritos simple for camping,potato gorgonzola gratin,orange glazed barbecued chicken,ghost pepper fresh salsa,musalaydaar baingun,giada de laurentiis fettuccine alfredo,zesty cajun skirt steak,penne with rustic lentil sauce,lemon parmigiano artichoke casserole,overnight cinnamon swirl cranberry french toast,southwest rub,shallow fry buttermilk pecan chicken,awesome bbq pulled pork no smoker or barbecue needed,raspberry poke cake,zippy steak,chutney glazed chicken,taco chicken wings,rachael ray s tilapia with tomatillo sauce,herbed balsamic chicken breasts,twice baked potatoes,swiss cheesy cauliflower,pressure cooker golden mushroom beef stew,cherry filled crescent rolls,corned beef dinner in the crock pot,lennie and donna s souper rice curry,deep fried shrimp cakes taud man goong,chocolate gingerbread cookies,spanish spice rubbed rib eye with sherry vinegar steak sauce,emeril s con queso,artichoke and crab dip,pumpkin curry,stuffed chicken breasts with brie basil and sun dried tomato,crab tampico,rosemary garlic seasoning,strawberry tofu mousse,amish brown sugar pie,broccoli with lemon almond butter,buster brown cake with rich vanilla icing,...,cornmeal pound cake,buffalo chicken fritos pie,border paella,black forest chocolate mousse shots,beer bacon potato soup aka man soup,mushroom bread pudding,daddy s barley casserole,aloha salmon,melt and mix chocolate chunk mud cake,spicy ranch chicken wings,cream of chanterelle soup,apricot pork medallions,tequila lime marinade,lancaster county pa tapioca recipe,marinated beef tenderloin sandwiches,british yogurt cheese,sweet potato black bean salad,moscow bobsled shot drink you decide,vegetable breakfast pizza,cruzan cherry pop,easy greek style string green beans,awesome spinach quiche,lazy stuffed cabbage casserole,yum yum bars,butter tart muffins,roast butternut squash with lemon and mustard,homemade eggnog,grandma s city chicken,ravioli with prosciutto roma tomato and sage,pollis house dressing,cinnamon date cake,power cookies,chili roasted salmon,garlic and herb bread france,peach and papaya salsa,gourmet magazine s asian cucumber ribbon salad,macadamia toffee chip cookies,peel and eat shrimp with spicy cocktail sauce,tapenade in 5 minutes,easy pan con tomato
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
nana s chicken seafood gumbo courtesy the neelys,0.0,0.0,0.0,0.0,0.052604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040654,0.0,0.0,0.0,0.059142,0.060595,0.0,0.050469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035245,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.046626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
apricot and orange cream cream aux abricots et oranges,0.0,0.0,0.0,0.0,0.12326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181046,0.18701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
easy healthy weeknight tuna tomato skillet,0.133868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09345,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318996


As seen here, **nana's chicken seafood gumbo** is less similar to the **orange glazed barbecued chicken**, likely because the word 'chicken', which is where the similarity lies, appears many times in the bag of words (recipe names).

In [24]:
cosine_similarity_df['orange glazed barbecued chicken'].sort_values(ascending=False).to_frame().head(10)

Unnamed: 0_level_0,orange glazed barbecued chicken
name,Unnamed: 1_level_1
orange glazed barbecued chicken,1.0
orange glazed breaded chicken,0.590422
pork with orange glazed onions,0.540511
marinated barbecued chicken,0.51429
simply yummy orange glazed chicken wings,0.470644
chutney glazed chicken,0.452504
orange pan glazed tempeh recipe,0.428114
southern style honey barbecued chicken,0.407332
barbecued spareribs,0.395633
oven barbecued steak,0.39297


With the TF-IDF vectorizer and cosine similarity, the recommendations change a little.

### Using TF-IDF on ingredients instead of name

In [25]:
df_recipes2.ingredients

0       ['red peppers', 'plum tomatoes', 'green onion'...
1       ['all-purpose flour', 'granulated sugar', 'bak...
2       ['refried beans', 'burrito-size flour tortilla...
3       ['baking potatoes', 'salt and black pepper', '...
4       ['orange juice', 'salt', 'red pepper flakes', ...
                              ...                        
4995    ['seasoned rice vinegar', 'sugar', 'soy sauce'...
4996    ['unsalted butter', 'granulated sugar', 'light...
4997    ['large shrimp', 'kosher salt', 'sugar', 'wate...
4998            ['olive', 'olive oil', 'lemon, juice of']
4999    ['peasant bread', 'olive oil', 'tomatoes', 'ga...
Name: ingredients, Length: 5000, dtype: object

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# # nulls present in description of 97 recipes, we're dropping those recipes 
# df_recipes2_no_nulls = df_recipes2.dropna(axis=0).reset_index()

# # sampling again because TF-IDF with bigrams on more words will be too much on RAM
# df_recipes2_no_nulls = df_recipes2_no_nulls.sample(500, random_state=1)

V = TfidfVectorizer(stop_words = 'english', analyzer=lambda d: d.split(', '))
X = V.fit_transform(df_recipes2.ingredients)

In [27]:
tfidf_df = pd.DataFrame(X.todense(), columns=V.get_feature_names())
tfidf_df

Unnamed: 0,"""better 'n peanut butter spread""","""campbell's condensed cream of celery soup""","""campbell's condensed tomato soup""","""campbell's cream of mushroom soup""","""campbell's tomato soup""","""captain morgan's spiced rum""","""coleman's dry mustard""","""confectioners' sugar""","""confectioners' sugar""]","""devil's food cake mix""","""emeril's original essence""","""fleischmann's active dry yeast""","""frank's red hot sauce""","""goat's milk""","""hellmann's mayonnaise""","""i can't believe it's not butter fat free""","""lawry's seasoned salt""","""libby's canned pumpkin""","""m&m's plain chocolate candy""","""m&m's plain chocolate candy""]","""mccormick's montreal brand steak seasoning""","""mccormick's montreal brand steak seasoning""]","""mccormick's salt-free all-purpose seasoning""","""o'brien frozen potatoes""","""smucker's hot dark chocolate microwaveable topping""","""sweet 'n low""]","""taco bell home originals thick 'n chunky salsa""]","""yve's veggie ground round""","""za'atar spice mix""","""za'atar spice mix""]",'1% fat buttermilk','1% fat cottage cheese','1% fat cottage cheese'],'1% low-fat milk','2% cheddar cheese','2% evaporated milk','2% low-fat milk','2% low-fat milk'],'35% cream','7-inch flour tortillas',...,['whole turkey breast',['whole turkey',['whole wheat bread',['whole wheat flour',['whole wheat pastry flour',['whole wheat spaghetti',['whole wheat tortilla',['wholemeal self-rising flour',['wide egg noodles',['wild rice',['wine vinegar',['winter squash',['wonton wrappers',['worcestershire sauce',['wraps',['x-rated fusion liqueur',['yeast',['yellow bell pepper',['yellow bell peppers',['yellow cake mix with pudding',['yellow cake mix',['yellow cornmeal',['yellow onion',['yellow onions',['yellow split peas',['yellow squash',['yigandes beans',['yogurt',['yukon gold potato',['yukon gold potatoes',['ziploc bags',['ziti pasta',['zucchini',juice and zest of',juice and zest of'],juice of',juice of'],rind of',zest of',zest of']
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.533093,0.0,0.0,0.0


In [28]:
# set name of recipe as index 
tfidf_df = tfidf_df.set_index(df_recipes2.name)

In [None]:
from scipy.spatial.distance import pdist, squareform

# calculating cosine distances for each recipe
cosine_distances = pdist(tfidf_df.values, metric='cosine')
print(cosine_distances)

In [None]:
# Convert the distances to a square matrix
cosine_similarity_array = 1 - squareform(cosine_distances)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=count_vect_df.index, columns=count_vect_df.index)

In [None]:
cosine_similarity_df.sample(3, random_state=1)

In [None]:
cosine_similarity_df['orange glazed barbecued chicken'].sort_values(ascending=False).to_frame().head(10)

Oddly enough, using ingredients as the distance metric doesn't pay off as well. This is likely because the more 'unique' ingredients, which would be scored higher with TF-IDF, doesn't necessarily mean the most similar recipes.

### Using Count Vectorizer on ingredients

In [None]:
# # nulls present in description of 97 recipes, we're dropping those recipes 
# df_recipes2_no_nulls = df_recipes2.dropna(axis=0).reset_index()

# # sampling again because TF-IDF with bigrams on more words will be too much on RAM
# df_recipes2_no_nulls = df_recipes2_no_nulls.sample(500, random_state=1)

V = CountVectorizer(stop_words = 'english', analyzer=lambda d: d.split(', '))
X = V.fit_transform(df_recipes2.ingredients)

In [None]:
count_vect_df = pd.DataFrame(X.todense(), columns=V.get_feature_names())
count_vect_df

In [None]:
# set name of recipe as index 
count_vect_df = count_vect_df.set_index(df_recipes2.name)

In [None]:
from scipy.spatial.distance import pdist, squareform

# calculating cosine distances for each recipe
cosine_distances = pdist(count_vect_df.values, metric='cosine')
print(cosine_distances)

In [None]:
# Convert the distances to a square matrix
cosine_similarity_array = 1 - squareform(cosine_distances)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=count_vect_df.index, columns=count_vect_df.index)

In [None]:
cosine_similarity_df.sample(3, random_state=1)

In [None]:
cosine_similarity_df['orange glazed barbecued chicken'].sort_values(ascending=False).to_frame().head(10)

Using Count Vectorizer still leads to the same result, whereby the recipes don't look as similar as they would if we used the names of the recipes instead.

### Creating a User Profile

To go a bit off a tangent, let's see how we could implement these kind of recommendations based on user preferences rather than characteristics of the recipe. For example, if User A likes 3 recipes, we'll aggregate the vectorized scores of those 3 recipes and see which recipes are most similar to User A's preferences.

For this instance, let's use the preferences of a specific user already in our dataset. We'll use a random user with high number of interactions.

In [None]:
train_interact.groupby('user_id').count().sort_values('recipe_id', ascending=False).head(50).sample(1, random_state=1)

# In this case User 222564

In [None]:
train_interact.loc[train_interact.user_id == 222564].head(5)

Let's only use the recipes the user likes; in this case, recipes he score 4 or above.

In [None]:
df_user = train_interact.loc[(train_interact.user_id == 222564) & (train_interact.rating >= 4)]
df_user.head()

In [None]:
user_recipes = [recipe for recipe in df_user.recipe_id]

Due to memory limitations, we'll have to subset the recipes dataframe again, but this time make sure to include the 1600+ recipes that user 222564 likes.

In [None]:
tmp = (df_recipes.loc[~df_recipes.id.isin(user_recipes)].sample(10000, random_state=1))
df_recipes3 = pd.concat([tmp, df_recipes.loc[df_recipes.id.isin(user_recipes)]])

In [None]:
df_recipes3

In [None]:
V = TfidfVectorizer(stop_words = 'english') # initialize vectorizer with common stopwords

X = V.fit_transform(df_recipes3.name)

In [None]:
tfidf_df = pd.DataFrame(X.todense(), columns=V.get_feature_names())
tfidf_df

In [None]:
# set name of id as index this time so we can refer to the specific recipes later
tfidf_df = tfidf_df.set_index(df_recipes3.id)

In [None]:
# create tfidf_df of recipes only user has liked
tfidf_df.loc[tfidf_df.index.isin(user_recipes)]

In [None]:
# aggregate these scores by averaging them
user_profile = tfidf_df.loc[tfidf_df.index.isin(user_recipes)].mean(axis=0)
print(user_profile)

In [None]:
# drop recipes that user has seen before
non_user_recipes = tfidf_df.drop(user_recipes, axis=0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_array = cosine_similarity(user_profile.values.reshape(1, -1), non_user_recipes)
similarity_df = pd.DataFrame(similarity_array.T, index=non_user_recipes.index, columns=["similarity_score"])
similarity_df.head()

In [None]:
# add name information for readibility and sort similarity scores
user_recommnedations = pd.merge(similarity_df, df_recipes[['id','name']], how='left', left_on=similarity_df.index, right_on='id')
user_recommnedations.set_index('name').drop(columns=['id']).sort_values('similarity_score', ascending=False).head(10)

Using the content-based recommendations on the recipe names, User 222564 would likely enjoy the 10 dishes listed above.

They all seem to resemble salads and salad dressings which implies that the user is a big salad eater, specifically chicken salads!