In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from ast import literal_eval
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

We load our datasets, consisting of user data and recipe data. We will use the ratings that users gave, in combination with the recipes' ingredients, techniques and calories, to perform recommendations.

In [6]:
# Column legend for PP_users.csv:
#   u          = user_id
#   techniques = techniques used for items that were interacted with
#                (index is a technique, the number is a counter)
#   items      = item_ids of items that were interacted with
#   n_items    = number of items reviewed
#   ratings    = ratings for items reviewed
#   n_ratings  = number of ratings
df_users = pd.read_csv('Data/PP_users.csv')  # Note that there are no duplicates
df_users.head()

# How many users have only a handful of reviews?
number_of_users = len(df_users.index)
number_of_users_with_less_than_10_reviews = int((df_users["n_ratings"] < 10).sum())
number_of_users_with_less_than_5_reviews = int((df_users["n_ratings"] < 5).sum())

share_below_10 = round((number_of_users_with_less_than_10_reviews / number_of_users) * 100, 1)
share_below_5 = round((number_of_users_with_less_than_5_reviews / number_of_users) * 100, 1)

print(str(share_below_10) + "% of user have less than 10 reviews")
print(str(share_below_5) + "% of user have less than 5 reviews")


62.9% of user have less than 10 reviews
39.9% of user have less than 5 reviews


In [7]:
# Load the pre-processed recipe table; one row per recipe.
df_recipes = pd.read_csv('Data/PP_recipes.csv')
#Note that there are no duplicates
df_recipes.head()

# Column legend:
# id = recipe_id, i = recipe ID mapped to contiguous integers from 0,
# name_tokens = BPE-tokenized recipe name,
# ingredient_tokens = BPE-tokenized ingredients list (list of lists),
# steps_tokens = BPE-tokenized steps,
# techniques = list of techniques used in the recipe,
# calorie_level = either 0, 1 or 2, indicating how many calories the recipe contains,
# ingredient_ids = the ids of the ingredients used

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [8]:
# Reload the user table, keep only the user id plus the item/rating lists,
# and give those columns the names used in the rest of the notebook.
df_users = (
    pd.read_csv('Data/PP_users.csv')
    .drop(columns=['techniques', 'n_items', 'n_ratings'])
    .rename(columns={'u': 'user', 'items': 'item', 'ratings': 'rating'})
)

df_users.head()

Unnamed: 0,user,item,rating
0,0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
1,1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ..."
3,3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ..."
4,4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ..."


In [9]:
# The list columns are read from the CSV as strings; they have to be parsed into real
# lists before explode() can split them. Source:
# https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
# NOTE(review): this cell is not re-runnable as-is - once exploded, the cells no longer
# hold strings for literal_eval to parse.
for list_column in ('rating', 'item'):
    df_users[list_column] = df_users[list_column].apply(literal_eval)

# One row per (user, item, rating) triple; both lists are exploded in lock-step.
df_users = df_users.explode(['rating', 'item'], ignore_index=True)
df_users.head()

Unnamed: 0,user,item,rating
0,0,1118,5.0
1,0,27680,5.0
2,0,32541,5.0
3,0,137353,5.0
4,0,16428,5.0


In [10]:
# Drop the contiguous index and the tokenized text columns; only the recipe id,
# techniques, calorie level and ingredient ids remain.
df_recipes.drop(
    columns=['i', 'name_tokens', 'ingredient_tokens', 'steps_tokens'],
    inplace=True,
)

df_recipes.head()

Unnamed: 0,id,techniques,calorie_level,ingredient_ids
0,424415,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [11]:
# The list columns are read from the CSV as strings; parse them so explode() can split them. Source:
# https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
for list_column in ('techniques', 'ingredient_ids'):
    df_recipes[list_column] = df_recipes[list_column].apply(literal_eval)

# NOTE(review): the two explodes are applied one after the other, so every recipe ends up
# with one row per (techniques entry, ingredient id) pair - a cartesian product that keeps
# the original row label as a repeated index. Any later merge multiplies accordingly.
df_recipes = df_recipes.explode('techniques').explode('ingredient_ids')
df_recipes.head()

Unnamed: 0,id,techniques,calorie_level,ingredient_ids
0,424415,0,0,389
0,424415,0,0,7655
0,424415,0,0,6270
0,424415,0,0,1527
0,424415,0,0,3406


In [12]:
# Where are we removing users and recipes with few ratings?
# Do we still need the explode for anything?
# Should we change the features that we're using?

## Health Filter

In [46]:
# Note that this file is made by us so that only the nutritional info has to be loaded instead of a giant csv file.
# It is ';'-separated; ApplyHealthFilter below reads it through the `nutrition_data` global.
nutrition_data = pd.read_csv('Data/nutrition.csv', sep=';')
nutrition_data.head()

Unnamed: 0,id,nutrition
0,38,"[170.9, 3.0, 120.0, 1.0, 6.0, 6.0, 12.0]"
1,39,"[1110.7, 90.0, 81.0, 15.0, 126.0, 82.0, 28.0]"
2,40,"[311.1, 0.0, 308.0, 0.0, 0.0, 0.0, 27.0]"
3,41,"[536.1, 36.0, 128.0, 64.0, 58.0, 19.0, 21.0]"
4,43,"[437.9, 29.0, 170.0, 11.0, 13.0, 54.0, 19.0]"


We want to give the user a (tweakable) health filter, so they can filter their suggestions to only contain healthy recipes.

In [47]:
# This function returns the ids of the recipes that pass the health filter.
def ApplyHealthFilter(recipe_ids, limits=(.15, .35, .10, .25), debug_prints=False, nutrition=None):
    """Return the ids among ``recipe_ids`` whose nutrition profile is within ``limits``.

    Parameters
    ----------
    recipe_ids : list-like
        Recipe ids to check. Ids without a row in the nutrition table are silently skipped.
    limits : sequence of four numbers
        ``(max sugar, max sodium, min protein, max saturated fat)``. These are not grams but
        shares of the recipe's summed nutritional content (see the normalisation below).
        The sugar, sodium and saturated-fat shares must be strictly below their limit and
        the protein share strictly above its limit.
    debug_prints : bool
        When true, print the verdict (and the computed shares for rejected recipes)
        together with the row label of each recipe in the nutrition table.
    nutrition : pandas.DataFrame, optional
        Table with an ``id`` column and a ``nutrition`` column that holds, per recipe, the
        values ``[calories, total fat, sugar, sodium, protein, saturated fat, carbohydrates]``
        either as a list or as the string form of such a list. Defaults to the notebook's
        global ``nutrition_data``.

    Returns
    -------
    list
        Ids of the recipes that pass, in the row order of the nutrition table.
    """
    if nutrition is None:
        # Backward-compatible default: use the table loaded earlier in the notebook.
        nutrition = nutrition_data

    max_sugar = limits[0]
    max_sodium = limits[1]
    min_protein = limits[2]
    max_saturated_fat = limits[3]

    # Get the nutritional information for the relevant recipes
    recipes = nutrition.loc[nutrition['id'].isin(recipe_ids)]

    healthy_recipes = []
    for index, recipe_id, raw_values in zip(recipes.index, recipes['id'], recipes['nutrition']):
        # Convert the string version of the array into a proper list (already-parsed lists pass through).
        if isinstance(raw_values, str):
            nutrition_values = literal_eval(raw_values)
        else:
            nutrition_values = raw_values

        # Since the nutritional info is in absolute numbers instead of per 100 grams, we normalize
        # by the sum of everything except the calories (position 0). The floor of 0.01 avoids a
        # division by zero for recipes whose values are all zero.
        normalization_factor = max(sum(nutrition_values[1:]), 0.01)

        sugar = nutrition_values[2] / normalization_factor
        sodium = nutrition_values[3] / normalization_factor
        protein = nutrition_values[4] / normalization_factor
        saturated_fat = nutrition_values[5] / normalization_factor

        is_healthy = (
            sugar < max_sugar
            and sodium < max_sodium
            and protein > min_protein
            and saturated_fat < max_saturated_fat
        )

        if is_healthy:
            healthy_recipes.append(recipe_id)
            if debug_prints:
                print("Healthy: ", index)
        elif debug_prints:
            print("Unhealthy: ", index, sugar, sodium, protein, saturated_fat)

    return healthy_recipes


# Demo: run the filter on the items of 20 randomly drawn rating rows.
# NOTE(review): the sample is unseeded, so the printed ids differ between runs.
df_random = df_users.sample(n=20)
recipe_ids = df_random['item']
# No limits are passed, so the function's defaults apply: limits = [.15, .35, .10, .25]

# NOTE(review): `healhty_recipes` is misspelt; the name is kept because it is a notebook global.
healhty_recipes = ApplyHealthFilter(recipe_ids)
print(healhty_recipes) # prints healthy recipes ids

[47400, 134651, 151549]


Since we have a lot of high-dimensional data, we could make use of SVD to speed up computation.

In [60]:
# Load the raw (untokenized) recipe table: names, tags, steps, ingredients, nutrition, ...
recipe_data = pd.read_csv('Data/RAW_recipes.csv', sep=',')
recipe_data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [62]:
# Transform text into numerical data

# tags = vectorizer.fit_transform(tags)
# steps = vectorizer.fit_transform(steps)
# ingredients = vectorizer.fit_transform(ingredients)

# print(tags.shape)
# print(steps.shape)
# print(ingredients.shape)

In [51]:
# print(minutes)
# print(tags)
# print(nutrition)
# print(steps)
# print(ingredients)

In [65]:
# Text features per recipe: tags, cooking steps and ingredients.
# (minutes and nutrition are extracted as well but are not part of X yet.)
# X = [minutes, tags, nutrition, steps, ingredients]

recipe_data = pd.read_csv('Data/RAW_recipes.csv', sep=',')

# NOTE(review): `recipe_ids` re-binds the name used by the health-filter demo cell above.
recipe_ids = recipe_data['id']
minutes = recipe_data['minutes']
tags = recipe_data['tags']
nutrition = recipe_data['nutrition']
steps = recipe_data['steps']
ingredients = recipe_data['ingredients']

X = pd.DataFrame(data={'tags': tags, 'steps': steps, 'ingredients': ingredients})

# One text document per recipe: the three string columns concatenated, turned into a
# TF-IDF matrix restricted to the 500 highest-scoring vocabulary terms.
vectorizer = TfidfVectorizer(max_features=500)
recipe_text = X['tags'] + X['steps'] + X['ingredients']
tfidf_matrix = vectorizer.fit_transform(recipe_text)

# Reduce the TF-IDF matrix to 100 components and report how much variance they keep.
# NOTE(review): no random_state is set, so the components may differ slightly between runs.
svd = TruncatedSVD(n_components=100, n_iter=10)
recipe_text_reduced = svd.fit_transform(tfidf_matrix)

explained = svd.explained_variance_ratio_
print(explained)
print(explained.sum())

[0.00887291 0.03947365 0.03513896 0.0264154  0.02008977 0.01715533
 0.01501141 0.01363405 0.01270632 0.01189007 0.01054005 0.010048
 0.00980295 0.00898404 0.00859257 0.00819721 0.00781819 0.00759045
 0.00742047 0.00719979 0.0069808  0.00669773 0.00634785 0.00624961
 0.00615796 0.00606874 0.00595034 0.005728   0.00568062 0.00558469
 0.00545302 0.00540818 0.00525652 0.00524105 0.00505959 0.00503632
 0.00493007 0.00484593 0.00462655 0.00455392 0.00440325 0.00436626
 0.00423342 0.00417726 0.00410533 0.00408462 0.00401634 0.00394794
 0.00393414 0.00389133 0.00385075 0.00379887 0.0037172  0.00369417
 0.00367273 0.00366681 0.00356011 0.00354732 0.00350698 0.00343546
 0.00342588 0.00339978 0.00335398 0.00334216 0.00327553 0.00324976
 0.00322123 0.00318379 0.00318001 0.00313709 0.00304357 0.00302976
 0.0029859  0.00297324 0.00293149 0.00289477 0.00285407 0.00283808
 0.00282008 0.0028053  0.00277286 0.00276801 0.00272229 0.00271246
 0.00269225 0.00267459 0.00265962 0.00261751 0.00258542 0.002563

## 1. Individual Recommendations
kNN, Item-based collaborative filtering

In [13]:
# Draw 1000 random rows from the exploded rating table and from the exploded recipe table.
# NOTE(review): both samples are unseeded, so the printed rows change on every run.
users_sample = df_users.sample(n=1000)
recipes_sample = df_recipes.sample(n=1000)
print(users_sample)
print(recipes_sample)

        user    item rating
84322    208   45726    5.0
61316    157   91793    5.0
343313  1662   15052    3.0
6456      23   15076    5.0
11193     44    8561    5.0
...      ...     ...    ...
546660  6956  157037    5.0
6384      21   55912    5.0
185720   483   65230    5.0
382833  2216   25442    5.0
562453  7744   76662    5.0

[1000 rows x 3 columns]
            id techniques  calorie_level ingredient_ids
137413   81784          0              1           5006
35094   182438          0              1           6270
107454  279800          0              2           5006
94434   172733          0              1           6270
110020  308905          0              0           6906
...        ...        ...            ...            ...
76216   317341          0              2           1987
6379    211537          0              0           3269
26581   145381          0              0           7449
102365  105092          0              1           7470
68711     7685         

In [14]:
#USE ONLY HEALTHY RECIPES

#recipe_ids = users_sample['item']
#healthy_recipes = ApplyHealthFilter(recipe_ids, [.15, .35, .10, .25])
#print(healthy_recipes)

#recipes_sample = df_recipes.copy()
#for i in range(len(df_recipes)):
#    for j in range(len(healthy_recipes)):
#        isHealthy = False
#        print(df_recipes.at[i, 'id'])
#        if df_recipes.at[i, 'id'] == healthy_recipes[j]:
#            isHealthy = True
#            break
#        
#        if isHealthy != True:
#            recipes_sample.drop[i]
#recipes_sample

In [17]:
# Join every rating with the features of the rated recipe and with the raw recipe
# metadata, then sample the rows used to train the kNN model below.
#
# * df_users / df_recipes are renamed into local copies instead of being re-bound:
#   later cells (group selection, lenskit fit) still read df_users['item'].
# * The raw recipe table is taken from `recipe_data`, which is loaded earlier in the
#   notebook; no cell above this one defines `df_recipes_full`, so on a fresh
#   top-to-bottom run the name would otherwise be undefined here.
# NOTE(review): df_recipes was exploded on two list columns, so each recipe contributes one
# row per (technique entry, ingredient id) pair and this merge multiplies every rating by
# that amount. Only per-recipe columns are used downstream - consider de-duplicating on
# recipe_id before merging.
ratings_by_recipe = df_users.rename(columns={'item': 'recipe_id'})
recipe_features = df_recipes.rename(columns={'id': 'recipe_id'})
df = pd.merge(ratings_by_recipe, recipe_features, on='recipe_id')

df_recipes_full = recipe_data.rename(columns={'id': 'recipe_id'})
df2 = pd.merge(df, df_recipes_full, on='recipe_id')

# NOTE(review): unseeded sample; the model and its scores change on every run.
df2_sample = df2.sample(n=1000)
print(df2_sample)

                                                name  recipe_id  minutes  \
0         arriba   baked winter squash mexican style     137739       55   
1                   a bit different  breakfast pizza      31490       30   
2                          all in the kitchen  chili     112140      130   
3                                 alouette  potatoes      59389       45   
4                 amish  tomato ketchup  for canning      44061      190   
...                                              ...        ...      ...   
231632                                   zydeco soup     486161       60   
231633                              zydeco spice mix     493372        5   
231634                     zydeco ya ya deviled eggs     308080       40   
231635        cookies by design   cookies on a stick     298512       29   
231636  cookies by design   sugar shortbread cookies     298509       20   

        contributor_id   submitted  \
0                47892  2005-09-16   
1          

In [58]:
# Lift pandas' row and column display limits, then print the whole sample.
# NOTE(review): the options are global, so every later cell prints untruncated frames too.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df2_sample)

            user recipe_id rating techniques  calorie_level ingredient_ids  \
130232290    371     41561    5.0          0              2            840   
55384009    1272    154050    4.0          0              1           6220   
118345822   1487    133743    4.0          0              2           5412   
129643468   2163    147707    5.0          0              0           3635   
146704452    816    142440    4.0          0              2           3219   
158133753   2970    150234    5.0          0              1           5915   
19604277    7698     99787    5.0          0              1            590   
116572705   2623     81650    5.0          0              1           3203   
117721688   1737     89437    5.0          0              2           7962   
152897304   1278     98521    4.0          0              2           3550   
75318654     348     11058    5.0          0              0            289   
118884565   9133    136285    5.0          0              0     

In [54]:
# Features and target for the kNN model.
# The feature columns are selected by name rather than by position ([4, 7]) so that a
# change in the merged column layout raises a KeyError instead of silently training on
# the wrong columns.
X = df2_sample[['calorie_level', 'minutes']]  # training data
y = df2_sample['rating']  # target values, labels

# Ratings are stored as floats (5.0, 4.0, ...); the classifier gets integer class labels.
y = y.apply(np.int64)

print(X)
print(y)

           calorie_level  minutes
130232290              2       90
55384009               1       55
118345822              2       50
129643468              0       95
146704452              2       20
...                  ...      ...
162068542              0       30
64559062               2       30
162640035              0       45
3022129                1       45
70870412               1       70

[1000 rows x 2 columns]
130232290    5
55384009     4
118345822    4
129643468    5
146704452    4
            ..
162068542    4
64559062     5
162640035    5
3022129      4
70870412     5
Name: rating, Length: 1000, dtype: int64


130232290    5
55384009     4
118345822    4
129643468    5
146704452    4
Name: rating, dtype: int64

In [63]:
# Hold out a third of the sample for evaluation: 67% training, 33% testing.
# NOTE(review): the split is unseeded, so the scores below vary between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# 3-nearest-neighbour classifier on the two feature columns.
model = KNeighborsClassifier(n_neighbors=3)
knn = model.fit(X_train, y_train)

# Predictions for the data the model was fitted on and for the held-out data.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Show the training features and labels (the other frames were only printed while debugging).
print(X_train)
print(y_train)

           calorie_level  minutes
33905342               2       55
58811859               0      130
1547663                2       90
27932436               1      380
115753217              0       30
121814665              0       23
72046240               0       75
49233281               1      120
133974627              2       52
110667411              2       40
2799956                0       40
142996811              1       15
15561214               0       44
16945231               1       50
140041435              2       60
25049376               2        0
123486142              1       10
51556082               1       55
77060890               1       25
122862607              2       55
3022129                1       45
70091791               0       35
164957340              1       85
122706414              2       50
89046542               0       40
90742614               0       20
2329132                1       60
12033576               0        2
42403171      

In [75]:
# TODO compare expected outputs with actual outputs

# A single query recipe as [calorie level, minutes].
# Earlier trials: [1, 10] gives 4 and [1, 15] gives 4.
X_new = [[2, 55]]
y_predict = model.predict(X_new)

query_calorie_level, query_minutes = X_new[0]
print('For a recipes that has a calorie level of', query_calorie_level, 'and takes', query_minutes, 'minutes to cook, gets a rating of', y_predict[0])

For a recipes that has a calorie level of 2 and takes 55 minutes to cook, gets a rating of 5


In [76]:
# Compare the predictions against the true labels of both splits.
y_train_true = np.array(y_train.to_list())
y_test_true = np.array(y_test.to_list())

print('Accuracy on training data =', metrics.accuracy_score(y_train_true, y_pred_train))
print('Accuracy on testing data =', metrics.accuracy_score(y_test_true, y_pred_test))
print('')
# Per-class precision / recall / f1 on the held-out data.
print(metrics.classification_report(y_test_true, y_pred_test))

Accuracy on training data = 0.7208955223880597
Accuracy on testing data = 0.6515151515151515

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         9
           4       0.24      0.13      0.17        70
           5       0.74      0.84      0.79       245

    accuracy                           0.65       330
   macro avg       0.20      0.19      0.19       330
weighted avg       0.60      0.65      0.62       330



  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# Load the raw recipe table again under the name used by the group-recommendation cells below.
df_recipes_full = pd.read_csv('Data/RAW_recipes.csv')
#Note that there are no duplicates
df_recipes_full.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [None]:
# Keep only the recipe name, id and the step / ingredient counts.
df_recipes_full.drop(
    columns=[
        'minutes',
        'contributor_id',
        'submitted',
        'tags',
        'nutrition',
        'steps',
        'ingredients',
        'description',
    ],
    inplace=True,
)

# Use the same id column name as the rating table.
df_recipes_full = df_recipes_full.rename(columns={'id': 'item'})

df_recipes_full.head()

Unnamed: 0,name,item,n_steps,n_ingredients
0,arriba baked winter squash mexican style,137739,11,7
1,a bit different breakfast pizza,31490,9,6
2,all in the kitchen chili,112140,6,13
3,alouette potatoes,59389,11,11
4,amish tomato ketchup for canning,44061,5,8


In [None]:
# Pick a random group of 10 users among those with more than 30 ratings.
users_ratings = df_users.groupby(['user']).count()  # per user: number of entries in every other column
selected = users_ratings['rating'] > 30  # boolean mask: strictly more than 30 ratings
selected_users = users_ratings.loc[selected]
random_selected = selected_users.sample(n=10)  # NOTE(review): unseeded, the group changes per run

# groupby() made the user id the index; reset_index() turns it back into a column.
select_column_df = random_selected.reset_index()['user']
group_users = select_column_df.tolist()
print(group_users)

In [None]:
# Minimum number of ratings a recipe needs to be considered for the group. The threshold
# lives in one place so the filter and the printed message cannot drift apart (the message
# used to say 20 while the filter used 30).
MIN_RATINGS_PER_RECIPE = 30

# All ratings given by the members of the selected group.
group_ratings = df_users.loc[df_users['user'].isin(group_users)]

# df_recipes keeps the original row label as a repeated index, so the set of index labels
# has one entry per recipe.
total_recipes = set(df_recipes.index.tolist())

# Number of ratings per recipe; after count() every remaining column holds that number.
num_ratings_df = df_users.groupby(['item']).count()
considered_recipes = set(num_ratings_df.index[num_ratings_df['user'] >= MIN_RATINGS_PER_RECIPE])

group_seen_recipes = set(group_ratings['item'].tolist())
group_unseen_recipes = considered_recipes - group_seen_recipes

print('Total amount of recipes,', len(total_recipes))
print(f'Recipes that have at least {MIN_RATINGS_PER_RECIPE} ratings,', len(considered_recipes))
print('Recipes that have been rated by the currently selected group,', len(group_seen_recipes))
print('New recipes that the group didnt try yet,', len(group_unseen_recipes))

In [None]:
import itertools

# `display` is imported from the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

# User-user collaborative filtering: the first argument (15) is the neighbourhood size and
# min_nbrs=3 the minimum number of neighbours required for a prediction.
user_user = UserUser(15, min_nbrs=3)
recsys = Recommender.adapt(user_user)
recsys.fit(df_users)

# Predict a rating for every (group member, unseen recipe) pair.
group_unseen_df = pd.DataFrame(
    list(itertools.product(group_users, group_unseen_recipes)),
    columns=['user', 'item'],
)
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)

# remove the recipes we couldn't get a prediction for
group_unseen_df = group_unseen_df[group_unseen_df['predicted_rating'].notna()]
display(group_unseen_df.head(10))

In [None]:
# Min-max normalization of predicted_rating onto a 0 - 5 scale.
# NOTE(review): if every prediction is identical, maxVal - minVal is 0 and the column turns into NaN.
predicted = group_unseen_df['predicted_rating']
maxVal = predicted.max()
minVal = predicted.min()

# First scale to 0 - 1, then stretch to 0 - 5.
group_unseen_df['predicted_rating'] = ((predicted - minVal) / (maxVal - minVal)) * 5

display(group_unseen_df.head(10))

#### Least Misery strategy

In [None]:
# Least misery: a recipe is only as good as the lowest predicted rating within the group.
least_misery_df = group_unseen_df.groupby(['item']).min().reset_index()

# Attach the recipe name. join(..., on='item') matches the 'item' values against the INDEX
# of the other object, so the names have to be indexed by recipe id; joining the bare
# df_recipes_full['name'] column would look names up by row position instead.
# NOTE(review): assumes the ids in df_users['item'] and RAW_recipes 'id' share one id
# space - PP_recipes documents a separate contiguous mapping 'i'; confirm.
recipe_names = df_recipes_full.set_index('item')['name']
least_misery_df = least_misery_df.join(recipe_names, on='item')

# Flag the recipes that pass the health filter (1 = healthy, 0 = not).
items_lm = least_misery_df['item'].copy()
healthy_lm = ApplyHealthFilter(items_lm)
least_misery_df['healthy'] = least_misery_df['item'].isin(healthy_lm).astype(int)

least_misery_df = least_misery_df.sort_values(by="predicted_rating", ascending=False)[
    ['item', 'predicted_rating', 'name', 'healthy']]

# Only recommend healthy recipes.
least_misery_df = least_misery_df[least_misery_df.healthy == 1]
display(least_misery_df.head(10))

#### Most Pleasure strategy

In [None]:
# Most pleasure: a recipe is as good as the highest predicted rating within the group.
most_pleasure_df = group_unseen_df.groupby(['item']).max().reset_index()

# Flag the recipes that pass the health filter (1 = healthy, 0 = not).
items_mp = most_pleasure_df['item'].copy()
healthy_mp = ApplyHealthFilter(items_mp)
most_pleasure_df['healthy'] = most_pleasure_df['item'].isin(healthy_mp).astype(int)

# Attach the recipe name. join(..., on='item') matches the 'item' values against the INDEX
# of the other object, so the names have to be indexed by recipe id; joining the bare
# df_recipes_full['name'] column would look names up by row position instead.
# NOTE(review): assumes the ids in df_users['item'] and RAW_recipes 'id' share one id
# space - PP_recipes documents a separate contiguous mapping 'i'; confirm.
recipe_names = df_recipes_full.set_index('item')['name']
most_pleasure_df = most_pleasure_df.join(recipe_names, on='item')

# Best rated first; the index is renumbered in that order before the health filter is applied.
most_pleasure_df = most_pleasure_df.sort_values(by="predicted_rating", ascending=False).reset_index(drop=True)[
    ['item', 'predicted_rating', 'name', 'healthy']]

# Only recommend healthy recipes.
most_pleasure_df = most_pleasure_df[most_pleasure_df.healthy == 1]

display(most_pleasure_df.head(10))

#### Approval Voting

In [None]:
# Approval voting: every group member "votes" for a recipe when the predicted rating is above 3.5.
group_unseen_temp_df = group_unseen_df.copy()
group_unseen_temp_df['voted'] = (group_unseen_temp_df['predicted_rating'] > 3.5).astype(int)

# Per recipe: number of votes and summed predicted rating.
approval_df = group_unseen_temp_df.groupby(['item']).sum()
approval_df.drop('user', axis=1, inplace=True)

# Normalize rating by the group size.
# NOTE(review): pairs without a prediction were removed earlier, so a recipe can have fewer
# rows than group members; dividing by the full group size then understates its average.
approval_df['predicted_rating'] /= len(random_selected)

# Only keep the items with maximum approval
approval_df = approval_df[approval_df.voted == approval_df.voted.max()]
# Get the best rated items with max approval
approval_df = approval_df.sort_values(by="predicted_rating", ascending=False).reset_index()

# Attach the recipe name. join(..., on='item') matches the 'item' values against the INDEX
# of the other object, so the names have to be indexed by recipe id; joining the bare
# df_recipes_full['name'] column would look names up by row position instead.
# NOTE(review): assumes the ids in df_users['item'] and RAW_recipes 'id' share one id
# space - PP_recipes documents a separate contiguous mapping 'i'; confirm.
recipe_names = df_recipes_full.set_index('item')['name']
approval_df = approval_df.join(recipe_names, on='item')
display(approval_df.head(10))