In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from ast import literal_eval
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

We load our datasets: the user data and the recipe data. We will combine the ratings that users gave with each recipe's ingredients, techniques and calorie level to make recommendations.

In [None]:
# Column guide for PP_users.csv:
# u = user_id,
# techniques = techniques used for items that were interacted with (index is a technique with the number being a counter),
# items = item_ids of items that were interacted with,
# n_items = number of items reviewed,
# ratings = ratings for items reviewed,
# n_ratings = number of ratings
df_users = pd.read_csv('Data/PP_users.csv')
#Note that there are no duplicates

number_of_users = len(df_users.index)
number_of_users_with_less_than_10_reviews = int((df_users["n_ratings"] < 10).sum())
number_of_users_with_less_than_5_reviews = int((df_users["n_ratings"] < 5).sum())

# Report the share of sparse users once per threshold instead of duplicating the formula.
for review_threshold, user_count in ((10, number_of_users_with_less_than_10_reviews),
                                     (5, number_of_users_with_less_than_5_reviews)):
    percentage = round((user_count / number_of_users) * 100, 1)
    print(str(percentage) + "% of user have less than " + str(review_threshold) + " reviews")

# Keep the preview as the LAST expression of the cell; in the middle of a cell
# its value is silently discarded and never rendered.
df_users.head()


In [None]:
# Column guide for PP_recipes.csv:
# id = recipe_id, i = recipe ID mapped to contiguous integers from 0,
# name_tokens = BPE-tokenized recipe name,
# ingredient_tokens = BPE-tokenized ingredients list (list of lists),
# steps_tokens = BPE-tokenized steps,
# techniques = list of techniques used in recipe,
# calorie_level = either a 0, 1 or 2 indicating how many calories it contains,
# ingredient_ids = the ids of the ingredients used
recipes_csv_path = 'Data/PP_recipes.csv'
df_recipes = pd.read_csv(recipes_csv_path)
# Note that there are no duplicates
df_recipes.head()

In [None]:
# Reload the user table, keep only the user id, the interacted items and their
# ratings, and rename those columns to 'user', 'item' and 'rating'.
df_users = (
    pd.read_csv('Data/PP_users.csv')
    .drop(columns=['techniques', 'n_items', 'n_ratings'])
    .rename(columns={'u': 'user', 'items': 'item', 'ratings': 'rating'})
)

df_users.head()

In [None]:
# The list columns are stored as strings in the CSV and must be parsed before explode can work, source: https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
# Only strings are parsed, so re-running this cell on already-processed data is a
# no-op instead of a literal_eval error.
for list_column in ('rating', 'item'):
    df_users[list_column] = df_users[list_column].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value)

# One row per (user, item, rating) triple.
df_users = df_users.explode(['rating', 'item'], ignore_index=True)

# explode leaves the exploded columns with dtype `object`; convert them back to
# numeric dtypes so downstream numeric code does not have to cope with objects.
df_users['rating'] = pd.to_numeric(df_users['rating'])
df_users['item'] = pd.to_numeric(df_users['item'])
df_users.head()

In [None]:
# Discard the contiguous-index column and the three BPE-token columns in one call.
unused_recipe_columns = ['i', 'name_tokens', 'ingredient_tokens', 'steps_tokens']
df_recipes = df_recipes.drop(columns=unused_recipe_columns)

df_recipes.head()

In [None]:
# The list columns arrive as strings and have to be parsed before explode works, source: https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
for list_column in ('techniques', 'ingredient_ids'):
    df_recipes[list_column] = df_recipes[list_column].apply(literal_eval)

# Explode one column after the other: every recipe ends up with one row per
# (technique entry, ingredient id) combination, and rows keep their original index label.
df_recipes = df_recipes.explode('techniques').explode('ingredient_ids')
df_recipes.head()

In [None]:
# Where are we removing users and recipes with few ratings?
# Do we still need the explode for anything?
# Should we change the features that we're using?

## Health Filter

In [None]:
# This file was made by us so that only the nutritional info has to be loaded
# instead of a giant csv file; it is semicolon-separated.
nutrition_csv_path = 'Data/nutrition.csv'
nutrition_data = pd.read_csv(nutrition_csv_path, delimiter=';')
nutrition_data.head()

We want to give the user a (tweakable) health filter, so they can filter their suggestions to only contain healthy recipes.

In [None]:
# Returns the ids of the recipes that pass the (tweakable) health limits.
def ApplyHealthFilter(recipe_ids, limits=(.15, .35, .10, .25), debug_prints=False, nutrition=None):
    """Return the ids among ``recipe_ids`` whose nutrition profile passes ``limits``.

    Parameters
    ----------
    recipe_ids : iterable
        Recipe ids to check (anything accepted by ``Series.isin``).
    limits : sequence of 4 floats
        ``(max sugar, max sodium, min protein, max saturated fat)``. These are not
        grams but shares of the recipe's summed nutrition values (see below).
        The default is a tuple so the default object can never be mutated.
    debug_prints : bool
        When True, print the row label (and the shares for rejected recipes).
    nutrition : pandas.DataFrame, optional
        Table with an ``id`` column and a ``nutrition`` column holding
        ``[calories, total fat, sugar, sodium, protein, saturated fat, carbohydrates]``,
        either as a list or as its string representation. Defaults to the
        notebook-level ``nutrition_data`` frame, which keeps existing calls working.

    Returns
    -------
    list
        Ids of the recipes that satisfy all four limits, in table order.
    """
    if nutrition is None:
        nutrition = nutrition_data

    max_sugar = limits[0]
    max_sodium = limits[1]
    min_protein = limits[2]
    max_saturated_fat = limits[3]

    # Get the nutritional information for the relevant recipes
    recipes = nutrition.loc[nutrition['id'].isin(recipe_ids)]

    healthy_recipes = []
    # Iterate over just the two needed columns instead of building a Series per row.
    for index, recipe_id, raw_values in zip(recipes.index, recipes['id'], recipes['nutrition']):
        # Convert the string version of the array into a proper list when needed.
        nutrition_values = literal_eval(raw_values) if isinstance(raw_values, str) else list(raw_values)

        # The nutritional info is in absolute numbers instead of per 100 grams, so
        # normalize by the sum of everything except the calories (index 0).
        # The 0.01 floor avoids a division by zero for all-zero recipes.
        normalization_factor = max(sum(nutrition_values[1:]), 0.01)

        sugar = nutrition_values[2] / normalization_factor
        sodium = nutrition_values[3] / normalization_factor
        protein = nutrition_values[4] / normalization_factor
        saturated_fat = nutrition_values[5] / normalization_factor

        is_healthy = (sugar < max_sugar and sodium < max_sodium
                      and protein > min_protein and saturated_fat < max_saturated_fat)

        if is_healthy:
            healthy_recipes.append(recipe_id)
            if debug_prints:
                print("Healthy: ", index)
        elif debug_prints:
            print("Unhealthy: ", index, sugar, sodium, protein, saturated_fat)

    return healthy_recipes


# Smoke test of the health filter on 20 randomly chosen interactions.
# A fixed random_state makes the printed result reproducible across runs.
df_random = df_users.sample(n=20, random_state=42)
recipe_ids = df_random['item']
# limits = [.15, .35, .10, .25]

healhty_recipes = ApplyHealthFilter(recipe_ids)
print(healhty_recipes)

Since we have a lot of high-dimensional data, we could use SVD (singular value decomposition) to reduce its dimensionality and speed up computation.

In [3]:
# Raw recipe table; the file is comma-separated, which is read_csv's default.
raw_recipes_csv_path = 'Data/RAW_recipes.csv'
recipe_data = pd.read_csv(raw_recipes_csv_path)
recipe_data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [24]:
# Pull out the five raw-recipe columns that serve as feature candidates.
minutes, tags, nutrition, steps, ingredients = (
    recipe_data[column_name]
    for column_name in ('minutes', 'tags', 'nutrition', 'steps', 'ingredients')
)


In [27]:
# Transform text into numerical data.
# Each text column gets its OWN vectorizer: refitting a single shared instance
# throws away the vocabulary learned for the previous column, so the earlier
# matrices could no longer be interpreted or extended to new recipes.
vectorizers = {
    column_name: TfidfVectorizer(max_features=10000)
    for column_name in ('tags', 'steps', 'ingredients')
}

# Read straight from recipe_data rather than from the `tags`/`steps`/`ingredients`
# names that this cell overwrites; this keeps the cell safe to re-run.
tags = vectorizers['tags'].fit_transform(recipe_data['tags'])
steps = vectorizers['steps'].fit_transform(recipe_data['steps'])
ingredients = vectorizers['ingredients'].fit_transform(recipe_data['ingredients'])

# Same end state as before for this name: the vectorizer fitted on the ingredients.
vectorizer = vectorizers['ingredients']

print(tags.shape)
print(steps.shape)
print(ingredients.shape)

(231637, 593)
(231637, 10000)
(231637, 4213)


In [34]:
# print(minutes)
# print(tags)
# print(nutrition)
# print(steps)
# print(ingredients)

In [39]:
# X contains minutes to cook, tags, nutritional values, cooking steps, ingredients
# X = [minutes, tags, nutrition, steps, ingredients]
#
# NOTE(review): this experiment is disabled. The output kept below this cell shows
# the "Expected 2D array, got 1D array" ValueError: X was a plain Python list
# holding two Series and three sparse matrices, which is not a single 2-D feature
# matrix. A likely fix is to parse `nutrition` into numeric columns and stack all
# five blocks column-wise (e.g. with scipy.sparse.hstack) before calling svd.fit
# -- TODO confirm and re-enable.

# svd = TruncatedSVD(n_components=100, n_iter=10)
# svd.fit(X)
# print(svd.explained_variance_ratio_)
# print(svd.explained_variance_ratio_.sum())

  array = np.asarray(array, order=order, dtype=dtype)


ValueError: Expected 2D array, got 1D array instead:
array=[0          55
 1          30
 2         130
 3          45
 4         190
          ...
 231632     60
 231633      5
 231634     40
 231635     29
 231636     20
 Name: minutes, Length: 231637, dtype: int64
 <231637x593 sparse matrix of type '<class 'numpy.float64'>'
 	with 6351801 stored elements in Compressed Sparse Row format>
 0               [51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]
 1           [173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]
 2          [269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]
 3           [368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]
 4           [352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]
                              ...
 231632    [415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]
 231633          [14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]
 231634           [59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]
 231635      [188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]
 231636       [174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]
 Name: nutrition, Length: 231637, dtype: object
 <231637x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 13878050 stored elements in Compressed Sparse Row format>
 <231637x4213 sparse matrix of type '<class 'numpy.float64'>'
 	with 3612449 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## 1. Individual Recommendations

In [None]:
# Work on a 10,000-row sample of the interactions; the fixed random_state makes
# the experiment below reproducible across kernel restarts.
df_random = df_users.sample(n=10000, random_state=42)

In [None]:
# Preview the inputs instead of printing the whole frames.
print(df_users.head())
print(df_recipes.head())

# Features are the user id and item id; cast explicitly because the exploded
# columns can carry dtype `object`.
X = df_random.drop(columns='rating').apply(pd.to_numeric)  # training data
y = pd.to_numeric(df_random['rating'])  # target values
# TODO take into account df_recipes

# 67% training, 33% testing; seeded so the reported scores are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train)  # y_pred_train predicts X_train
y_pred_test = knn.predict(X_test)

# The predictions are floats because KNeighborsRegressor averages the ratings of
# the 3 nearest neighbours; it does not pick one of the rating classes.
print(y_pred_test)
print(y_test)

# Regression metrics are the natural fit for a regressor.
print('MAE on training data =', metrics.mean_absolute_error(y_train, y_pred_train))
print('MAE on testing data =', metrics.mean_absolute_error(y_test, y_pred_test))
print('RMSE on testing data =', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test)))
print('')

# accuracy_score / classification_report raise a ValueError when given continuous
# predictions, so round predictions (and targets) to whole-star labels first.
y_train_labels = np.rint(y_train).astype(int)
y_test_labels = np.rint(y_test).astype(int)
y_pred_train_labels = np.rint(y_pred_train).astype(int)
y_pred_test_labels = np.rint(y_pred_test).astype(int)

print('Accuracy on training data =', metrics.accuracy_score(y_train_labels, y_pred_train_labels))
print('Accuracy on testing data =', metrics.accuracy_score(y_test_labels, y_pred_test_labels))
print('')
print(metrics.classification_report(y_test_labels, y_pred_test_labels, zero_division=0))

In [None]:
# How often each rating value occurs, most frequent first.
rating_distribution = df_users['rating'].value_counts()
rating_distribution

In [None]:
# Raw recipe table, loaded here to look up recipe names for the recommendations.
full_recipes_csv_path = 'Data/RAW_recipes.csv'
df_recipes_full = pd.read_csv(full_recipes_csv_path)
# Note that there are no duplicates
df_recipes_full.head()

In [None]:
# Drop every column that is not needed for the name lookup and rename the
# recipe id to 'item' so it lines up with the ratings table.
unused_raw_columns = [
    'minutes', 'contributor_id', 'submitted', 'tags',
    'nutrition', 'steps', 'ingredients', 'description',
]
df_recipes_full = (
    df_recipes_full
    .drop(columns=unused_raw_columns)
    .rename(columns={'id': 'item'})
)

df_recipes_full.head()

In [None]:
users_ratings = df_users.groupby(['user']).count()  # count the ratings for each user
selected = users_ratings['rating'] > 30  # keep only users with MORE than 30 ratings
selected_users = users_ratings.loc[selected]
# Draw the 10 group members with a fixed seed so the group (and every
# recommendation derived from it) is reproducible.
random_selected = selected_users.sample(n=10, random_state=42)

# 'user' is the groupby key and therefore the index; reset_index() turns it back
# into a regular column that can be selected by name.
select_column_df = random_selected.reset_index()['user']
group_users = list(select_column_df)  # plain list of the sampled user ids
print(group_users)

In [None]:
# Minimum number of ratings a recipe needs to be considered. It is also used in
# the printed message, which previously claimed "at least 20" while filtering on 30.
MIN_RATINGS_PER_RECIPE = 30

group_ratings = df_users.loc[df_users['user'].isin(group_users)]
# The exploded recipe frame repeats index labels; the set collapses them again.
total_recipes = set(df_recipes.index.tolist())
num_ratings_df = df_users.groupby(['item']).count()
considered_recipes = set(num_ratings_df.index[num_ratings_df['user'] >= MIN_RATINGS_PER_RECIPE])

group_seen_recipes = set(group_ratings['item'].tolist())
group_unseen_recipes = considered_recipes - group_seen_recipes

print('Total amount of recipes,', len(total_recipes))
print(f'Recipes that have at least {MIN_RATINGS_PER_RECIPE} ratings,', len(considered_recipes))
print('Recipes that have been rated by the currently selected group,', len(group_seen_recipes))
print('New recipes that the group didnt try yet,', len(group_unseen_recipes))

In [None]:
# `display` lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import display
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

user_user = UserUser(15, min_nbrs=3)  # Maximum (15) and minimum (3) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(df_users)

# Score every (group member, unseen recipe) pair.
candidate_pairs = list(itertools.product(group_users, group_unseen_recipes))
group_unseen_df = pd.DataFrame(candidate_pairs, columns=['user', 'item'])
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
# remove the recipes we couldn't get a prediction for
group_unseen_df = group_unseen_df.dropna(subset=['predicted_rating'])
display(group_unseen_df.head(10))

In [None]:
# Min-Max normalization of predicted_ratings

maxVal = group_unseen_df['predicted_rating'].max()
minVal = group_unseen_df['predicted_rating'].min()
rating_range = maxVal - minVal

if rating_range > 0:
    group_unseen_df['predicted_rating'] = (group_unseen_df['predicted_rating'] - minVal) / rating_range  # Normalized to 0 - 1 scale
    group_unseen_df['predicted_rating'] *= 5  # Normalized to 0 - 5 scale
else:
    # All predictions are identical (or there are none): min-max scaling would
    # divide by zero and turn every value into NaN, so the predictions are left untouched.
    print('Skipping min-max normalization: predicted ratings have no spread')

display(group_unseen_df.head(10))

#### Least Misery strategy

In [None]:
# Recipe names keyed by recipe id. DataFrame.join(..., on='item') matches the
# 'item' column against the INDEX of the other object, so the names must be
# indexed by item; joining the bare 'name' column matched recipe ids against
# row positions and attached the wrong names.
recipe_names = df_recipes_full.drop_duplicates(subset='item').set_index('item')['name']

# Least misery: a recipe is only as good as its lowest predicted rating in the group.
least_misery_df = group_unseen_df.groupby(['item'])['predicted_rating'].min().reset_index()
least_misery_df = least_misery_df.join(recipe_names, on='item')
items_lm = least_misery_df['item'].copy()
healthy_lm = ApplyHealthFilter(items_lm)

# Vectorized membership test instead of a per-row `x in list` scan.
least_misery_df['healthy'] = least_misery_df['item'].isin(healthy_lm).astype(int)
least_misery_df = least_misery_df.sort_values(by="predicted_rating", ascending=False)[
    ['item', 'predicted_rating', 'name', 'healthy']]

least_misery_df = least_misery_df[least_misery_df.healthy == 1]
display(least_misery_df.head(10))

#### Most Pleasure strategy

In [None]:
# Recipe names keyed by recipe id. DataFrame.join(..., on='item') matches the
# 'item' column against the INDEX of the other object, so the names must be
# indexed by item; joining the bare 'name' column matched recipe ids against
# row positions and attached the wrong names.
recipe_names = df_recipes_full.drop_duplicates(subset='item').set_index('item')['name']

# Most pleasure: a recipe is as good as its highest predicted rating in the group.
most_pleasure_df = group_unseen_df.groupby(['item'])['predicted_rating'].max().reset_index()
items_mp = most_pleasure_df['item'].copy()
healthy_mp = ApplyHealthFilter(items_mp)

# Vectorized membership test instead of a per-row `x in list` scan.
most_pleasure_df['healthy'] = most_pleasure_df['item'].isin(healthy_mp).astype(int)

most_pleasure_df = most_pleasure_df.join(recipe_names, on='item')
most_pleasure_df = most_pleasure_df.sort_values(by="predicted_rating", ascending=False).reset_index(drop=True)[
    ['item', 'predicted_rating', 'name', 'healthy']]

most_pleasure_df = most_pleasure_df[most_pleasure_df.healthy == 1]

display(most_pleasure_df.head(10))

#### Approval Voting

In [None]:
# Recipe names keyed by recipe id. DataFrame.join(..., on='item') matches the
# 'item' column against the INDEX of the other object, so the names must be
# indexed by item; joining the bare 'name' column matched recipe ids against
# row positions and attached the wrong names.
recipe_names = df_recipes_full.drop_duplicates(subset='item').set_index('item')['name']

# A group member "approves" a recipe when the predicted rating exceeds 3.5.
group_unseen_temp_df = group_unseen_df.copy()
group_unseen_temp_df['voted'] = (group_unseen_temp_df['predicted_rating'] > 3.5).astype(int)

# Aggregate only the rating and vote columns; summing user ids is meaningless.
approval_df = group_unseen_temp_df.groupby(['item'])[['predicted_rating', 'voted']].sum()
approval_df['predicted_rating'] /= len(random_selected)  # Normalize rating by the group size
# Only keep the items with maximum approval
approval_df = approval_df[approval_df.voted == approval_df.voted.max()]
# Get the best rated items with max approval
approval_df = approval_df.sort_values(by="predicted_rating", ascending=False).reset_index()
approval_df = approval_df.join(recipe_names, on='item')
display(approval_df.head(10))