In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from ast import literal_eval


from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

We load in our datasets, consisting of user data and data for the recipes. We will be using the ratings that users gave in combination with the recipe's ingredients, techniques and calories to perform recommendations

In [75]:
df_users = pd.read_csv('Data/PP_users.csv')
#Note that there are no duplicates
df_users.head()

# u = user_id, 
# techniques = techniques used for items  that were interacted with(index is a technique with the number being a counter),
# items = item_ids of items that were interacted with, 
# n_items = number of items reviewed, 
# ratings = ratings for items reviewed, 
# n_ratings = number of ratings

number_of_users_with_less_than_10_reviews = df_users[df_users["n_ratings"] < 10].shape[0]
number_of_users_with_less_than_5_reviews = df_users[df_users["n_ratings"] < 5].shape[0]
number_of_users = len(df_users.index)

print(str(round((number_of_users_with_less_than_10_reviews / number_of_users) * 100,
                1)) + "% of user have less than 10 reviews")
print(str(round((number_of_users_with_less_than_5_reviews / number_of_users) * 100,
                1)) + "% of user have less than 5 reviews")


62.9% of user have less than 10 reviews
39.9% of user have less than 5 reviews


In [76]:
df_recipes = pd.read_csv('Data/PP_recipes.csv')
#Note that there are no duplicates
df_recipes.head()

# id = recipe_id, i = Recipe ID mapped to contiguous integers from 0, 
# name_tokes = BPE-tokenized recipe name,
# ingredient_tokens = BPE-tokenized ingredients list (list of lists), 
# steps_tokens = BPE-tokenized steps, 
# techniques = List of techniques used in recipe,
# calorie_level = either a 0, 1 or 2 indicating how much calories it contains,
# ingredient_ids = the ids of the ingredients used

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [77]:
df_users = pd.read_csv('Data/PP_users.csv')
df_users.drop('techniques', axis=1, inplace=True)
df_users.drop('n_items', axis=1, inplace=True)
df_users.drop('n_ratings', axis=1, inplace=True)

df_users = df_users.rename(columns={'u': 'user', 'items': 'item'})

df_users.head()

Unnamed: 0,user,item,ratings
0,0,"[1118, 27680, 32541, 137353, 16428, 28815, 658...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
1,1,"[122140, 77036, 156817, 76957, 68818, 155600, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,2,"[168054, 87218, 35731, 1, 20475, 9039, 124834,...","[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ..."
3,3,"[163193, 156352, 102888, 19914, 169438, 55772,...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ..."
4,4,"[72857, 38652, 160427, 55772, 119999, 141777, ...","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ..."


In [78]:
# Copied from previous cells for debugging uses
df_users = pd.read_csv('Data/PP_users.csv')
df_users.drop('techniques', axis=1, inplace=True)
df_users.drop('n_items', axis=1, inplace=True)
df_users.drop('n_ratings', axis=1, inplace=True)

df_users = df_users.rename(columns={'u': 'user', 'items': 'item', 'ratings': 'rating'})
#

# Needed to make the explode function, source: https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
df_users['rating'] = df_users['rating'].apply(literal_eval)
df_users['item'] = df_users['item'].apply(literal_eval)

df_users = df_users.explode(['rating', 'item'], ignore_index=True)
df_users.head()

ValueError: column must be a scalar

In [37]:
df_recipes.drop('i', axis=1, inplace=True)
df_recipes.drop('name_tokens', axis=1, inplace=True)
df_recipes.drop('ingredient_tokens', axis=1, inplace=True)
df_recipes.drop('steps_tokens', axis=1, inplace=True)

df_recipes.head()

  df_recipes.drop('i', 1, inplace=True)
  df_recipes.drop('name_tokens', 1, inplace=True)
  df_recipes.drop('ingredient_tokens', 1, inplace=True)
  df_recipes.drop('steps_tokens', 1, inplace=True)


Unnamed: 0,id,techniques,calorie_level,ingredient_ids
0,424415,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [38]:
# Copied from previous cells for debugging uses
df_recipes = pd.read_csv('Data/PP_recipes.csv')

df_recipes.drop('i', axis=1, inplace=True)
df_recipes.drop('name_tokens', axis=1, inplace=True)
df_recipes.drop('ingredient_tokens', axis=1, inplace=True)
df_recipes.drop('steps_tokens', axis=1, inplace=True)
#

# Needed to make the explode function, source: https://stackoverflow.com/questions/63472664/pandas-explode-function-not-working-for-list-of-string-column
df_recipes['techniques'] = df_recipes['techniques'].apply(literal_eval)
df_recipes['ingredient_ids'] = df_recipes['ingredient_ids'].apply(literal_eval)

df_recipes = df_recipes.explode('techniques')
df_recipes = df_recipes.explode('ingredient_ids')
df_recipes.head()

Unnamed: 0,id,techniques,calorie_level,ingredient_ids
0,424415,0,0,389
0,424415,0,0,7655
0,424415,0,0,6270
0,424415,0,0,1527
0,424415,0,0,3406


TODO: use algorithm, verify results

## 1. Individual Recommendations

In [58]:
df_random = df_users.sample(n=10000)

Unnamed: 0,user,item
55,55,127979
373,373,150
1072,1072,76716
377,377,99781
3300,3300,11814
...,...,...
423,423,77798
190,190,70424
2725,2725,145411
172,172,160310


In [93]:
print(df_users)
print(df_recipes)

X = df_random.drop('ratings', axis=1) # training data
y = df_random['ratings'] # target values
# TODO take into account df_recipes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) # 67% training, 33% testing

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train) # y_pred_train predicts X_train
y_pred_test = knn.predict(X_test)

#y_pred_train = np.rint(y_pred_train)
#y_pred_test = np.rint(y_pred_test)

#print(X_train)
#print((y_pred_train))
#print((np.array(y_train.to_list())))

# TODO compare expected outputs with actual outputs
print(y_pred_test) # why float value ??
print(y_test)

# TODO solve
print('Accuracy on training data =', metrics.accuracy_score(np.array(y_train.to_list()), y_pred_train))
print('Accuracy on testing data =', metrics.accuracy_score(np.array(y_test.to_list()), y_pred_test))
print('')
print(metrics.classification_report(np.array(y_test.to_list()), y_pred_test))

        user    item ratings
0          0    1118       5
0          0   27680       5
0          0   32541       5
0          0  137353       5
0          0   16428       5
...      ...     ...     ...
25074  25074  159896       5
25075  25075  166686       4
25075  25075  157084       4
25075  25075  166686       4
25075  25075  157084       4

[397169391 rows x 3 columns]
            id techniques  calorie_level ingredient_ids
0       424415          0              0            389
0       424415          0              0           7655
0       424415          0              0           6270
0       424415          0              0           1527
0       424415          0              0           3406
...        ...        ...            ...            ...
178264  263840          0              0            335
178264  263840          0              0           1563
178264  263840          0              0           1511
178264  263840          0              0           3248
178264

ValueError: Classification metrics can't handle a mix of multiclass and unknown targets

In [46]:
df_users['ratings'].value_counts()

5.0    316545991
4.0     65490132
3.0      8813273
0.0      4712449
2.0      1249602
1.0       357944
Name: ratings, dtype: int64

## 2. Group Recommendations

In [39]:
users_ratings = df_users.groupby(['user']).count()  # count the ratings for each user
selected = users_ratings['rating'] > 30  # keep only 30 + ratings
selected_users = users_ratings.loc[selected]
random_selected = selected_users.sample(n=10)

select_column_df = random_selected.reset_index()[
    'user']  # reset_index() create a new index, and the userId became a column. Then, we can filter using the column name
group_users = list(
    select_column_df)  # iloc select by index, since our dataframe only has one row we read it from the index 0
print(group_users)

[3414, 3682, 10025, 2327, 6937, 2981, 3023, 715, 2118, 1826]


In [40]:
group_ratings = df_users.loc[df_users['user'].isin(group_users)]
total_recipes = set(df_recipes.index.tolist())
num_ratings_df = df_users.groupby(['item']).count()
considered_recipes = set(num_ratings_df.loc[num_ratings_df['user'] >= 30].reset_index()['item'])

group_seen_recipes = set(group_ratings['item'].tolist())
group_unseen_recipes = considered_recipes - group_seen_recipes

print('Total amount of recipes,', len(total_recipes))
print('Recipes that have at least 20 ratings,', len(considered_recipes))
print('Recipes that have been rated by the currently selected group,', len(group_seen_recipes))
print('New recipes that the group didnt try yet,', len(group_unseen_recipes))

Total amount of recipes, 178265
Recipes that have at least 20 ratings, 2456
Recipes that have been rated by the currently selected group, 766
New recipes that the group didnt try yet, 2240


In [41]:
from IPython.core.display import display
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (12) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(df_users)
group_unseen_df = pd.DataFrame(list(itertools.product(group_users, group_unseen_recipes)), columns=['user', 'item'])
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
group_unseen_df = group_unseen_df[
    group_unseen_df['predicted_rating'].notna()]  # remove the recipes we couldn't get a prediction for
display(group_unseen_df.head(10))

Unnamed: 0,user,item,predicted_rating
0,3414,73729,4.426717
1,3414,172034,4.768642
2,3414,172037,4.684071
4,3414,32780,4.624956
5,3414,155665,4.715182
7,3414,106520,4.875091
9,3414,90144,4.330455
11,3414,65578,4.790447
12,3414,90160,2.92449
14,3414,139315,4.008952


In [42]:
#Min-Max normalization of predicted_ratings

maxVal = group_unseen_df['predicted_rating'].max()
minVal = group_unseen_df['predicted_rating'].min()
group_unseen_df['predicted_rating'] = (group_unseen_df['predicted_rating'] - minVal) / (
            maxVal - minVal)  # Normalized to 0 - 1 scale
group_unseen_df['predicted_rating'] *= 5  # Normalized to 0 - 5 scale

display(group_unseen_df.head(10))

Unnamed: 0,user,item,predicted_rating
0,3414,73729,3.130235
1,3414,172034,3.387293
2,3414,172037,3.323712
4,3414,32780,3.27927
5,3414,155665,3.347102
7,3414,106520,3.46732
9,3414,90144,3.057865
11,3414,65578,3.403685
12,3414,90160,2.000868
14,3414,139315,2.816161


#### Least Misery strategy

In [43]:
least_misery_df = group_unseen_df.groupby('item').min()
# TODO: Find name of recipe from the RAW data
least_misery_df = least_misery_df.sort_values(by="predicted_rating", ascending=False).reset_index()[
    ['item', 'predicted_rating']]
display(least_misery_df.head(10))

Unnamed: 0,item,predicted_rating
0,80096,3.706521
1,112305,3.680464
2,118496,3.633306
3,114447,3.615273
4,128535,3.606302
5,172853,3.6051
6,33272,3.585411
7,165260,3.577748
8,15775,3.573277
9,133911,3.560739


#### Most Pleasure strategy

In [44]:
most_pleasure_df = group_unseen_df.groupby('item').max()
# TODO: Find name of recipe from the RAW data
most_pleasure_df = most_pleasure_df.sort_values(by="predicted_rating", ascending=False).reset_index()[
    ['item', 'predicted_rating']]
display(most_pleasure_df.head(10))

Unnamed: 0,item,predicted_rating
0,15728,5.0
1,170188,4.349244
2,156278,4.205091
3,125296,4.119584
4,62117,4.06509
5,166665,4.041326
6,65759,4.034062
7,38699,4.033526
8,4827,4.020633
9,136059,3.98972


#### Approval Voting

In [45]:
group_unseen_temp_df = group_unseen_df.copy()
group_unseen_temp_df['voted'] = group_unseen_temp_df['predicted_rating'].apply(lambda x: 1 if x > 3.5 else 0)
approval_df = group_unseen_temp_df.groupby('item').sum()
approval_df = approval_df.sort_values(by="voted", ascending=False)
approval_df.drop('user', axis=1, inplace=True)
approval_df['predicted_rating'] /= len(random_selected) # Normalize rating
display(approval_df.head(10))

Unnamed: 0_level_0,predicted_rating,voted
item,Unnamed: 1_level_1,Unnamed: 2_level_1
151474,3.461494,5
36207,3.349747,4
28688,2.081879,4
153273,3.384419,4
882,3.295421,4
99885,3.459514,4
60561,3.387407,4
47837,3.380006,3
89947,1.724844,3
15728,3.288484,3
