In [2]:
import numpy as np
import csv
from scipy import sparse
import pickle
import pandas as pd
import json

Code for downloading then unzipping the data files
~~~
pip install kaggle
export KAGGLE_USERNAME=eastonpotokar
export KAGGLE_KEY=1bfd140356aae8ffd62b234617295b77
kaggle datasets download -d shuyangli94/food-com-recipes-and-user-interactions
unzip food-com-recipes-and-user-interactions.zip
~~~

Clean out the Recipes Dataset, extracting the items that we want

In [4]:
#save map from ingredients id to ingredients
maps = pickle.load( open('ingr_map.pkl', 'rb') )
ing_decoder = {i['id']: i['replaced'] for index, i in maps.iterrows()}
pickle.dump(ing_decoder, open("../data-cleaned/ingredient_decoder.pkl", "wb"))

#we put recipes into similar format as our dataset
pp_recipes = pd.read_csv('PP_recipes.csv')
raw_recipes = pd.read_csv('RAW_recipes.csv')
recipes = pd.merge(pp_recipes, raw_recipes, on='id')

#clean and get tags all ready
tags = set()
for i in raw_recipes.tags:
    tags.update( json.loads(i.replace("'", '"')) )
    
tags = sorted(list(tags))
tag_encoder = {tag: i for i, tag in enumerate(tags)}
tag_decoder = {i: tag for i, tag in enumerate(tags)}
pickle.dump(tag_decoder, open("../data-cleaned/tag_decoder.pkl", "wb"))

#also make a recipe decoder
recipe_decoder = {i['i']: i['name'] for _, i in recipes.iterrows()}
pickle.dump(recipe_decoder, open("../data-cleaned/recipe_decoder.pkl", "wb"))

#get the number of everything we're doing
num_rec = len(list(recipes['i']))
num_ing = max(ing_decoder.keys())+1
num_tags = max(tag_decoder.keys())+1
num_others = 1 #calories

In [None]:
#arrays to use for calculating tf-idf
row_sum = np.zeros(num_rec)
col_sum = np.zeros(num_ing+num_tags)

#iterate through, saving the tags and ingredients
row_recipe = []
column_item = []
for _, recipe in recipes.iterrows():
    #get all ingredients
    for i in json.loads(recipe['ingredient_ids']):
        row_recipe.append(recipe['i'])
        column_item.append(i)
        row_sum[recipe['i']] += 1
        col_sum[i] += 1
    #get all tags
    for j in json.loads(recipe['tags'].replace("'", '"')):
        row_recipe.append(recipe['i'])
        column_item.append(tag_encoder[j] + num_ing)
        row_sum[recipe['i']] += 1
        col_sum[tag_encoder[j] + num_ing] += 1

#one of each ingredient and tag added
counts = [1]*len(row_recipe)

#### MINUTES seemed very noisy - information also included in tags. we remove them b/c of it
#add in minutes
# row_recipe += list(recipes['i'])
# column_item += [num_ing+num_tags]*len(list(recipes.index))
# counts += list(recipes["minutes"])

### CALORIES also very noisy - we use "calorie level" instead
#add in calories
row_recipe += list(recipes['i'])
column_item += [num_ing+num_tags]*len(list(recipes.index))
counts += list(recipes['calorie_level'])
# counts += [json.loads(i)[0] for i in list(recipes['nutrition'])]

#save unscaled matrix
X = sparse.csr_matrix((counts, [row_recipe, column_item]), shape=(num_rec, num_ing + num_tags + num_others), dtype=np.float)
sparse.save_npz('../data-cleaned/recipes.npz', X)

#do the scaling
for i, (row, col) in enumerate(zip(row_recipe, column_item)):
    #don't rescale the calories/minutes yet
    if col == num_ing+num_tags+1 or col == num_ing+num_tags:
        continue
    counts[i] = (counts[i] / row_sum[row])*np.log(num_rec/col_sum[col])
    
#save scaled matrix
Xhat = sparse.csr_matrix((counts, [row_recipe, column_item]), shape=(num_rec, num_ing + num_tags + num_others), dtype=np.float)
sparse.save_npz('../data-cleaned/recipes_tfidf.npz', Xhat)

Now we go through the users dataset, extracting what we need

In [64]:
#first we'll load all the data
user_train = pd.read_csv('interactions_train.csv')
user_valid = pd.read_csv('interactions_validation.csv')
user_test  = pd.read_csv('interactions_test.csv')

#TRAIN SET save the user number, recipe and rating
num_user = max(user_train['u'].max(), user_valid['u'].max(), user_test['u'].max())+1
row_user = list(user_train['u'])
column_rec = list(user_train['i'])
rating = np.array(list(user_train['rating']))
rating[rating==0] = 1
rating = list(rating)

#make and save
X = sparse.csr_matrix((rating, [row_user, column_rec]), shape=(num_user, num_rec), dtype=np.float)
sparse.save_npz('../data-cleaned/user_train.npz', X)


#sum rows and columns
row_sum = np.squeeze( np.array(X.sum(axis=1)) )
col_sum = np.squeeze( np.array(X.sum(axis=0)) )

#do the scaling
for i, (row, col) in enumerate(zip(row_user, column_rec)):
    if rating[i] != 0:
        rating[i] = (rating[i] / row_sum[row])*np.log(num_user/col_sum[col])

#save scaled matrix
Xhat = sparse.csr_matrix((rating, [row_user, column_rec]), shape=(num_user, num_rec), dtype=np.float)
sparse.save_npz('../data-cleaned/user_train_tfidf.npz', Xhat)


#TRAIN SET save the user number, recipe and rating
temp = pd.concat((user_valid, user_test))
row_user = list(temp['u']) + list(temp['u'].unique())
column_rec = list(temp['i']) + [num_rec]*len(temp['u'].unique())
rating_temp = np.array(list(temp['rating']))
rating_temp[rating_temp==0] = 1
rating = list(rating_temp) + list(temp['u'].unique()) 

#make and save
X = sparse.csr_matrix((rating, [row_user, column_rec]), shape=(num_user, num_rec+1), dtype=np.float)

#remove nonzero rows
X = X[X.getnnz(1)>0]
sparse.save_npz('../data-cleaned/user_test.npz', X)

### Train User Set seems good!

In [4]:
U = sparse.load_npz("../data-cleaned/user_train.npz")
U[22095].nonzero()[1]

array([ 44367,  87844, 101723, 134551], dtype=int32)

In [5]:
U[22095, 44367], U[22095, 87844]

(5.0, 5.0)

In [6]:
recipe_decoder[44367], recipe_decoder[87844]

('flank steak with lime chipotle sauce', 'greek stuffed meatloaf')

### User Test set also seems good!

In [7]:
U = sparse.load_npz("../data-cleaned/user_test.npz")
print(U[0].nonzero()[1])
print(U[0, 173538], U[0, 178265])

[173538 178265]
4.0 2.0


### How to decode tags from array

In [8]:
X = sparse.load_npz("../data-cleaned/recipes.npz")
tags = X[23].nonzero()[1]
tags = tags[tags>num_ing][:-2] - num_ing
[tag_decoder[i] for i in tags[:-1]]

['4-hours-or-less',
 '5-ingredients-or-less',
 'asian',
 'course',
 'cuisine',
 'dietary',
 'easy',
 'indian',
 'main-ingredient',
 'occasion',
 'pasta-rice-and-grains',
 'preparation',
 'rice',
 'side-dishes',
 'time-to-make',
 'vegan',
 'vegetarian']