In [1]:
import numpy as np
import csv
from scipy import sparse
import pickle
import pandas as pd
import json

Shell commands for downloading and then unzipping the data files (run these in a terminal from the directory that holds this notebook):
~~~
pip install kaggle
# Supply your own Kaggle API credentials here. Never commit a real key to the
# notebook; any key that has already been published should be revoked.
export KAGGLE_USERNAME="<your-kaggle-username>"
export KAGGLE_KEY="<your-kaggle-api-key>"
kaggle datasets download -d shuyangli94/food-com-recipes-and-user-interactions
unzip food-com-recipes-and-user-interactions.zip
~~~

Clean the recipes dataset, extracting the items that we want

In [33]:
#save map from ingredients id to ingredients
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only load
# 'ingr_map.pkl' from a source you trust (presumably the downloaded dataset).
with open('ingr_map.pkl', 'rb') as map_file:
    maps = pickle.load(map_file)

# One id -> 'replaced' entry per row; if an id occurs on several rows the last
# row wins, exactly as in a row-by-row dict build.
ing_decoder = dict(zip(maps['id'], maps['replaced']))

# Context manager guarantees the pickle is flushed and the handle closed.
with open("../data-cleaned/ingredient_decoder.pkl", "wb") as decoder_file:
    pickle.dump(ing_decoder, decoder_file)

# Load the pre-processed and the raw recipe tables, then join them on the
# shared 'id' column so each row carries both sets of fields.
pp_recipes = pd.read_csv('PP_recipes.csv')
raw_recipes = pd.read_csv('RAW_recipes.csv')
recipes = pp_recipes.merge(raw_recipes, on='id')

#clean and get tags all ready
# Each entry of raw_recipes.tags is a string holding a list of tags. Single quotes
# are swapped for double quotes so that json can parse it.
# NOTE: a tag that itself contains a quote character would break this parse.
tags = set()
for tag_list in raw_recipes.tags:
    tags.update(json.loads(tag_list.replace("'", '"')))

# Sort so that the tag <-> index assignment is deterministic between runs.
tags = sorted(tags)
tag_encoder = {tag: idx for idx, tag in enumerate(tags)}
tag_decoder = dict(enumerate(tags))

# Context manager guarantees the pickle is flushed and the handle closed.
with open("../data-cleaned/tag_decoder.pkl", "wb") as decoder_file:
    pickle.dump(tag_decoder, decoder_file)

#also make a recipe decoder
# Maps each recipe's 'i' value to its 'name'; built column-wise instead of with
# iterrows(), which constructs a Series per row and is much slower.
recipe_decoder = dict(zip(recipes['i'], recipes['name']))

# Context manager guarantees the pickle is flushed and the handle closed.
with open("../data-cleaned/recipe_decoder.pkl", "wb") as decoder_file:
    pickle.dump(recipe_decoder, decoder_file)

# Sizes of each section of the recipe feature matrix.
num_rec = len(recipes)
# Ingredient ids and tag indices are used directly as zero-based column offsets
# when the matrix is built, so each section needs (largest index + 1) columns.
# Using the largest index itself makes the highest ingredient id share a column
# with tag 0, and the last tag share a column with the minutes feature.
num_ing = max(ing_decoder.keys()) + 1
num_tags = len(tag_decoder)
num_others = 2 #calories and minutes

#iterate through, saving the tags and ingredients
# row_recipe / column_item are parallel coordinate lists: one entry per
# (recipe, feature) pair that should be nonzero in the matrix.
row_recipe = []
column_item = []
for rec_idx, ingredient_ids, tag_list in zip(recipes['i'], recipes['ingredient_ids'], recipes['tags']):
    #get all ingredients (column = ingredient id)
    for ing_id in json.loads(ingredient_ids):
        row_recipe.append(rec_idx)
        column_item.append(ing_id)
    #get all tags (column = tag index shifted past the ingredient section)
    for tag in json.loads(tag_list.replace("'", '"')):
        row_recipe.append(rec_idx)
        column_item.append(tag_encoder[tag] + num_ing)

#one of each ingredient and tag added
counts = [1]*len(row_recipe)

#add in minutes
row_recipe += list(recipes['i'])
column_item += [num_ing+num_tags]*len(recipes)
counts += list(recipes["minutes"])

#add in calories (taken as the first entry of each 'nutrition' list)
row_recipe += list(recipes['i'])
column_item += [num_ing+num_tags+1]*len(recipes)
counts += [json.loads(nutrition)[0] for nutrition in recipes['nutrition']]

# np.float64 replaces the np.float alias, which was removed from NumPy and now
# raises AttributeError; the resulting dtype is the same 64-bit float.
X = sparse.csr_matrix(
    (counts, (row_recipe, column_item)),
    shape=(num_rec, num_ing + num_tags + num_others),
    dtype=np.float64,
)

sparse.save_npz('../data-cleaned/recipes.npz', X)

Now we go through the users dataset, extracting what we need

In [8]:
# Read the three interaction splits from disk.
user_train = pd.read_csv('interactions_train.csv')
user_valid = pd.read_csv('interactions_validation.csv')
user_test  = pd.read_csv('interactions_test.csv')

# One more than the largest 'u' value found in any of the three splits.
num_user = 1 + max(split['u'].max() for split in (user_train, user_valid, user_test))

#TRAIN SET save the user number, recipe and rating
# Rows are indexed by 'u', columns by 'i', values are the ratings.
row_user = list(user_train['u'])
column_rec = list(user_train['i'])
rating = list(user_train['rating'])

#make and save
# num_rec comes from the recipes cell, which must have been run first.
# np.float64 replaces the np.float alias, which was removed from NumPy and now
# raises AttributeError; the resulting dtype is the same 64-bit float.
X = sparse.csr_matrix((rating, (row_user, column_rec)), shape=(num_user, num_rec), dtype=np.float64)
sparse.save_npz('../data-cleaned/user_train.npz', X)

#VALIDATION + TEST SETS: one row per interaction, holding the rating and the user number
temp = pd.concat((user_valid, user_test))
n_eval = len(temp)
# Every row index appears twice: once for the rating entry, once for the user entry.
row_user = list(np.arange(n_eval)) * 2
# The rating goes in the column given by the interaction's 'i'; the user number
# goes in column 0.
# NOTE(review): the shape reserves num_rec+1 columns, yet the recipe columns are
# not shifted, so an interaction with i == 0 shares column 0 with the user number
# (duplicate coordinates are summed). Possibly the recipe columns were meant to
# be i+1 -- confirm before changing, since that would alter the saved layout.
column_rec = list(temp['i']) + [0]*n_eval
rating = list(temp['rating']) + list(temp['u'])

#make and save
# num_rec comes from the recipes cell, which must have been run first.
# np.float64 replaces the np.float alias, which was removed from NumPy and now
# raises AttributeError; the resulting dtype is the same 64-bit float.
X = sparse.csr_matrix((rating, (row_user, column_rec)), shape=(n_eval, num_rec+1), dtype=np.float64)
sparse.save_npz('../data-cleaned/user_test.npz', X)

### TODO: Check User Arrays to make sure they seem right!

### How to decode tags from array

In [60]:
# Reload the saved recipe feature matrix and list the tags stored for row 23.
# Relies on num_ing and tag_decoder from the recipes cell being in the kernel.
X = sparse.load_npz("../data-cleaned/recipes.npz")
# Column indices of the nonzero entries in that row.
nonzero_cols = X[23].nonzero()[1]
# Keep columns strictly above num_ing, drop the last two of those (presumably the
# minutes and calories entries), then shift down to tag indices.
# NOTE(review): the strict '>' skips column num_ing itself, and the [:-1] below
# drops one further entry -- confirm both against how the matrix was built.
tags = nonzero_cols[nonzero_cols > num_ing][:-2] - num_ing
[tag_decoder[i] for i in tags[:-1]]

['4-hours-or-less',
 '5-ingredients-or-less',
 'asian',
 'course',
 'cuisine',
 'dietary',
 'easy',
 'indian',
 'main-ingredient',
 'occasion',
 'pasta-rice-and-grains',
 'preparation',
 'rice',
 'side-dishes',
 'time-to-make',
 'vegan',
 'vegetarian']