This notebook creates the JSON files that our algorithm needs.
If you change any of the data files (the CSVs under `data/`), you will need to run this notebook again to regenerate the JSON files.

In [2]:
import csv
import json
import data_processing



In [3]:
# Load every recipe as a dict keyed by the CSV header names.
# newline='' is what the csv module asks for: without it, line breaks that
# appear inside quoted fields are not interpreted correctly.
with open('data/recipes.csv', 'r', newline='') as f:
    recipes = list(csv.DictReader(f))

# Load the URL table as plain rows, dropping the header row.
with open("data/recipes_urls.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
    urls = list(reader)

# Map integer recipe id (column 0) -> url (column 2).
# Blank lines are returned by csv.reader as empty lists; skip them so they
# cannot crash the int() conversion.
id_to_url = {int(row[0]): row[2] for row in urls if row}

In [4]:
import random

# Sampling parameters. The seed is fixed so that re-running the notebook
# selects the same recipes and therefore regenerates the same JSON files.
SAMPLE_SIZE = 30000
SAMPLE_SEED = 42

# Filter out recipes with no images
recipes_with_images = [recipe for recipe in recipes if recipe['Images'] != "character(0)"]

# Of those, also filter out recipes with no aggregated rating ("NA")
recipes_filtered = [recipe for recipe in recipes_with_images if recipe['AggregatedRating'] != "NA"]

# Randomly sample up to SAMPLE_SIZE recipes. random.sample raises ValueError
# when asked for more items than the population holds, so cap the request.
sample_size = min(SAMPLE_SIZE, len(recipes_filtered))
recipes_sampled = random.Random(SAMPLE_SEED).sample(recipes_filtered, sample_size)

In [5]:
# Build the search structures from the sampled recipes with the
# data_processing helpers: the inverted index, the id -> recipe mapping,
# the idf values and the per-recipe norms.
inv_idx = data_processing.build_inverted_index(recipes_sampled)
id_to_recipe = data_processing.build_id_to_recipe(recipes_sampled)

# idf is derived from the index plus the number of sampled recipes;
# the norms are derived from the index plus idf.
num_recipes = len(recipes_sampled)
idf = data_processing.build_idf(inv_idx, num_recipes)
recipe_norms = data_processing.build_recipe_norms(inv_idx, idf)

In [6]:
# Attach each recipe's URL, looked up in id_to_url by recipe id.
# A recipe id that is absent from id_to_url raises KeyError.
for rid in id_to_recipe:
    id_to_recipe[rid]['Url'] = id_to_url[rid]

In [7]:
# Write each pre-computed structure to its own JSON file under data/.
# The files are written in the order listed.
precomputed_outputs = (
    ('data/inv_idx.json', inv_idx),
    ('data/idf.json', idf),
    ('data/recipe_norms.json', recipe_norms),
    ('data/id_to_recipe.json', id_to_recipe),
)

for out_path, payload in precomputed_outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)

In [8]:
# Export the first rows of the URL table as a JSON list of objects
# (one dict per CSV row, keyed by the CSV header names).
MAX_URL_ROWS = 10000

# newline='' is what the csv module asks for: without it, line breaks that
# appear inside quoted fields are not interpreted correctly.
with open('data/recipes_urls.csv', 'r', newline='') as f:
    recipes_urls = list(csv.DictReader(f))

# Keep only the first MAX_URL_ROWS rows.
recipes_urls = recipes_urls[:MAX_URL_ROWS]

# NOTE: the output name is "recipe_urls.json" (singular "recipe"), unlike the
# input CSV. It is left unchanged because other code may load this exact path.
with open('data/recipe_urls.json', 'w') as f:
    json.dump(recipes_urls, f)

# Added for SVD

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# One text document per recipe: ingredients joined by spaces, followed by the
# description and the instructions. Documents follow id_to_recipe's iteration
# order, so row i of the matrices below belongs to the i-th recipe in that
# order. The comprehension avoids leaking a loop variable named `id`, which
# would shadow the builtin for the rest of the notebook.
corpus = [
    " ".join(recipe["ingredients"])
    + " "
    + recipe["description"]
    + " "
    + recipe["instructions"]
    for recipe in id_to_recipe.values()
]

# TF-IDF weighted term matrix with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(corpus)

# Reduce each recipe to n_components latent dimensions. random_state is fixed
# because TruncatedSVD's default solver is randomized; without it the output
# differs from run to run.
n_components = 150
svd = TruncatedSVD(n_components=n_components, random_state=0)
U = svd.fit_transform(tfidf_matrix)

In [13]:
# Save the reduced recipe matrix as nested lists (one inner list per row of U).
svd_rows = U.tolist()
with open("data/svd.json", "w") as svd_file:
    json.dump(svd_rows, svd_file)