In [1]:
import pandas as pd
import json
import sqlite3
import numpy as np
from datetime import datetime

# Load in the data

In [2]:
recipes = pd.read_csv("../data/RecSys/raw-data_recipe.csv")

In [3]:
interactions = pd.read_csv("../data/RecSys/raw-data_interaction.csv")

# Format Recipes

## Rename Columns

I rename some of the columns for ease. I won't be using the reviews here so dropping that.

In [4]:
recipes = recipes.rename(columns={'recipe_id':'id', 'recipe_name':'name', 'aver_rate':'ave_rating', 'cooking_directions':'directions', 'nutritions':'nutrients'})
recipes = recipes.drop('reviews', axis=1)
recipes.head()

Unnamed: 0,id,name,ave_rating,image_url,review_nums,ingredients,directions,nutrients
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,{'directions': u'Prep\n5 m\nCook\n2 h 45 m\nRe...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


## Cooking Directions

Format the directions. See the effect.

In [5]:
directions = [ x.replace("{'directions': u", "").replace("'}", '') for x in recipes.directions ]
recipes['directions'] = directions

In [6]:
recipes.head()

Unnamed: 0,id,name,ave_rating,image_url,review_nums,ingredients,directions,nutrients
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,'Prep\n5 m\nCook\n2 h 45 m\nReady In\n11 h 50 ...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,'Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45 ...,"{u'niacin': {u'hasCompleteData': False, u'name..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehea...","{u'niacin': {u'hasCompleteData': True, u'name'..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,'Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehea...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,'Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\nP...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


## Ingredients

Format the ingredients. It has ^ to separate the individual ingredients. Want to use ',' instead. There are 2 current instances of commas so just replace those with a ' '.

So first find and replace the existing commas.

In [7]:
any_commas = np.array([ x.find(',') for x in recipes.ingredients ])
np.sum(any_commas > (-1))

2

In [8]:
# Where are those commas
for x in recipes.ingredients[any_commas > (-1)]:
    print(x.split('^'))

['chicken', 'chicken broth', 'large carrots', 'celery', 'large turnip,cut into 1/4-inch dice', 'parsnips cut into 1/4-inch dice', 'leek chopped', 'onion', 'chopped fresh dill', 'matzo ball mix', 'eggs', 'vegetable oil', 'water', 'salt and ground black pepper to taste']
['Corn "Cream":', 'ears fresh white corn', 'chicken broth', 'Pasta:', 'olive oil', 'diced bacon', 'diced sweet red pepper', 'diced zucchini', 'salt and pepper to taste', 'cayenne pepper', 'cavatelli pasta', 'corn "cream,"', 'halved sweet cherry tomatoes', 'chopped Italian parsley', 'finely sliced fresh basil leaves', 'grated Parmigiano-Reggiano cheese']


In [9]:
for ind in np.nonzero(any_commas > (-1))[0]:
    recipes.loc[ind,"ingredients"] = recipes.loc[ind,"ingredients"].replace(",", " ")

Replace out all ^ with ,

In [10]:
recipes['ingredients'] = [ x.replace('^', ', ') for x in recipes.ingredients ]

In [11]:
recipes.head()

Unnamed: 0,id,name,ave_rating,image_url,review_nums,ingredients,directions,nutrients
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,"pork belly, smoked paprika, kosher salt, groun...",'Prep\n5 m\nCook\n2 h 45 m\nReady In\n11 h 50 ...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,"sauerkraut drained, Granny Smith apples sliced...",'Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45 ...,"{u'niacin': {u'hasCompleteData': False, u'name..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,"chicken wings, sprigs rosemary, head garlic, o...","""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehea...","{u'niacin': {u'hasCompleteData': True, u'name'..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,"focaccia bread quartered, prepared basil pesto...",'Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehea...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,"red potatoes, strips bacon, Sauce:, heavy whip...",'Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\nP...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


## Format Interactions

This is mostly dealing with the dateLastModified column.

In [12]:
interactions.head()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
0,8542392,222388,5,2017-04-22T12:46:43.663\n
1,11174581,222388,5,2013-06-20T15:50:25.96\n
2,8262477,222388,5,2015-02-14T07:27:51.307\n
3,3574785,240488,5,2017-10-07T18:20:08.973\n
4,12145410,240488,2,2018-01-06T00:06:09.563\n


In [13]:
# Strip the \n at the end of dateLastModified
interactions['dateLastModified'] = [ x.strip('\n') for x in interactions.dateLastModified ]

* The date last modified includes time-scales faster than seconds. 
* Those microseconds aren't needed + are missing on many of the rows. 
* So we will restrict the date part to only a length of 19

In [14]:
# We can the length of the dateLastModified
lens = pd.Series([ len(x) for x in interactions.dateLastModified ])
lens.value_counts()

23    2278354
22    1360687
21     137670
19      17292
dtype: int64

In [15]:
# We want to limit the length to 19
interactions['dateLastModified'] = [ x[:19] for x in interactions.dateLastModified ]

In [16]:
# Convert datetime into integer
interactions['dateLastModified'] = [ datetime.fromisoformat(x).strftime('%s') for x in interactions.dateLastModified ]

In [17]:
interactions.head()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
0,8542392,222388,5,1492879603
1,11174581,222388,5,1371757825
2,8262477,222388,5,1423916871
3,3574785,240488,5,1507414808
4,12145410,240488,2,1515215169


## Format Nutrients

## Compile Nutrients Separately

In [19]:
def convert_nutrients_row(recipe_id, nutrients):
    """
    Takes in a json string of the nutrients for a given recipe. 
    Returns a dataframe with rows being each nutrient + columns giving the information.
    """
    # Clean
    nutrients_clean = nutrients.replace("'", '"').replace("u","").replace("True","true").replace("False","false").replace("Tre","true")
    # To Dist
    d = json.loads(nutrients_clean)
    # To DataFrame
    df = pd.DataFrame(list(d.values()))
    # Rename
    df = df.rename(columns={'hasCompleteData': 'complete', 'amont': 'value', 'percentDailyVale': 'percent_daily_value', 
                       'displayVale': 'display_value', 'nit': 'value_unit'})
    # Add the recipe
    df.insert(0, 'recipe_id', recipe_id)
    return df

# Check if one recipe works
convert_nutrients_row(recipes.id[0], recipes.nutrients[0]).head()

Unnamed: 0,recipe_id,complete,name,value,percent_daily_value,display_value,value_unit
0,222388,False,Niacin Eqivalents,9.319291,72,9,mg
1,222388,True,Sgars,0.093559,0,0.1,g
2,222388,True,Sodim,2017.13,81,2017,mg
3,222388,True,Carbohydrates,1.797819,< 1,1.8,g
4,222388,False,Vitamin B6,0.23298,15,< 1,mg


In [20]:
lst_nutrients = []
bad_inds = [] # Nutrient information isn't available for these ids
for index,row in recipes.iterrows():
    #print(index)
    try:
        df_nutrients = convert_nutrients_row(row['id'], row['nutrients'])
        lst_nutrients.append(df_nutrients)
    except:
        bad_inds.append(index)

In [21]:
# Some of the recipes are missing nutrient information as shown here
recipes.iloc[bad_inds[10],:]['nutrients']

"{u'niacin': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'sugars': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'sodium': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'carbohydrates': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'vitaminB6': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'calories': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'thiamin': {u'hasCompleteData': False, u'name': None, u'amount': 0.0, u'percentDailyValue': None, u'displayValue': None, u'unit': None}, u'fat': {u'hasCompleteData

In [22]:
# Prop of recipes with nutrient info
np.round(len(lst_nutrients)/(len(lst_nutrients) + len(bad_inds)), 2)

0.98

In [23]:
# Nutrients
nutrients = pd.concat(lst_nutrients)
nutrients.head()

Unnamed: 0,recipe_id,complete,name,value,percent_daily_value,display_value,value_unit
0,222388,False,Niacin Eqivalents,9.319291,72,9,mg
1,222388,True,Sgars,0.093559,0,0.1,g
2,222388,True,Sodim,2017.13,81,2017,mg
3,222388,True,Carbohydrates,1.797819,< 1,1.8,g
4,222388,False,Vitamin B6,0.23298,15,< 1,mg


## Format some nutrient columns

In [24]:
# Fix the percent value
nutrients.loc[nutrients.percent_daily_value == '< 1', 'percent_daily_value'] = '0'
nutrients.loc[nutrients.percent_daily_value == '-', 'percent_daily_value'] = np.NaN
nutrients = nutrients.astype({'percent_daily_value': 'float32'})

In [31]:
# I want to add a nutrient id => labels
labels, levels = pd.factorize(nutrients.name)
labels += 1
## like R's factor function
print(labels)
print(levels)
## Add the labels as a new column
nutrients.insert(0, 'nutrient_id', labels)

[ 1  2  3 ... 18 19 20]
Index(['Niacin Eqivalents', 'Sgars', 'Sodim', 'Carbohydrates', 'Vitamin B6',
       'Calories', 'Thiamin', 'Fat', 'Folate', 'Calories from Fat', 'Calcim',
       'Dietary Fiber', 'Magnesim', 'Iron', 'Cholesterol', 'Protein',
       'Vitamin A - IU', 'Potassim', 'Satrated Fat', 'Vitamin C'],
      dtype='object')


ValueError: cannot insert nutrient_id, already exists

In [32]:
nutrients.head()

Unnamed: 0,nutrient_id,recipe_id,complete,name,value,percent_daily_value,display_value,value_unit
0,1,222388,False,Niacin Eqivalents,9.319291,72.0,9,mg
1,2,222388,True,Sgars,0.093559,0.0,0.1,g
2,3,222388,True,Sodim,2017.13,81.0,2017,mg
3,4,222388,True,Carbohydrates,1.797819,0.0,1.8,g
4,5,222388,False,Vitamin B6,0.23298,15.0,< 1,mg


In [33]:
# Remove the nutrients from the recipes
recipes = recipes.drop('nutrients', axis=1)
recipes.head()

Unnamed: 0,id,name,ave_rating,image_url,review_nums,ingredients,directions
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,"pork belly, smoked paprika, kosher salt, groun...",'Prep\n5 m\nCook\n2 h 45 m\nReady In\n11 h 50 ...
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,"sauerkraut drained, Granny Smith apples sliced...",'Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45 ...
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,"chicken wings, sprigs rosemary, head garlic, o...","""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehea..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,"focaccia bread quartered, prepared basil pesto...",'Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehea...
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,"red potatoes, strips bacon, Sauce:, heavy whip...",'Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\nP...


# Send to SQL

In [35]:
conn = sqlite3.connect('../data/recsys.db')
c = conn.cursor()

In [41]:
c.execute('DROP TABLE IF EXISTS recipes;')
c.execute('DROP TABLE IF EXISTS interactions;')
c.execute('DROP TABLE IF EXISTS nutrients;')
conn.commit()

In [42]:
# Create table for recipes
rec_sql = '''CREATE TABLE recipes(
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                ave_rating REAL,
                review_nums INTEGER,
                image_url TEXT,
                ingredients TEXT,
                directions TEXT)'''

int_sql = '''CREATE TABLE interactions(
                user_id INTEGER NOT NULL,
                recipe_id INTEGER NOT NULL, 
                rating INTEGER NOT NULL, 
                date_mod INTEGER NOT NULL,
                PRIMARY KEY (user_id, recipe_id),
                FOREIGN KEY (recipe_id) REFERENCES recipes (id) 
                    ON DELETE CASCADE ON UPDATE NO ACTION)'''

nutr_sql = '''CREATE TABLE nutrients(
                nutrient_id INTEGER NOT NULL, 
                recipe_id INTEGER NOT NULL, 
                complete INTEGER DEFAULT 0, 
                name TEXT NOT NULL, 
                value REAL NOT NULL, 
                percent_daily_value INTEGER NOT NULL, 
                display_value TEXT NOT NULL, 
                value_unit TEXT NOT NULL, 
                PRIMARY KEY (nutrient_id, recipe_id), 
                FOREIGN KEY (recipe_id) REFERENCES recipes (id)
                    ON DELETE CASCADE ON UPDATE NO ACTION)'''

c.execute(rec_sql)
c.execute(int_sql)
c.execute(nutr_sql)

conn.commit()

In [43]:
recipes.to_sql('recipes', conn, if_exists='replace', index=False)

In [44]:
interactions.to_sql('interactions', conn, if_exists='replace', index=False)

In [45]:
nutrients.to_sql('nutrients', conn, if_exists='replace', index=False)