# Generate the table

In our dataset, there's no data on the users, but we do have the tables of interactions between users and recipes (rating).  
We can generate a table of users by extracting the users from the interactions tables.  

The users table defined as follows:
```sql
-- User table
CREATE TABLE dbs.user (
    user_id INT PRIMARY KEY,
    first_name VARCHAR(45),
    last_name VARCHAR(45),
    date_of_birth DATE,
    weight TINYINT,
    height TINYINT,
    sex ENUM('M', 'F'),
    email VARCHAR(255) UNIQUE NOT NULL,
    password_hash VARCHAR(60) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);
```

# 0. Importing the necessary libraries

In [1]:
!pip install pandas Faker tqdm bcrypt



In [2]:
import os
import ast
import json
import bcrypt
import numpy as np
import pandas as pd
from tqdm import tqdm

from faker import Faker

os.chdir(r"C:\Users\gilbe\Documents\Academics\M.Sc\Year 4\Semester A\DB Workshop")

In [3]:
path = "raw-data_interaction.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
0,8542392,222388,5,2017-04-22T12:46:43.663\n
1,11174581,222388,5,2013-06-20T15:50:25.96\n
2,8262477,222388,5,2015-02-14T07:27:51.307\n
3,3574785,240488,5,2017-10-07T18:20:08.973\n
4,12145410,240488,2,2018-01-06T00:06:09.563\n


In [4]:
# pd.to_datetime(df["dateLastModified"].str[:-1], format="%Y-%m-%dT%H:%M:%S.%f")
df["datetime"] = pd.to_datetime(df["dateLastModified"].str[:-1], format="mixed")

In [5]:
df["datetime"].describe()

count                          3794003
mean     2010-10-22 14:10:57.267878656
min         2000-02-07 21:06:30.137000
25%         2008-02-25 06:23:30.880000
50%      2010-10-25 12:15:48.150000128
75%      2013-12-24 06:31:24.813499904
max         2018-03-16 07:19:52.260000
Name: datetime, dtype: object

In [6]:
# Let's move the dates forward, so the last modified date is today
offset = pd.to_datetime("today") - df["datetime"].max()
df["datetime"] = df["datetime"] + offset

In [7]:
df["datetime"].describe()

count                          3794003
mean     2017-09-01 02:56:54.697124352
min         2006-12-18 09:52:27.566245
25%      2015-01-04 19:09:28.309244928
50%      2017-09-04 01:01:45.579245056
75%      2020-11-02 19:17:22.242744832
max         2025-01-23 20:05:49.689245
Name: datetime, dtype: object

In [8]:
df["user_id"].value_counts().describe().to_frame().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,1160267.0,3.26994,13.54242,1.0,1.0,1.0,2.0,4003.0


In [9]:
from typing import Optional, TypedDict

class User(TypedDict):
    user_id: int
    name: str
    date_of_birth: str
    weight: int
    height: int
    sex: str
    email: str
    password_hash: str
    created_at: str
    updated_at: Optional[str]

# 1. Simulate the user data

In [10]:
def generate_date_of_birth(created_at: pd.Series, mean_age: float = 70, std_age: float = 27, size: int = None) -> pd.Series:
    """Generate random dates of birth using normal distribution with mean age.
    
    Args:
        created_at: Series of user account creation dates
        mean_age: Mean age in years (default 70 based on data)
        std_age: Standard deviation of age in years (default 27 based on data)
        size: Number of dates to generate (default None, uses length of created_at)

    Returns:
        Series of birth dates as YYYY-MM-DD strings
    """
    if size is None:
        size = len(created_at)
        
    # Sample ages from normal distribution, ensuring minimum age of 18
    ages = np.maximum(18, np.random.normal(mean_age, std_age, size))
    
    # Ensure created_at and ages have same length before calculating birth dates
    if len(created_at) != len(ages):
        ages = ages[:len(created_at)]
    birth_dates = created_at - pd.TimedeltaIndex(ages.astype(int) * 365.25, unit='D')
    
    return birth_dates.dt.strftime("%Y-%m-%d")

For each user, we will generate: name, email, password_hash, phone, address, date_of_birth, weight, height and sex


In [11]:
user_ids = df["user_id"].unique()
print(f"Number of Users: {len(user_ids)}")

Number of Users: 1160267


1.16M users is too big dataset for this project, let's sample only 200K

In [12]:
# Sample 200K random user IDs
np.random.seed(42)
user_ids = np.random.choice(user_ids, size=20_000, replace=False)


In [13]:
df = df[df["user_id"].isin(user_ids)]

In [14]:
users = dict()
fake = Faker()
Faker.seed(1337)

password = "123456"
password_hash = bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode()

weight_params = {
    "M": {"mean": 80, "std": 10},
    "F": {"mean": 60, "std": 10},
}
height_params = {
    "M": {"mean": 180, "std": 10}, 
    "F": {"mean": 165, "std": 10},
}

In [15]:
# Pre-calculate first interaction dates for all users
first_interactions = df.groupby("user_id")["datetime"].min()

In [16]:
# Generate random days offset vector for all users at once
days_offset = np.random.randint(1, 31, size=len(user_ids))

# Generate sex for all users at once 
sexes = np.random.choice(["M", "F"], size=len(user_ids))

# Generate weights and heights vectors
weights = np.where(sexes == "M", 
                  np.random.normal(weight_params["M"]["mean"], weight_params["M"]["std"], len(user_ids)),
                  np.random.normal(weight_params["F"]["mean"], weight_params["F"]["std"], len(user_ids)))

heights = np.where(sexes == "M",
                  np.random.normal(height_params["M"]["mean"], height_params["M"]["std"], len(user_ids)), 
                  np.random.normal(height_params["F"]["mean"], height_params["F"]["std"], len(user_ids)))

In [17]:
# Pre-generate names and emails
names = [fake.name() for _ in tqdm(range(len(user_ids)))]

  3%|▎         | 620/20000 [00:00<00:07, 2689.83it/s]

100%|██████████| 20000/20000 [00:04<00:00, 4101.87it/s]


In [18]:
dates_of_birth = generate_date_of_birth(created_at=first_interactions, mean_age=30, std_age=3, size=len(user_ids))

In [19]:
for i, user_id in enumerate(tqdm(user_ids)):
    created_at = first_interactions[user_id] - pd.Timedelta(days=days_offset[i])
    email = f"{user_id}@example.com"
    
    user = {
        "user_id": int(user_id),
        "name": names[i],
        "email": email, 
        "password_hash": password_hash,
        "date_of_birth": dates_of_birth.iloc[i],
        "weight": weights[i],
        "height": heights[i],
        "sex": sexes[i],
        "created_at": created_at.strftime("%Y-%m-%d"),
    }
    users[int(user_id)] = user

100%|██████████| 20000/20000 [00:01<00:00, 10051.08it/s]


In [20]:
with open("users.json", "w") as f:
    json.dump(users, f, indent=4)

In [21]:
with open("users.json", "r") as f:
    users_int = json.load(f)


In [22]:
user_df = pd.DataFrame(users_int).T
user_df = user_df.reset_index(drop=True)
user_df.head()

Unnamed: 0,user_id,name,email,password_hash,date_of_birth,weight,height,sex,created_at
0,12657644,Kim Diaz,12657644@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1973-01-06,78.765213,180.178011,M,2020-10-02
1,2107266,Carol Holloway,2107266@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1976-03-02,102.44985,183.788793,M,2015-10-12
2,2746591,Richard Dorsey,2746591@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1978-05-01,54.18413,167.45474,F,2019-03-03
3,1972959,Willie Harris,1972959@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1978-04-07,101.561287,170.823651,M,2015-02-05
4,4541271,Susan Rodriguez,4541271@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1977-12-19,74.976382,173.127955,M,2020-12-11


In [23]:
user_df["weight"] = user_df["weight"].astype(int)
user_df["height"] = user_df["height"].astype(int)

In [24]:
(2025 - pd.to_datetime(user_df["date_of_birth"]).dt.year).describe().to_frame().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
date_of_birth,20000.0,36.85595,5.285508,21.0,33.0,37.0,40.0,56.0


In [25]:
user_df.to_csv("users.csv", index=False)

In [26]:
user_df = pd.read_csv("users.csv")
user_df.head()


Unnamed: 0,user_id,name,email,password_hash,date_of_birth,weight,height,sex,created_at
0,12657644,Kim Diaz,12657644@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1973-01-06,78,180,M,2020-10-02
1,2107266,Carol Holloway,2107266@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1976-03-02,102,183,M,2015-10-12
2,2746591,Richard Dorsey,2746591@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1978-05-01,54,167,F,2019-03-03
3,1972959,Willie Harris,1972959@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1978-04-07,101,170,M,2015-02-05
4,4541271,Susan Rodriguez,4541271@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1977-12-19,74,173,M,2020-12-11


# 2. Generate the other tables

## 2.1. load the processed data: nutritions, recipes and contains

In [27]:
nutritions_df = pd.read_csv("nutritions.csv")
nutritions_df.head()

Unnamed: 0,name,unit,average_daily_value
0,Carbohydrates,g,300.3424
1,Niacin Equivalents,mg,12.706413
2,Dietary Fiber,g,24.048217
3,Thiamin,mg,1.099229
4,Vitamin A - IU,IU,4834.5025


In [28]:
recipes_df = pd.read_csv("recipes.csv")
recipes_df.head()


Unnamed: 0,recipe_id,recipe_name,total_time,image,directions,ingredients
0,240488,"Pork Loin, Apples, and Sauerkraut",165,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45...",sauerkraut drained^Granny Smith apples sliced^...
1,218939,Foolproof Rosemary Chicken Wings,60,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehe...",chicken wings^sprigs rosemary^head garlic^oliv...
2,87211,Chicken Pesto Paninis,20,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehe...",focaccia bread quartered^prepared basil pesto^...
3,245714,Potato Bacon Pizza,70,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\n...",red potatoes^strips bacon^Sauce:^heavy whippin...
4,218545,Latin-Inspired Spicy Cream Chicken Stew,505,https://images.media-allrecipes.com/userphotos...,"""Prep\n10 m\nCook\n8 h 15 m\nReady In\n8 h 25...",skinless boneless chicken breast halves^diced ...


In [29]:
contains_df = pd.read_csv("contains.csv")
contains_df.head()

Unnamed: 0,recipe_id,nutrition_name,amount
0,240488,Vitamin C,52.76848
1,218939,Vitamin C,5.307448
2,87211,Vitamin C,18.01502
3,245714,Vitamin C,0.905797
4,218545,Vitamin C,14.06487


In [30]:
# validate that all recipe_ids in the contains table are in the recipes table
assert set(contains_df["recipe_id"].unique()) == set(recipes_df["recipe_id"].unique())


## 2.2. Create the rating table

In [31]:
interactions_df = pd.read_csv("raw-data_interaction.csv")
interactions_df.head()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
0,8542392,222388,5,2017-04-22T12:46:43.663\n
1,11174581,222388,5,2013-06-20T15:50:25.96\n
2,8262477,222388,5,2015-02-14T07:27:51.307\n
3,3574785,240488,5,2017-10-07T18:20:08.973\n
4,12145410,240488,2,2018-01-06T00:06:09.563\n


In [32]:
# validate all recipe_ids and user_ids are in the recipes and users tables
if not set(interactions_df["recipe_id"].unique()) == set(recipes_df["recipe_id"].unique()):
    print("Some recipe_ids are not in the recipes table")
    interactions_df = interactions_df[interactions_df["recipe_id"].isin(recipes_df["recipe_id"])]
if not set(interactions_df["user_id"].unique()) == set(user_df["user_id"].unique()):
    print("Some user_ids are not in the users table")
    interactions_df = interactions_df[interactions_df["user_id"].isin(user_df["user_id"])]

Some recipe_ids are not in the recipes table
Some user_ids are not in the users table


In [33]:
interactions_df["datetime"] = pd.to_datetime(interactions_df["dateLastModified"].str[:-1], format="mixed")
interactions_df["datetime"].describe()

count                            65006
mean     2010-12-02 12:25:35.379311360
min         2000-02-09 02:22:02.627000
25%      2008-02-27 14:54:50.147749888
50%         2010-11-23 20:14:57.728000
75%      2014-03-23 10:06:42.360749824
max         2018-03-14 21:54:12.470000
Name: datetime, dtype: object

In [34]:
# Let's move the dates forward, so the last modified date is today
offset = pd.to_datetime("today") - interactions_df["datetime"].max()
interactions_df["datetime"] = interactions_df["datetime"] + offset

# Get current month start and end dates
current_month_start = pd.to_datetime("today").replace(day=1)
current_month_end = pd.to_datetime("today")

# Calculate how many interactions we need in current month (10% of total)
num_interactions = len(interactions_df)
num_current_month = int(num_interactions * 0.1)

# Randomly select interactions to move to current month
current_month_mask = np.random.choice([True, False], size=len(interactions_df), p=[0.1, 0.9])
current_month_dates = pd.date_range(start=current_month_start, end=current_month_end, periods=num_current_month)

# Update dates for selected interactions
interactions_df.loc[current_month_mask, "datetime"] = np.random.choice(current_month_dates, size=current_month_mask.sum())

interactions_df["datetime"].describe()

count                            65006
mean     2018-07-07 08:26:22.660603648
min         2006-12-21 00:34:21.917931
25%      2015-06-10 17:07:28.523430912
50%      2018-04-24 16:30:20.682430976
75%      2022-07-03 05:27:03.613180928
max         2025-01-23 20:06:31.760931
Name: datetime, dtype: object

In [35]:
interactions_df = interactions_df.drop(columns=["dateLastModified"])
interactions_df.rename(
    columns={
        "datetime": "created_at",
    },
    inplace=True,
)
interactions_df.to_csv("rating.csv", index=False)

In [36]:
del interactions_df

In [37]:
rating_df = pd.read_csv("rating.csv")
rating_df.head()

Unnamed: 0,user_id,recipe_id,rating,created_at
0,9823884,240488,5,2024-11-19 00:11:35.560931000
1,16087568,87211,5,2023-09-04 13:41:45.650931000
2,5517874,87211,5,2025-01-04 11:53:05.237518570
3,2971638,87211,4,2017-08-01 18:51:54.187931000
4,8028715,218545,5,2021-03-12 21:33:08.050931000


## 2.3. Create the diet table


In [38]:
diets_keywords = {
    "vegan": [
        "chicken",
        "beef",
        "meat",
        "egg",
        "milk",
        "cheese",
        "yogurt",
        "cream",
        "butter",
        "fish",
        "seafood",
        "honey",
        "fish",
        "seafood",
        "gelatin",
        "lard",
        "bacon",
        "pork",
        "ham",
    ],
    "vegetarian": [
        "chicken",
        "beef",
        "meat",
        "fish",
        "seafood",
        "gelatin",
        "lard",
        "bacon",
        "pork",
        "ham",
    ],
    "gluten_free": [
        "wheat",
        "flour",
        "bread",
        "pasta",
        "barley",
        "rye",
        "oats",
        "breadcrumbs",
        "soy sauce",
    ],
    "dairy_free": [
        "milk",
        "cheese",
        "yogurt",
        "cream",
        "butter",
        "whey",
        "casein",
    ],
    "nut_allergy": [
        "peanuts",
        "almonds",
        "cashews",
        "walnuts",
        "pecans",
        "pistachios",
        "hazelnuts",
        "macadamia",
        "pine nuts",
    ],
    "shellfish_allergy": [
        "shrimp",
        "crab",
        "lobster",
        "clams",
        "mussels",
        "oysters",
        "scallops",
        "crawfish",
    ],
    "egg_allergy": ["egg", "mayonnaise", "meringue", "custard", "hollandaise", "aioli"],
}

In [39]:
diet_df = pd.DataFrame()
diet_df["name"] = diets_keywords.keys()
diet_df["keywords"] = diets_keywords.values()
diet_df["keywords"] = diet_df["keywords"].apply(lambda x: json.dumps(x))
diet_df = diet_df.reset_index().rename(columns={"index": "diet_id"})
diet_df["diet_id"] = diet_df["diet_id"] + 1
diet_df.to_csv("diet.csv", index=False)


In [40]:
diet_df = pd.read_csv("diet.csv")
diet_df["keywords"] = diet_df["keywords"].apply(json.loads)
diet_df.head()

Unnamed: 0,diet_id,name,keywords
0,1,vegan,"[chicken, beef, meat, egg, milk, cheese, yogur..."
1,2,vegetarian,"[chicken, beef, meat, fish, seafood, gelatin, ..."
2,3,gluten_free,"[wheat, flour, bread, pasta, barley, rye, oats..."
3,4,dairy_free,"[milk, cheese, yogurt, cream, butter, whey, ca..."
4,5,nut_allergy,"[peanuts, almonds, cashews, walnuts, pecans, p..."


## 2.4. Create the fits table


In [41]:
recipes_df.head()

Unnamed: 0,recipe_id,recipe_name,total_time,image,directions,ingredients
0,240488,"Pork Loin, Apples, and Sauerkraut",165,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45...",sauerkraut drained^Granny Smith apples sliced^...
1,218939,Foolproof Rosemary Chicken Wings,60,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehe...",chicken wings^sprigs rosemary^head garlic^oliv...
2,87211,Chicken Pesto Paninis,20,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehe...",focaccia bread quartered^prepared basil pesto^...
3,245714,Potato Bacon Pizza,70,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\n...",red potatoes^strips bacon^Sauce:^heavy whippin...
4,218545,Latin-Inspired Spicy Cream Chicken Stew,505,https://images.media-allrecipes.com/userphotos...,"""Prep\n10 m\nCook\n8 h 15 m\nReady In\n8 h 25...",skinless boneless chicken breast halves^diced ...


In [42]:
recipe_diets = set()
for recipe_id, ingredients in zip(recipes_df["recipe_id"], recipes_df["ingredients"]):
    for i, (diet_id, diet_name, keywords) in diet_df.iterrows():
        if any(keyword in ingredients for keyword in keywords):
            continue
        recipe_diets.add((recipe_id, diet_id))
fits_df = pd.DataFrame(recipe_diets, columns=["recipe_id", "diet_id"])
fits_df.to_csv("fits.csv", index=False)
fits_df.head()


Unnamed: 0,recipe_id,diet_id
0,144410,7
1,240418,5
2,240022,5
3,254213,6
4,223354,2


In [43]:
fits_df = pd.read_csv("fits.csv")
fits_df.head()

Unnamed: 0,recipe_id,diet_id
0,144410,7
1,240418,5
2,240022,5
3,254213,6
4,223354,2


In [44]:
if 0 in fits_df["diet_id"].unique():
    fits_df["diet_id"] = fits_df["diet_id"] + 1

## 2.5. Create the eats table

This table will be created by joining the rating table and the users table.
we'll set an assumption that every user rate 20-60% of the recipes he/she has seen.

So we have two steps:
1. Randomly select 20-60% of the recipes for each user
2. Create the eats table by joining the rating table and the users table

The eats table will have the following columns:
- user_id
- recipe_id
- created_at

In [45]:
# the east table based on the rating table and the users table
# if the user has rated the recipe, he/she has eaten it ~2-3 hours before the rating

eats_df = rating_df.copy()
eats_df["created_at"] = pd.to_datetime(eats_df["created_at"]) - pd.Timedelta(hours=2)
eats_df.drop(columns=["rating"], inplace=True)
eats_df.to_csv("eats.csv", index=False)
eats_df.head()


Unnamed: 0,user_id,recipe_id,created_at
0,9823884,240488,2024-11-18 22:11:35.560931000
1,16087568,87211,2023-09-04 11:41:45.650931000
2,5517874,87211,2025-01-04 09:53:05.237518570
3,2971638,87211,2017-08-01 16:51:54.187931000
4,8028715,218545,2021-03-12 19:33:08.050931000


In [46]:
eats_df = pd.read_csv("eats.csv")
eats_df.head()

Unnamed: 0,user_id,recipe_id,created_at
0,9823884,240488,2024-11-18 22:11:35.560931000
1,16087568,87211,2023-09-04 11:41:45.650931000
2,5517874,87211,2025-01-04 09:53:05.237518570
3,2971638,87211,2017-08-01 16:51:54.187931000
4,8028715,218545,2021-03-12 19:33:08.050931000


In [47]:
# Get all unique users and recipes once
unique_users = eats_df["user_id"].unique()
all_recipes = np.array(list(recipes_df["recipe_id"].unique()))

# Create mapping of user_id to recipes eaten for faster lookup
user_recipes_map = eats_df.groupby("user_id")["recipe_id"].apply(set).to_dict()

# Create mapping of user_id to creation date for faster lookup
user_created_map = user_df.set_index("user_id")["created_at"].to_dict()

# Pre-calculate now timestamp
now = pd.Timestamp.now().timestamp()

In [48]:
# Pre-allocate list with estimated size
n_users = len(unique_users)
additional_eats_size = n_users * 3  # Average of 3 new recipes per user
additional_eats = []

# Convert to numpy arrays for faster operations
all_recipes_set = set(all_recipes)
now_ts = now

# For each user with progress bar
for user_id in tqdm(unique_users, desc="Processing users"):
    # Get recipes this user has already eaten from map
    user_recipes = user_recipes_map.get(user_id, set())
    
    # Get available recipes using set difference (faster than np.isin)
    available_recipes = list(all_recipes_set - user_recipes)
    
    if available_recipes:
        # Sample 1-5 new recipes for this user to eat
        n_new_recipes = min(np.random.randint(1, 6), len(available_recipes))
        new_recipes = np.random.choice(available_recipes, size=n_new_recipes, replace=False)
        
        # Get user creation time from map and convert to timestamp
        user_created = pd.to_datetime(user_created_map[user_id]).timestamp()
        
        # Generate timestamps
        timestamps = np.random.uniform(user_created, now_ts, size=n_new_recipes)
        
        # Extend list with new records
        additional_eats.extend({
            "user_id": user_id,
            "recipe_id": recipe_id,
            "created_at": pd.Timestamp.fromtimestamp(ts)
        } for recipe_id, ts in zip(new_recipes, timestamps))

# Create DataFrame from list of dicts
additional_eats_df = pd.DataFrame(additional_eats)

# Combine with existing data and sort
eats_df = pd.concat([eats_df, additional_eats_df], ignore_index=True)
eats_df.sort_values(["user_id", "created_at"], inplace=True)

# Save updated eats table
eats_df.to_csv("eats.csv", index=False)

Processing users: 100%|██████████| 19917/19917 [04:44<00:00, 70.06it/s]


In [49]:
print(eats_df.shape)
eats_df.head()

(124684, 3)


Unnamed: 0,user_id,recipe_id,created_at
98459,192,14307,2009-06-10 03:16:13.302292
98457,192,231749,2018-05-15 10:38:01.485992
98458,192,12062,2023-08-29 22:28:56.025623
98460,192,230025,2023-10-22 05:07:40.159263
28734,192,7286,2007-01-08 11:48:48.073931000


In [50]:
# validate that all recipe_ids in the eats table are in the recipes table
assert eats_df["recipe_id"].isin(set(recipes_df["recipe_id"].unique())).all()
# validate that all user_ids in the eats table are in the users table
assert eats_df["user_id"].isin(set(user_df["user_id"].unique())).all()


## 2.6. Create the follows table  

This table is the many-to-many relationship between users and diets.  
We'll populate this table by reviewing all the recipes that each user has eaten and then checking if the recipe fits any of the diets.


In [51]:
fits_df.head()

Unnamed: 0,recipe_id,diet_id
0,144410,7
1,240418,5
2,240022,5
3,254213,6
4,223354,2


In [52]:
# group the fits_df by recipe_id to get the list of diets for each recipe
diest_lists = fits_df.groupby("recipe_id")["diet_id"].apply(list).to_frame()
diest_lists.head()


Unnamed: 0_level_0,diet_id
recipe_id,Unnamed: 1_level_1
6663,"[3, 5, 2, 6]"
6664,"[6, 5, 2]"
6665,"[6, 4, 5, 2]"
6666,"[6, 4, 2]"
6667,"[6, 7, 5, 2]"


In [53]:
enriched_recipes_df = pd.merge(recipes_df, diest_lists.reset_index(), how="left", left_on="recipe_id", right_on="recipe_id")
enriched_recipes_df.head()


Unnamed: 0,recipe_id,recipe_name,total_time,image,directions,ingredients,diet_id
0,240488,"Pork Loin, Apples, and Sauerkraut",165,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n2 h 30 m\nReady In\n2 h 45...",sauerkraut drained^Granny Smith apples sliced^...,"[4, 5, 6, 3, 7]"
1,218939,Foolproof Rosemary Chicken Wings,60,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n40 m\nReady In\n1 h\nPrehe...",chicken wings^sprigs rosemary^head garlic^oliv...,"[3, 4, 5, 6, 7]"
2,87211,Chicken Pesto Paninis,20,https://images.media-allrecipes.com/userphotos...,"""Prep\n15 m\nCook\n5 m\nReady In\n20 m\nPrehe...",focaccia bread quartered^prepared basil pesto^...,"[5, 6, 7]"
3,245714,Potato Bacon Pizza,70,https://images.media-allrecipes.com/userphotos...,"""Prep\n20 m\nCook\n45 m\nReady In\n1 h 10 m\n...",red potatoes^strips bacon^Sauce:^heavy whippin...,"[5, 6, 7]"
4,218545,Latin-Inspired Spicy Cream Chicken Stew,505,https://images.media-allrecipes.com/userphotos...,"""Prep\n10 m\nCook\n8 h 15 m\nReady In\n8 h 25...",skinless boneless chicken breast halves^diced ...,"[3, 6, 7, 5]"


In [54]:
enriched_eats_df = pd.merge(eats_df, enriched_recipes_df, how="left", left_on="recipe_id", right_on="recipe_id")
enriched_eats_df.head()

Unnamed: 0,user_id,recipe_id,created_at,recipe_name,total_time,image,directions,ingredients,diet_id
0,192,14307,2009-06-10 03:16:13.302292,Broccoli-Cauliflower Salad,25,https://images.media-allrecipes.com/userphotos...,"""Prep\n10 m\nCook\n15 m\nReady In\n25 m\nPlac...",broccoli florets^cauliflower florets^hard-cook...,"[3, 6, 5]"
1,192,231749,2018-05-15 10:38:01.485992,Strawberry Raspberry Smoothie,10,https://images.media-allrecipes.com/userphotos...,"""Prep\n10 m\nReady In\n10 m\nBlend strawberri...",hulled fresh strawberries^frozen raspberries^l...,"[3, 5, 2, 6, 7]"
2,192,12062,2023-08-29 22:28:56.025623,Noodles Romanoff II,0,https://images.media-allrecipes.com/userphotos...,"""In a large pot with boiling salted water coo...",wide egg noodles^sour cream^grated Parmesan ch...,"[6, 5, 2, 3]"
3,192,230025,2023-10-22 05:07:40.159263,Tea Cakes,15,http://images.media-allrecipes.com/userphotos/...,"""Prep\n10 m\nCook\n5 m\nReady In\n15 m\nPrehe...",butter^white sugar^all-purpose flour^vanilla e...,"[2, 6, 7, 5]"
4,192,7286,2007-01-08 11:48:48.073931000,Creme de Menthe Cake I,0,http://images.media-allrecipes.com/userphotos/...,"""Prepare 1 box white cake mix as directed, ex...",white cake mix^creme de menthe liqueur^chocola...,"[1, 2, 5, 6, 7, 3, 4]"


In [55]:
# get the list of diets for each user
user_diet_eats = enriched_eats_df[enriched_eats_df["diet_id"].notna()].groupby("user_id")["diet_id"].apply(list).to_frame()
user_diet_eats.head()


Unnamed: 0_level_0,diet_id
user_id,Unnamed: 1_level_1
192,"[[3, 6, 5], [3, 5, 2, 6, 7], [6, 5, 2, 3], [2,..."
209,"[[6, 7, 3, 4, 5, 1, 2], [7, 2, 6, 3], [6, 3, 4..."
694,"[[2, 6], [6, 2]]"
758,"[[5, 2, 3, 6, 7], [5, 6, 7]]"
905,"[[3, 6, 7, 5, 2], [3, 2, 5, 6, 7], [2, 5, 6, 7..."


In [56]:
# Now let's create the follows table. If a user choose all of his recipes by specific diets, we will create a follows record for each diet.
def count_diet_eats(diet_ids_lists):
    if not diet_ids_lists:
        return set()
    follows = set()
    all_diets = set(diet_df["diet_id"])
    for diet_id in all_diets:
        if all(diet_id in diet_ids_list for diet_ids_list in diet_ids_lists):
            follows.add(diet_id)
    return follows

user_diet_eats["follows"] = user_diet_eats["diet_id"].apply(count_diet_eats)
user_diet_eats.head()

Unnamed: 0_level_0,diet_id,follows
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
192,"[[3, 6, 5], [3, 5, 2, 6, 7], [6, 5, 2, 3], [2,...","{5, 6}"
209,"[[6, 7, 3, 4, 5, 1, 2], [7, 2, 6, 3], [6, 3, 4...",{6}
694,"[[2, 6], [6, 2]]","{2, 6}"
758,"[[5, 2, 3, 6, 7], [5, 6, 7]]","{5, 6, 7}"
905,"[[3, 6, 7, 5, 2], [3, 2, 5, 6, 7], [2, 5, 6, 7...","{5, 6}"


In [57]:
user_diet_eats["follows"].apply(len).describe().to_frame().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
follows,19917.0,1.876387,1.038176,0.0,1.0,2.0,2.0,7.0


In [58]:
user_diet_eats.head().reset_index()[["user_id", "follows"]].explode("follows").dropna()

Unnamed: 0,user_id,follows
0,192,5
0,192,6
1,209,6
2,694,2
2,694,6
3,758,5
3,758,6
3,758,7
4,905,5
4,905,6


In [59]:
# Now create the follows table
follows_df = user_diet_eats.reset_index()[["user_id", "follows"]].explode("follows")
follows_df.rename(columns={"follows": "diet_id"}, inplace=True)
follows_df.dropna(inplace=True)
follows_df.to_csv("follows.csv", index=False)
follows_df.head()

Unnamed: 0,user_id,diet_id
0,192,5
0,192,6
1,209,6
2,694,2
2,694,6


In [60]:
follows_df = pd.read_csv("follows.csv")
follows_df.head()

Unnamed: 0,user_id,diet_id
0,192,5
1,192,6
2,209,6
3,694,2
4,694,6


## 2.7. Create the admin table


In [61]:
ADMIN_IDS = [2746591, 4541271, 12657644]
user_df[user_df["user_id"].isin(ADMIN_IDS)]

Unnamed: 0,user_id,name,email,password_hash,date_of_birth,weight,height,sex,created_at
0,12657644,Kim Diaz,12657644@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1973-01-06,78,180,M,2020-10-02
2,2746591,Richard Dorsey,2746591@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1978-05-01,54,167,F,2019-03-03
4,4541271,Susan Rodriguez,4541271@example.com,$2b$12$qnrwNNUhwDzKx9eEva3WeuXVNynSVOUTnAtynFK...,1977-12-19,74,173,M,2020-12-11


In [62]:
# Now create the admin table - columns are user_id, date_promoted (today)
admin_df = pd.DataFrame(
    {"user_id": ADMIN_IDS, "promoted_at": pd.Timestamp.now().strftime("%Y-%m-%d")}
)
admin_df.to_csv("admin.csv", index=False)
admin_df.head()

Unnamed: 0,user_id,promoted_at
0,2746591,2025-01-23
1,4541271,2025-01-23
2,12657644,2025-01-23


In [63]:
admin_df = pd.read_csv("admin.csv")
admin_df.head()

Unnamed: 0,user_id,promoted_at
0,2746591,2025-01-23
1,4541271,2025-01-23
2,12657644,2025-01-23


## 2.8. Create the user_age_group table (tracks table)


In [78]:
daily_nutrition_df = pd.read_csv("daily_nutrition_values.csv")
daily_nutrition_df.head()


Unnamed: 0,age_group,sex,nutrition_name,recommended_daily_value
0,14-18,M,Dietary Fiber,28.0
1,14-18,F,Dietary Fiber,25.0
2,19-30,M,Dietary Fiber,34.0
3,19-30,F,Dietary Fiber,28.0
4,31-50,M,Dietary Fiber,31.0


In [66]:
def get_age_group(date_of_birth: str) -> str:
    age = 2025 - pd.to_datetime(date_of_birth).year
    if age < 18:
        return "14-18"
    elif age < 31:
        return "19-30"
    elif age < 51:
        return "31-50"
    else:
        return "51+"

In [67]:
age_groups = user_df["date_of_birth"].apply(get_age_group)
age_groups.value_counts()


date_of_birth
31-50    17663
19-30     2216
51+        121
Name: count, dtype: int64

In [71]:
user_df_age = user_df[["user_id"]].copy()
user_df_age["age_group"] = age_groups
user_df_age.head()

Unnamed: 0,user_id,age_group
0,12657644,51+
1,2107266,31-50
2,2746591,31-50
3,1972959,31-50
4,4541271,31-50


In [79]:
user_df_age.to_csv("user_age_group.csv", index=False)

In [81]:
# verify that all user_ids in the user_nutrition table are in the users table
assert user_df_age["user_id"].isin(set(user_df["user_id"].unique())).all()
assert user_df_age["age_group"].isin(set(daily_nutrition_df["age_group"].unique())).all()

# 3. Create the database

In [82]:
import mysql.connector

In [89]:
mysql_user = "root"
mysql_password = "root"
mysql_host = "localhost"
mysql_port = 3306
mysql_database = "dbs"

# Create the database
conn = mysql.connector.connect(
    host=mysql_host,
    port=mysql_port,
    user=mysql_user,
    password=mysql_password,
    database=mysql_database,
    time_zone='+00:00'
)

cur = conn.cursor()
# test the connection
cur.execute("SHOW DATABASES")
print(cur.fetchall())

[('dbs',), ('information_schema',), ('mysql',), ('performance_schema',), ('sakila',), ('sys',), ('world',)]


In [90]:
# list the tables
cur.execute("SHOW TABLES")
print(cur.fetchall())

[('admin',), ('contains',), ('diet',), ('eats',), ('fits',), ('nutrition',), ('nutrition_per_age',), ('rating',), ('recipe',), ('user',), ('user_age_group',), ('user_diet',)]


In [91]:
table_names_to_df = {
    "admin": admin_df,
    "contains": contains_df,
    "diet": diet_df,
    "eats": eats_df,
    "fits": fits_df,
    "nutrition": nutritions_df,
    "rating": rating_df,
    "recipe": recipes_df,
    "user": user_df,
    "user_diet": follows_df,
    "nutrition_per_age": daily_nutrition_df,
    "user_age_group": user_df_age,
}

In [92]:
# First, let's validate the columns of the tables
for table_name, df in table_names_to_df.items():
    columns = ", ".join([f"`{col}`" for col in df.columns])
    query = f"SHOW COLUMNS FROM {table_name}"
    cur.execute(query)
    db_columns = [col[0] for col in cur.fetchall()]
    for col in df.columns:
        if col not in db_columns:
            raise ValueError(f"Column {col} not found in database: {table_name}")
    


Now, let's insert the rest of the tables

In [93]:
tables_order = [
    "user",
    "recipe",
    "admin", 
    "rating",
    "diet",
    "nutrition",
    "contains",
    "eats",
    "fits",
    "user_diet",
    "nutrition_per_age",
    "user_age_group",

]

for table_name in tables_order:
    df = table_names_to_df[table_name]
    columns = ", ".join([f"`{col}`" for col in df.columns])
    values = ", ".join(["%s" for _ in df.columns])

    # Convert values to list of tuples for batch insert
    value_tuples = [
        tuple(str(val) if not isinstance(val, list) else json.dumps(val) 
              for val in row)
        for _, row in tqdm(df.iterrows(), desc="Generating row values", total=len(df))
    ]
    # Batch insert in chunks of 5000
    query = f"INSERT INTO {table_name} ({columns}) VALUES ({values})"
    chunk_size = 5000
    for i in tqdm(range(0, len(value_tuples), chunk_size), 
                  desc=f"Inserting {table_name} rows"):
        chunk = value_tuples[i:i + chunk_size]
        try:
            cur.executemany(query, chunk)
            conn.commit()
        except Exception as e:
            print(f"Error inserting chunk starting at row {i} into {table_name}: {e}")
            print(f"Query: {query}")
            raise e

Generating row values: 100%|██████████| 20000/20000 [00:02<00:00, 9455.00it/s]
Inserting user rows: 100%|██████████| 4/4 [00:02<00:00,  1.97it/s]
Generating row values: 100%|██████████| 45630/45630 [00:07<00:00, 6050.50it/s]
Inserting recipe rows: 100%|██████████| 10/10 [00:06<00:00,  1.50it/s]
Generating row values: 100%|██████████| 3/3 [00:00<00:00, 1501.36it/s]
Inserting admin rows: 100%|██████████| 1/1 [00:00<00:00, 105.01it/s]
Generating row values: 100%|██████████| 65006/65006 [00:07<00:00, 9260.79it/s] 
Inserting rating rows: 100%|██████████| 14/14 [00:05<00:00,  2.58it/s]
Generating row values: 100%|██████████| 7/7 [00:00<00:00, 2321.88it/s]
Inserting diet rows: 100%|██████████| 1/1 [00:00<00:00, 142.79it/s]
Generating row values: 100%|██████████| 20/20 [00:00<00:00, 5001.26it/s]
Inserting nutrition rows: 100%|██████████| 1/1 [00:00<00:00, 93.95it/s]
Generating row values: 100%|██████████| 45630/45630 [00:05<00:00, 8013.74it/s] 
Inserting contains rows: 100%|██████████| 10/10 [