In [2]:
import os
import pandas as pd
import polars as pl

In [3]:
def load_data_pandas(data_folder_path="data"):
    # Load the interactions and recipes
    interactions_df = pd.read_csv(os.path.join(data_folder_path, "RAW_interactions.csv")) 
    recipes_df = pd.read_csv(os.path.join(data_folder_path, "RAW_recipes.csv"))
    
    # Load the ingredient mapping
    ingredient_mapping = pd.read_pickle(os.path.join(data_folder_path, "ingr_map.pkl"))
    
    # Return each
    return interactions_df, recipes_df, ingredient_mapping

def load_data_polars(data_folder_path="data"):
    # Load the interactions and recipes
    interactions_df = pl.read_csv(os.path.join(data_folder_path, "RAW_interactions.csv")) 
    recipes_df = pl.read_csv(os.path.join(data_folder_path, "RAW_recipes.csv"))
    
    # Load the ingredient mapping
    ingredient_mapping = pd.read_pickle(os.path.join(data_folder_path, "ingr_map.pkl"))
    
    # Return each
    return interactions_df, recipes_df, ingredient_mapping

In [5]:
interactions_df, recipes_df, ingredient_mapping = load_data_polars(data_folder_path=os.path.join("..", "data"))

In [6]:
display(interactions_df.head())

user_id,recipe_id,date,rating,review
i64,i64,str,i64,str
38094,40893,"""2003-02-17""",4,"""Great with a s…"
1293707,40893,"""2011-12-21""",5,"""So simple, so …"
8937,44394,"""2002-12-01""",4,"""This worked ve…"
126440,85009,"""2010-02-27""",5,"""I made the Mex…"
57222,85009,"""2011-10-01""",5,"""Made the chedd…"


In [7]:
display(recipes_df.head())

name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
str,i64,i64,i64,str,str,str,i64,str,str,str,i64
"""arriba baked…",137739,55,47892,"""2005-09-16""","""['60-minutes-o…","""[51.5, 0.0, 13…",11,"""['make a choic…","""autumn is my f…","""['winter squas…",7
"""a bit differen…",31490,30,26278,"""2002-06-17""","""['30-minutes-o…","""[173.4, 18.0, …",9,"""['preheat oven…","""this recipe ca…","""['prepared piz…",6
"""all in the kit…",112140,130,196586,"""2005-02-25""","""['time-to-make…","""[269.8, 22.0, …",6,"""['brown ground…","""this modified …","""['ground beef'…",13
"""alouette pota…",59389,45,68585,"""2003-04-14""","""['60-minutes-o…","""[368.1, 17.0, …",11,"""['place potato…","""this is a supe…","""['spreadable c…",11
"""amish tomato …",44061,190,41706,"""2002-10-25""","""['weeknight', …","""[352.9, 1.0, 3…",5,"""['mix all ingr…","""my dh's amish …","""['tomato juice…",8


In [8]:
display(ingredient_mapping[ingredient_mapping["raw_ingr"] == "sausage patty"]['id'])

1284    6324
Name: id, dtype: int16

## Combine the data

In [14]:
# Merge the interactions and recipes dataframes with polars
combined_df = interactions_df.join(recipes_df, left_on='recipe_id', right_on="id")
combined_df = combined_df[['user_id', 'recipe_id', 'minutes', 'nutrition', 'n_steps', 'ingredients', 'rating']]
combined_df.head()
#combined_df = pd.merge(interactions_df, recipes_df, left_on='recipe_id', right_on="id")

user_id,recipe_id,minutes,nutrition,n_steps,ingredients,rating
i64,i64,i64,str,i64,str,i64
38094,40893,495,"""[204.8, 5.0, 9…",4,"""['great northe…",4
1293707,40893,495,"""[204.8, 5.0, 9…",4,"""['great northe…",5
8937,44394,20,"""[132.3, 11.0, …",5,"""[""devil's food…",4
126440,85009,10,"""[2786.2, 342.0…",3,"""['mayonnaise',…",5
57222,85009,10,"""[2786.2, 342.0…",3,"""['mayonnaise',…",5


## Process the data

In [27]:
processed_df = combined_df
new_columns = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']

# Split the nutrition column into 6 columns
processed_df = processed_df.with_columns([pl.col("nutrition").str.split(",").alias('nutrition-split')]).explode('nutrition-split').with_columns(
        ("string_" + pl.arange(0, pl.count()).cast(pl.Utf8).str.zfill(2))
        .alias("col_nm")
    ).pivot(
        index=['user_id', 'recipe_id', 'minutes', 'n_steps', 'ingredients', 'rating'],
        values='nutrition-split',
        columns='col_nm',
    )
#processed_df[new_columns] = combined_df['nutrition'].str.split(',', expand=True)
#processed_df = processed_df.drop(columns=['nutrition'])
# Remove the [ from calories and ] from carbohydrates
#processed_df['calories'] = processed_df['calories'].str.replace("[", "")
#processed_df['carbohydrates'] = processed_df['carbohydrates'].str.replace(']', '')

# Replace the ingredient names with the ingredient ids
#processed_df['ingredients'] = processed_df['ingredients']
#processed_df['ingredients'] = processed_df['ingredients'].apply(lambda x: [ingredient_mapping[ingredient_mapping["raw_ingr"] == ingredient]["id"] for ingredient in x])

# Show the results
processed_df.head()

  processed_df = processed_df.with_columns([pl.col("nutrition").str.split(",").alias('nutrition-split')]).explode('nutrition-split').with_column(


: 

: 

## Split the data