In [29]:
import pandas as pd
import re
import ast

print("Starting formatted data cleaning...")

# PART 1: RECIPES
try:
    print("\n--- Processing Recipes ---")

    data = pd.read_csv("recipes.csv")
except Exception as e:
    print(f"An error occurred while processing recipes: {e}")

Starting formatted data cleaning...

--- Processing Recipes ---


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1090 non-null   int64  
 1   recipe_name   1090 non-null   object 
 2   prep_time     1039 non-null   object 
 3   cook_time     782 non-null    object 
 4   total_time    1045 non-null   object 
 5   servings      1090 non-null   int64  
 6   yield         879 non-null    object 
 7   ingredients   1090 non-null   object 
 8   directions    1090 non-null   object 
 9   rating        1090 non-null   float64
 10  url           1090 non-null   object 
 11  cuisine_path  1090 non-null   object 
 12  nutrition     1090 non-null   object 
 13  timing        1090 non-null   object 
 14  img_src       1090 non-null   object 
dtypes: float64(1), int64(2), object(12)
memory usage: 127.9+ KB


In [31]:
data = data.rename(columns={
    'recipe_name': 'Name',
    'prep_time': 'Prep Time',
    'cook_time': 'Cook Time',
    'total_time': 'Total Time',
    'servings': 'Servings',
    'ingredients': 'Ingredients',
    'directions': 'Directions',
    'rating': 'Rating',
    'url': 'URL'
})

In [32]:
def extract_calories(nutrition_str):
    if not isinstance(nutrition_str, str):
        return 0
    match = re.search(r'Calories\s+(\d+)', nutrition_str)
    if match:
        return int(match.group(1))
    return 0

if 'nutrition' in data.columns:
    data['Calories'] = data['nutrition'].apply(extract_calories)

In [33]:
data['Name'] = data['Name'].fillna("Unnamed Recipe")
data['Ingredients'] = data['Ingredients'].fillna("")
data['Directions'] = data['Directions'].fillna("")
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce').fillna(0)

In [34]:
cols_to_keep = ['Name', 'Total Time', 'Servings', 'Ingredients', 'Directions', 'Rating', 'Calories']

existing_cols = [c for c in cols_to_keep if c in data.columns]
recipes_clean = data[existing_cols]

In [35]:
print("\n[Inspection] First row example:")
print(recipes_clean.iloc[0])

recipes_clean.to_csv("recipes_cleaned.csv", index=False)
print(f"✅ Saved 'recipes_cleaned.csv' with {len(recipes_clean)} rows.")


[Inspection] First row example:
Name                                    Apple-Cranberry Crostada
Total Time                                                   NaN
Servings                                                       8
Ingredients    3 tablespoons butter, 2 pounds Granny Smith ap...
Directions     Heat butter in a large skillet over medium-hig...
Rating                                                       4.4
Calories                                                       0
Name: 0, dtype: object
✅ Saved 'recipes_cleaned.csv' with 1090 rows.


In [36]:
# PART 2: GYM TRACKING
try:
    print("\n--- Processing Gym Data ---")

    gym_data = pd.read_csv("gym_members_exercise_tracking.csv")
except Exception as e:
    print(f"An error occurred while processing gym data: {e}")


--- Processing Gym Data ---


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1090 non-null   int64  
 1   Name          1090 non-null   object 
 2   Prep Time     1039 non-null   object 
 3   Cook Time     782 non-null    object 
 4   Total Time    1045 non-null   object 
 5   Servings      1090 non-null   int64  
 6   yield         879 non-null    object 
 7   Ingredients   1090 non-null   object 
 8   Directions    1090 non-null   object 
 9   Rating        1090 non-null   float64
 10  URL           1090 non-null   object 
 11  cuisine_path  1090 non-null   object 
 12  nutrition     1090 non-null   object 
 13  timing        1090 non-null   object 
 14  img_src       1090 non-null   object 
 15  Calories      1090 non-null   int64  
dtypes: float64(1), int64(3), object(12)
memory usage: 136.4+ KB


In [38]:
gym_data['Session Duration (minutes)'] = gym_data['Session_Duration (hours)'] * 60

level_map = {1: 'Beginner', 2: 'Intermediate', 3: 'Advanced'}
gym_data['Experience Level'] = gym_data['Experience_Level'].map(level_map).fillna('Unknown')

In [39]:

gym_data = gym_data.rename(columns={
    'Age': 'Age',
    'Gender': 'Gender',
    'Weight (kg)': 'Weight',
    'Height (m)': 'Height',
    'Avg_BPM': 'Avg BPM',
    'Calories_Burned': 'Calories Burned',
    'Workout_Type': 'Workout Type',
    'BMI': 'BMI'
})

In [40]:
gym_cols = [
    'Age', 'Gender', 'Workout Type',
    'Session Duration (minutes)', 'Calories Burned',
    'Avg BPM', 'Experience Level', 'BMI'
]
gym_clean = gym_data[gym_cols]

In [41]:
print("\n[Inspection] First row example:")
print(gym_clean.iloc[0])

gym_clean.to_csv("gym_data_cleaned.csv", index=False)
print(f"✅ Saved 'gym_data_cleaned.csv' with {len(gym_clean)} rows.")



[Inspection] First row example:
Age                                 56
Gender                            Male
Workout Type                      Yoga
Session Duration (minutes)       101.4
Calories Burned                 1313.0
Avg BPM                            157
Experience Level              Advanced
BMI                               30.2
Name: 0, dtype: object
✅ Saved 'gym_data_cleaned.csv' with 973 rows.
