# Gym Personal Training Plan Recommender

## Exploratory Data Analysis (EDA) & Preprocessing

### Imports

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import joblib
from tensorflow import keras
from tensorflow.keras import layers

### Load Datasets

In [104]:
# Locations of the source CSV files (relative to the notebook's working directory)
MEMBERS_CSV = "./data/gym_members_exercise_tracking_synthetic_data.csv"
PROGRAM_SUMMARY_CSV = "./data/program_summary.csv"
PROGRAM_DETAILS_CSV = "./data/programs_detailed_boostcamp_kaggle.csv"

# Gym Members Dataset
members_df = pd.read_csv(MEMBERS_CSV)

# Program Datasets (summary table first, then the detailed per-exercise table)
programs_df, exercise_df = (
    pd.read_csv(csv_path) for csv_path in (PROGRAM_SUMMARY_CSV, PROGRAM_DETAILS_CSV)
)

### Data Analysis

In [105]:
# Explore Gym Members Dataset: shape, column names, dtypes, missing values, sample rows
print("\n********** Gym Members Dataset **********")

print("\nShape:")
print(members_df.shape)

print("\nColumns:")
print(members_df.columns.to_list())

print("\nInfo:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
members_df.info()

print("\nCheck missing values:")
print(members_df.isna().sum())

print("\nSample rows:")
display(members_df.head())
display(members_df.tail())


********** Gym Members Dataset **********

Shape:
(1800, 15)

Columns:
['Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Workout_Type', 'Fat_Percentage', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            1790 non-null   float64
 1   Gender                         1729 non-null   object 
 2   Weight (kg)                    1778 non-null   float64
 3   Height (m)                     1774 non-null   float64
 4   Max_BPM                        1779 non-null   object 
 5   Avg_BPM                        1770 non-null   float64
 6   Resting_BPM                    1781 non-null   float64
 7   Session_Duration (h

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,34.0,Female,86.7,1.86,174,152.0,74.0,1.12,712.0,Strength,12.8,2.4,5.0,2.0,14.31
1,26.0,Female,84.7,1.83,166,156.0,73.0,1.0,833.0,Strength,27.9,2.8,5.0,2.0,33.49
2,22.0,Male,64.8,1.85,187,166.0,64.0,1.24,1678.0,Cardio,28.7,1.9,3.0,2.0,12.73
3,54.0,Female,75.3,1.82,187,169.0,58.0,1.45,628.0,Cardio,31.8,2.4,4.0,1.0,20.37
4,34.0,Female,52.8,1.74,177,169.0,66.0,1.6,1286.0,Strength,26.4,3.2,4.0,2.0,20.83


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
1795,54.0,Male,88.5,2.0,173,134.0,58.0,1.11,1388.0,HIIT,27.7,3.7,3.0,2.0,36.73
1796,52.0,Male,84.3,1.69,164,169.0,54.0,0.77,1367.0,HIIT,32.6,2.9,3.0,2.0,15.11
1797,47.0,Male,70.1,1.84,188,129.0,67.0,1.2,1261.0,Strength,28.4,2.5,3.0,2.0,17.99
1798,35.0,Male,49.3,1.71,180,152.0,73.0,1.04,956.0,Cardio,32.9,1.7,4.0,3.0,12.65
1799,40.0,Male,79.5,1.92,184,156.0,52.0,1.29,1178.0,Strength,14.7,1.5,4.0,1.0,23.51


In [106]:
# Explore Program Dataset: shape, column names, dtypes, missing values, sample rows
print("\n********** Program Dataset **********")

print("\nShape:")
print(programs_df.shape)

print("\nColumns:")
print(programs_df.columns.to_list())

print("\nInfo:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
programs_df.info()

print("\nCheck missing values:")
print(programs_df.isna().sum())

print("\nSample rows:")
display(programs_df.head())
display(programs_df.tail())


********** Program Dataset **********

Shape:
(2598, 10)

Columns:
['title', 'description', 'level', 'goal', 'equipment', 'program_length', 'time_per_workout', 'total_exercises', 'created', 'last_edit']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2598 entries, 0 to 2597
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2598 non-null   object 
 1   description       2594 non-null   object 
 2   level             2598 non-null   object 
 3   goal              2598 non-null   object 
 4   equipment         2597 non-null   object 
 5   program_length    2597 non-null   float64
 6   time_per_workout  2598 non-null   float64
 7   total_exercises   2598 non-null   int64  
 8   created           2597 non-null   object 
 9   last_edit         2596 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 203.1+ KB
None

Check missing values:
title               0
desc

Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises,created,last_edit
0,(MASS MONSTER) High Intensity 4 Day Upper Lowe...,Build tones of muscular with this high intensi...,['Intermediate'],"['Muscle & Sculpting', 'Bodyweight Fitness']",Full Gym,12.0,90.0,384,2024-01-20 10:23:00,2025-06-29 12:39:00
1,(NOT MY PROGRAM)SHJ Jotaro,Build strength and size,"['Advanced', 'Intermediate']",['Bodybuilding'],Full Gym,8.0,60.0,224,2024-07-08 02:28:00,2025-06-18 09:15:00
2,1 PowerLift Per Day Powerbuilding 5 Day Bro Split,Based off of Andy Baker's KCS (Kingwood Streng...,"['Beginner', 'Novice', 'Intermediate']","['Athletics', 'Powerlifting', 'Powerbuilding']",Full Gym,6.0,90.0,237,2025-04-23 09:21:00,2025-06-18 11:55:00
3,10 Week Mass Building Program,This workout is designed to increase your musc...,"['Intermediate', 'Advanced']",['Powerbuilding'],Garage Gym,10.0,70.0,280,2024-09-07 03:44:00,2025-06-18 08:01:00
4,10 week deadlift focus,Increase deadlift,"['Intermediate', 'Advanced']","['Powerbuilding', 'Powerlifting', 'Bodybuildin...",Full Gym,10.0,80.0,356,2024-12-23 03:13:00,2025-06-18 12:19:00


Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises,created,last_edit
2593,🎧,Lihaskasvu,"['Intermediate', 'Advanced']",['Bodybuilding'],Garage Gym,12.0,90.0,228,2024-10-10 04:20:00,2025-06-18 11:32:00
2594,👾Reza's Routine👾,This is a beginner friendly routine made for m...,"['Beginner', 'Intermediate']",['Muscle & Sculpting'],Dumbbell Only,1.0,60.0,60,2024-09-15 08:45:00,2025-06-18 07:48:00
2595,"🔥 ""Upper Body Dominance: 3-Day Growth System"" 🔥","""Upper Body Dominance: A science-based 3-day w...","['Intermediate', 'Novice']",['Muscle & Sculpting'],Full Gym,6.0,60.0,96,2025-02-15 08:18:00,2025-06-18 07:48:00
2596,🙈🙉🙊🐵,Muscle Memory Training,['Intermediate'],['Bodybuilding'],Full Gym,8.0,90.0,211,2024-12-08 01:04:00,2025-06-18 11:35:00
2597,🥷🥷🥷,To become stronger without becoming “bulky”,"['Intermediate', 'Novice']","['Bodybuilding', 'Powerbuilding']",Garage Gym,9.0,100.0,216,2025-05-15 10:44:00,2025-06-18 12:08:00


### Preprocessing

##### Drop unnecessary columns

In [107]:
# Columns removed from each dataset before further preprocessing.
member_cols_to_drop = ['BMI', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Calories_Burned', 'Water_Intake (liters)', 'Fat_Percentage']
program_cols_to_drop = ['created', 'last_edit']

# This cell rebinds members_df / programs_df, so on a second run the columns are
# already gone. errors='ignore' keeps the cell re-runnable instead of raising KeyError.
# NOTE: the flip side is that a misspelled column name is skipped silently.
members_df = members_df.drop(columns=member_cols_to_drop, errors='ignore')
print("\nGym members sample rows:")
display(members_df.head())

programs_df = programs_df.drop(columns=program_cols_to_drop, errors='ignore')
print("\nProgram sample rows:")
display(programs_df.head())


Gym members sample rows:


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Session_Duration (hours),Workout_Type,Workout_Frequency (days/week),Experience_Level
0,34.0,Female,86.7,1.86,1.12,Strength,5.0,2.0
1,26.0,Female,84.7,1.83,1.0,Strength,5.0,2.0
2,22.0,Male,64.8,1.85,1.24,Cardio,3.0,2.0
3,54.0,Female,75.3,1.82,1.45,Cardio,4.0,1.0
4,34.0,Female,52.8,1.74,1.6,Strength,4.0,2.0



Program sample rows:


Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises
0,(MASS MONSTER) High Intensity 4 Day Upper Lowe...,Build tones of muscular with this high intensi...,['Intermediate'],"['Muscle & Sculpting', 'Bodyweight Fitness']",Full Gym,12.0,90.0,384
1,(NOT MY PROGRAM)SHJ Jotaro,Build strength and size,"['Advanced', 'Intermediate']",['Bodybuilding'],Full Gym,8.0,60.0,224
2,1 PowerLift Per Day Powerbuilding 5 Day Bro Split,Based off of Andy Baker's KCS (Kingwood Streng...,"['Beginner', 'Novice', 'Intermediate']","['Athletics', 'Powerlifting', 'Powerbuilding']",Full Gym,6.0,90.0,237
3,10 Week Mass Building Program,This workout is designed to increase your musc...,"['Intermediate', 'Advanced']",['Powerbuilding'],Garage Gym,10.0,70.0,280
4,10 week deadlift focus,Increase deadlift,"['Intermediate', 'Advanced']","['Powerbuilding', 'Powerlifting', 'Bodybuildin...",Full Gym,10.0,80.0,356


##### Convert string representation of lists into Python lists

In [108]:
import ast

def _parse_list_literal(value):
    """Evaluate a string as a Python literal (e.g. "['A', 'B']"); return any non-string value unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value

# 'level' and 'goal' are stored in the CSV as the text form of Python lists
for list_col in ('level', 'goal'):
    programs_df[list_col] = programs_df[list_col].apply(_parse_list_literal)

##### Handle missing values

In [109]:
# Gym Members Dataset
# Numeric columns are filled with the column median, categorical ones with the mode.
numeric_cols = ['Age', 'Weight (kg)', 'Height (m)', 'Session_Duration (hours)', 'Workout_Frequency (days/week)', 'Experience_Level']
categorical_cols = ['Gender', 'Workout_Type']

for impute_strategy, impute_cols in (('median', numeric_cols), ('most_frequent', categorical_cols)):
    imputer = SimpleImputer(strategy=impute_strategy)
    members_df[impute_cols] = imputer.fit_transform(members_df[impute_cols])

# Every column should now report zero missing values
print("\nCheck missing values:")
print(members_df.isna().sum())


Check missing values:
Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Session_Duration (hours)         0
Workout_Type                     0
Workout_Frequency (days/week)    0
Experience_Level                 0
dtype: int64


In [110]:
# Programs Dataset
# Missing program lengths are filled with the column median.
numeric_cols = ['program_length']
imputer = SimpleImputer(strategy='median')
programs_df[numeric_cols] = imputer.fit_transform(programs_df[numeric_cols])

# 'description' is free text rather than a category: filling its gaps with the most
# frequent value would attach another program's description to these rows.
# Blank them instead, so the mode imputer below leaves the column untouched.
programs_df['description'] = programs_df['description'].fillna('')

# 'equipment' is a genuine category, so the most frequent value is used for its gaps.
categorical_cols = ['description', 'equipment']
imputer = SimpleImputer(strategy='most_frequent')
programs_df[categorical_cols] = imputer.fit_transform(programs_df[categorical_cols])

# Every column should now report zero missing values
print("\nCheck missing values:")
print(programs_df.isna().sum())


Check missing values:
title               0
description         0
level               0
goal                0
equipment           0
program_length      0
time_per_workout    0
total_exercises     0
dtype: int64


In [111]:
def _is_empty_list(value):
    """Return True only when value is a list with no elements."""
    return isinstance(value, list) and len(value) == 0

# Boolean masks, computed once and reused below
level_is_empty = programs_df['level'].apply(_is_empty_list)
goal_is_empty = programs_df['goal'].apply(_is_empty_list)

# Rows whose 'level' column holds an empty list
empty_level_rows = programs_df[level_is_empty]
display(empty_level_rows)

# Rows whose 'goal' column holds an empty list
empty_goal_rows = programs_df[goal_is_empty]
display(empty_goal_rows)

# A row is discarded when either of the two columns is empty
rows_to_drop = programs_df.loc[level_is_empty | goal_is_empty].index
print("\nRows to drop:")
print(rows_to_drop)

# Remove those rows and report the resulting shape
programs_df = programs_df.drop(rows_to_drop)
print("\nPrograms dataset shape:")
print(programs_df.shape)

Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises
1394,Lyle McDonald Routine (Strength/Hypertrophy Vers),IMPORTANT: THIS IS NOT THE ORIGINAL LYLE MCDON...,[],[],Full Gym,12.0,90.0,1152
1423,Mania (Upper/Lower),Mania is a 4 day per week training program tha...,[],[],Full Gym,12.0,90.0,288
1520,Monster Bench,Increase your 1RM on barbell bench.,[],[],Full Gym,4.0,60.0,104
1915,Reese Training Program,This is Reese's premium Whaley gym Training pr...,[],[],Dumbbell Only,16.0,50.0,144
1952,Rugby forward off season training,The purpose of this program is to build a soli...,[],[],Full Gym,12.0,90.0,388
2172,Swole 5 (Advanced),Same as intermediate but with more revocery in...,[],[],Garage Gym,12.0,80.0,144
2321,Treino do Josu,Hipertrofia,[],[],Full Gym,1.0,60.0,26
2423,Viking strong,Gain functional strength and hypertrophy,[],[],Full Gym,12.0,60.0,208


Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises
1394,Lyle McDonald Routine (Strength/Hypertrophy Vers),IMPORTANT: THIS IS NOT THE ORIGINAL LYLE MCDON...,[],[],Full Gym,12.0,90.0,1152
1423,Mania (Upper/Lower),Mania is a 4 day per week training program tha...,[],[],Full Gym,12.0,90.0,288
1520,Monster Bench,Increase your 1RM on barbell bench.,[],[],Full Gym,4.0,60.0,104
1915,Reese Training Program,This is Reese's premium Whaley gym Training pr...,[],[],Dumbbell Only,16.0,50.0,144
1952,Rugby forward off season training,The purpose of this program is to build a soli...,[],[],Full Gym,12.0,90.0,388
2172,Swole 5 (Advanced),Same as intermediate but with more revocery in...,[],[],Garage Gym,12.0,80.0,144
2321,Treino do Josu,Hipertrofia,[],[],Full Gym,1.0,60.0,26
2423,Viking strong,Gain functional strength and hypertrophy,[],[],Full Gym,12.0,60.0,208



Rows to drop:
Index([1394, 1423, 1520, 1915, 1952, 2172, 2321, 2423], dtype='int64')

Programs dataset shape:
(2590, 8)


##### Preparing columns for merging

In [112]:
import ast

def normalize_words(words_list):
    """Return the given words lower-cased and stripped of surrounding whitespace.

    Parameters
    ----------
    words_list : list | tuple | str | None
        Either an iterable of strings, or a string. A string is first parsed as a
        Python literal (e.g. "['Beginner', 'Novice']"); if it is not a valid
        literal, or parses to a single string, it is treated as one word.
        None, NaN and blank strings are treated as "no words".

    Returns
    -------
    list[str]
        Normalized words, in their original order.
    """
    # Missing values: None, or the float NaN pandas uses (NaN is the only float != itself).
    if words_list is None or (isinstance(words_list, float) and words_list != words_list):
        return []

    if isinstance(words_list, str):
        text = words_list.strip()
        if not text:
            return []
        try:
            words_list = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a Python literal, e.g. a bare label such as "Advanced".
            words_list = [text]

    # A quoted scalar such as "'Advanced'" parses to a plain str; wrap it so the
    # comprehension below does not iterate over its individual characters.
    if isinstance(words_list, str):
        words_list = [words_list]

    return [g.lower().strip() for g in words_list]

# Convert the numeric experience codes to labels: 1 -> beginner, 2 -> intermediate, 3 -> advanced.
# Series.map leaves NaN in 'exp_level_match' for any value that is not one of these keys.
experience_labels = {
    1: 'beginner',
    2: 'intermediate',
    3: 'advanced'
}
members_df['exp_level_match'] = members_df['Experience_Level'].map(experience_labels)

# Rank of each program level label; 'novice' and 'beginner' share the lowest rank.
level_order = dict(novice=0, beginner=0, intermediate=1, advanced=2)

# Rank -> canonical label (rank 0 is reported as 'beginner').
inverse_level_order = dict(enumerate(['beginner', 'intermediate', 'advanced']))

def map_exp_levels_to_single_level(level_list):
    """Collapse a program's level labels into the single highest-ranked label.

    The labels are normalized with normalize_words, ranked through level_order
    (labels missing from level_order count as rank 0), and the highest rank is
    translated back to a label via inverse_level_order.
    Raises ValueError when there are no labels, because max() has nothing to pick.
    """
    highest_rank = max(
        level_order.get(label, 0) for label in normalize_words(level_list)
    )
    return inverse_level_order[highest_rank]

# Reduce each program's level list to one label, e.g.
# ['Intermediate', 'Advanced'] -> advanced, ['Beginner', 'Novice', 'Intermediate'] -> intermediate
programs_df['level_mapped'] = programs_df['level'].apply(map_exp_levels_to_single_level)

# Show a sample and the missing-value counts for both datasets
for section_title, section_df in (
    ("\nGym members sample rows:", members_df),
    ("\nProgram sample rows:", programs_df),
):
    print(section_title)
    display(section_df.head())
    print(section_df.isna().sum())


Gym members sample rows:


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Session_Duration (hours),Workout_Type,Workout_Frequency (days/week),Experience_Level,exp_level_match
0,34.0,Female,86.7,1.86,1.12,Strength,5.0,2.0,intermediate
1,26.0,Female,84.7,1.83,1.0,Strength,5.0,2.0,intermediate
2,22.0,Male,64.8,1.85,1.24,Cardio,3.0,2.0,intermediate
3,54.0,Female,75.3,1.82,1.45,Cardio,4.0,1.0,beginner
4,34.0,Female,52.8,1.74,1.6,Strength,4.0,2.0,intermediate


Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Session_Duration (hours)         0
Workout_Type                     0
Workout_Frequency (days/week)    0
Experience_Level                 0
exp_level_match                  0
dtype: int64

Program sample rows:


Unnamed: 0,title,description,level,goal,equipment,program_length,time_per_workout,total_exercises,level_mapped
0,(MASS MONSTER) High Intensity 4 Day Upper Lowe...,Build tones of muscular with this high intensi...,[Intermediate],"[Muscle & Sculpting, Bodyweight Fitness]",Full Gym,12.0,90.0,384,intermediate
1,(NOT MY PROGRAM)SHJ Jotaro,Build strength and size,"[Advanced, Intermediate]",[Bodybuilding],Full Gym,8.0,60.0,224,advanced
2,1 PowerLift Per Day Powerbuilding 5 Day Bro Split,Based off of Andy Baker's KCS (Kingwood Streng...,"[Beginner, Novice, Intermediate]","[Athletics, Powerlifting, Powerbuilding]",Full Gym,6.0,90.0,237,intermediate
3,10 Week Mass Building Program,This workout is designed to increase your musc...,"[Intermediate, Advanced]",[Powerbuilding],Garage Gym,10.0,70.0,280,advanced
4,10 week deadlift focus,Increase deadlift,"[Intermediate, Advanced]","[Powerbuilding, Powerlifting, Bodybuilding, Mu...",Full Gym,10.0,80.0,356,advanced


title               0
description         0
level               0
goal                0
equipment           0
program_length      0
time_per_workout    0
total_exercises     0
level_mapped        0
dtype: int64


##### Merge datasets

In [121]:
# Merge datasets into one dataset: each member row is joined to every program whose
# 'level_mapped' equals the member's 'exp_level_match' (left join on the member side)
join_options = dict(left_on='exp_level_match', right_on='level_mapped', how='left')
merged_df = members_df.merge(programs_df, **join_options)

# Drop unnecessary (duplicated) columns: the level is already carried by 'exp_level_match'
redundant_cols = ['Experience_Level', 'level', 'level_mapped']
merged_df = merged_df.drop(columns=redundant_cols)

# Summarize the merged frame: shape, column names, dtypes, missing values, sample rows
print("\nShape:")
print(merged_df.shape)

print("\nColumns:")
print(merged_df.columns.to_list())

print("\nInfo:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
merged_df.info()

print("\nCheck missing values:")
print(merged_df.isna().sum())

print("\nSample rows:")
display(merged_df.head())



def _list_to_tuple(value):
    """Return a tuple copy of a list; return any other value unchanged."""
    return tuple(value) if isinstance(value, list) else value

# Check for duplicates across all columns
# 'goal' holds lists, so convert them to tuples for the duplicate check
merged_df['goal'] = merged_df['goal'].apply(_list_to_tuple)

duplicate_mask = merged_df.duplicated()
duplicates = merged_df[duplicate_mask]
print(f"Number of duplicate rows: {duplicates.shape[0]}")

# Distinct program titles available per experience level
programs_per_level = merged_df.groupby('exp_level_match')['title'].nunique()
print("\nCount how many unique programs (based on 'title' column) are available for each experience level:")
display(programs_per_level.to_frame())

# value_counts() on the merged frame counts rows, and the merge repeats each member
# once per matching program, so these are member-program pairs, not distinct users.
rows_per_level = merged_df['exp_level_match'].value_counts()
program_count = rows_per_level  # keep the original variable name bound to the same result
print("\nCount how many rows (member-program pairs) belong to each experience level in merged dataset:")
display(rows_per_level.to_frame())


Shape:
(1594532, 15)

Columns:
['Age', 'Gender', 'Weight (kg)', 'Height (m)', 'Session_Duration (hours)', 'Workout_Type', 'Workout_Frequency (days/week)', 'exp_level_match', 'title', 'description', 'goal', 'equipment', 'program_length', 'time_per_workout', 'total_exercises']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594532 entries, 0 to 1594531
Data columns (total 15 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   Age                            1594532 non-null  float64
 1   Gender                         1594532 non-null  object 
 2   Weight (kg)                    1594532 non-null  float64
 3   Height (m)                     1594532 non-null  float64
 4   Session_Duration (hours)       1594532 non-null  float64
 5   Workout_Type                   1594532 non-null  object 
 6   Workout_Frequency (days/week)  1594532 non-null  float64
 7   exp_level_match                1594532 n

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Session_Duration (hours),Workout_Type,Workout_Frequency (days/week),exp_level_match,title,description,goal,equipment,program_length,time_per_workout,total_exercises
0,34.0,Female,86.7,1.86,1.12,Strength,5.0,intermediate,(MASS MONSTER) High Intensity 4 Day Upper Lowe...,Build tones of muscular with this high intensi...,"[Muscle & Sculpting, Bodyweight Fitness]",Full Gym,12.0,90.0,384
1,34.0,Female,86.7,1.86,1.12,Strength,5.0,intermediate,1 PowerLift Per Day Powerbuilding 5 Day Bro Split,Based off of Andy Baker's KCS (Kingwood Streng...,"[Athletics, Powerlifting, Powerbuilding]",Full Gym,6.0,90.0,237
2,34.0,Female,86.7,1.86,1.12,Strength,5.0,intermediate,100 Push Ups in 6 Weeks,3 day a week push up program gradually increas...,"[Bodybuilding, Muscle & Sculpting]",At Home,6.0,10.0,18
3,34.0,Female,86.7,1.86,1.12,Strength,5.0,intermediate,1000 lbs Club,To hit 1000lbs!,[Bodybuilding],Full Gym,12.0,60.0,324
4,34.0,Female,86.7,1.86,1.12,Strength,5.0,intermediate,100x Push Up’s / Day,100x push up’s per day for 12 weeks,[Bodybuilding],At Home,12.0,20.0,84


Number of duplicate rows: 0

Count how many unique programs (based on 'title' column) are available for each experience level:


Unnamed: 0_level_0,title
exp_level_match,Unnamed: 1_level_1
advanced,848
beginner,550
intermediate,1192



Count how many users belong to each experience level in merged dataset:


Unnamed: 0_level_0,count
exp_level_match,Unnamed: 1_level_1
intermediate,923800
beginner,366300
advanced,304432
