In [1]:
import pandas as pd

# Read the two source CSV files into DataFrames.
health_fitness_df = pd.read_csv('DataSets/health_fitness_dataset.csv')
workout_fitness_df = pd.read_csv('DataSets/workout_fitness_tracker_data.csv')

# Give the health dataset's key and body-measurement columns the labels
# 'User ID', 'Height (cm)' and 'Weight (kg)' so the join key matches.
health_fitness_df = health_fitness_df.rename(
    columns={
        'participant_id': 'User ID',
        'height_cm': 'Height (cm)',
        'weight_kg': 'Weight (kg)',
    }
)

# Inner join on 'User ID'. Non-key columns that exist in both frames get the
# default pandas suffixes: '_x' for the health (left) frame and '_y' for the
# workout (right) frame.
merged_df = health_fitness_df.merge(workout_fitness_df, how='inner', on='User ID')

# Preview the first rows of the merged result.
print(merged_df.head())

   User ID        date  age gender  Height (cm)_x  Weight (kg)_x  \
0        1  2024-01-01   56      F          165.3           53.7   
1        1  2024-01-04   56      F          165.3           53.9   
2        1  2024-01-05   56      F          165.3           54.2   
3        1  2024-01-07   56      F          165.3           54.4   
4        1  2024-01-09   56      F          165.3           54.7   

     activity_type  duration_minutes intensity  calories_burned  ...  \
0          Dancing                41       Low              3.3  ...   
1         Swimming                28       Low              2.9  ...   
2         Swimming                21    Medium              2.6  ...   
3  Weight Training                99    Medium             10.7  ...   
4         Swimming               100    Medium             12.7  ...   

   Distance (km)  Workout Intensity  Sleep Hours  Water Intake (liters)  \
0          14.44               High          8.2                    1.9   
1       

In [2]:
# Count the NaN / None entries in every column of the merged frame.
missing_per_column = merged_df.isna().sum()
print(missing_per_column)

User ID                          0
date                             0
age                              0
gender                           0
Height (cm)_x                    0
Weight (kg)_x                    0
activity_type                    0
duration_minutes                 0
intensity                        0
calories_burned                  0
avg_heart_rate                   0
hours_sleep                      0
stress_level                     0
daily_steps                      0
hydration_level                  0
bmi                              0
resting_heart_rate               0
blood_pressure_systolic          0
blood_pressure_diastolic         0
health_condition            490275
smoking_status                   0
fitness_level                    0
Age                              0
Gender                           0
Height (cm)_y                    0
Weight (kg)_y                    0
Workout Type                     0
Workout Duration (mins)          0
Calories Burned     

In [3]:
# Map each numeric target column to the column it is derived from.
# 'Weight (kg)' and 'Height (cm)' are new columns built from the '_x'
# (health dataset) variants; 'Age' is converted in place.
numeric_sources = {
    'Age': 'Age',
    'Weight (kg)': 'Weight (kg)_x',
    'Height (cm)': 'Height (cm)_x',
}

# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising.
for target_col, source_col in numeric_sources.items():
    merged_df[target_col] = pd.to_numeric(merged_df[source_col], errors='coerce')

# Show the resulting dtype of every column.
print(merged_df.dtypes)

User ID                       int64
date                         object
age                           int64
gender                       object
Height (cm)_x               float64
Weight (kg)_x               float64
activity_type                object
duration_minutes              int64
intensity                    object
calories_burned             float64
avg_heart_rate                int64
hours_sleep                 float64
stress_level                  int64
daily_steps                   int64
hydration_level             float64
bmi                         float64
resting_heart_rate          float64
blood_pressure_systolic     float64
blood_pressure_diastolic    float64
health_condition             object
smoking_status               object
fitness_level               float64
Age                           int64
Gender                       object
Height (cm)_y                 int64
Weight (kg)_y                 int64
Workout Type                 object
Workout Duration (mins)     

In [4]:
# Compute BMI from the measurements: BMI = weight (kg) / (height (m)) ** 2.
# Heights that are zero or negative are masked to NaN before dividing.
# Otherwise a zero height would produce an infinite BMI (which fillna below
# would copy into 'bmi', since inf is not NaN) and a negative height would
# produce a plausible-looking but meaningless value.
height_m = merged_df['Height (cm)'].where(merged_df['Height (cm)'] > 0) / 100
merged_df['bmi_calculated'] = merged_df['Weight (kg)'] / height_m ** 2

# Use the calculated value only where the recorded 'bmi' is missing;
# existing 'bmi' values are left untouched.
merged_df['bmi'] = merged_df['bmi'].fillna(merged_df['bmi_calculated'])

# Raise the pandas row-display limit (this is a global option and stays in
# effect for later cells). Adjust 100 to suit the dataset.
pd.set_option('display.max_rows', 100)

# Show recorded and calculated BMI side by side with their inputs.
print(merged_df[['User ID', 'Weight (kg)', 'Height (cm)', 'bmi', 'bmi_calculated']])



        User ID  Weight (kg)  Height (cm)   bmi  bmi_calculated
0             1         53.7        165.3  19.6       19.652988
1             1         53.9        165.3  19.6       19.726183
2             1         54.2        165.3  19.6       19.835976
3             1         54.4        165.3  19.6       19.909172
4             1         54.7        165.3  19.6       20.018965
...         ...          ...          ...   ...             ...
687696     3000        112.9        165.7  20.7       41.119604
687697     3000        113.1        165.7  20.7       41.192447
687698     3000        113.4        165.7  20.7       41.301710
687699     3000        113.6        165.7  20.7       41.374553
687700     3000        113.9        165.7  20.7       41.483817

[687701 rows x 5 columns]
