In [1]:
from scipy.spatial import KDTree
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Input/output locations, kept together so the save call and the final message
# cannot drift apart.
DATASET1_PATH = "data/workout_fitness_tracker_data.csv"
DATASET2_PATH = "data/health_fitness_dataset.csv"
OUTPUT_PATH = "merged_dataset_filter_by_gender3.csv"

# Load datasets
dataset1 = pd.read_csv(DATASET1_PATH)
dataset2 = pd.read_csv(DATASET2_PATH)

# Derive BMI for Dataset 1 (kg / m^2) so it can be used as a matching feature.
dataset1["BMI"] = dataset1["Weight (kg)"] / ((dataset1["Height (cm)"] / 100) ** 2)

# Standardize Gender Labels
gender_map = {"Male": "M", "Female": "F", "Other": "Other"}  # Keep "Other"
dataset1["Gender"] = dataset1["Gender"].map(gender_map)

# .map() yields NaN for any label missing from gender_map; such rows fall into
# none of the groups below, so report them instead of dropping them silently.
unmapped_gender_count = int(dataset1["Gender"].isna().sum())
if unmapped_gender_count:
    print(
        f"⚠️ {unmapped_gender_count} Dataset 1 rows have a missing or "
        "unrecognized gender label and will not be matched"
    )

# Rename Dataset 2 columns for consistency
dataset2 = dataset2.rename(columns={
    "age": "Age",
    "height_cm": "Height (cm)",
    "weight_kg": "Weight (kg)",
    "resting_heart_rate": "Resting Heart Rate (bpm)",
    "bmi": "BMI",
    "gender": "Gender"
})

# Define matching features & their weights
features = ["Age", "Height (cm)", "Weight (kg)", "Resting Heart Rate (bpm)", "BMI"]
feature_weights = np.array([1, 1, 1, 3, 3])  # More weight for Resting HR & BMI

# Columns that exist only in Dataset 2; these are what a match contributes.
# Computed once up front so every branch below can rely on it.
ds2_unique_cols = [col for col in dataset2.columns if col not in dataset1.columns]


def _match_to_nearest(ds1_rows, ds2_pool, match_features, weights, extra_cols):
    """Attach to each row of ``ds1_rows`` the ``extra_cols`` of its nearest ``ds2_pool`` row.

    The scaler is fitted on ``ds1_rows`` only and then reused for ``ds2_pool``
    so both sides share one scale; scaled features are multiplied by
    ``weights`` before the nearest-neighbour search. Each Dataset 1 row is
    queried independently, so one Dataset 2 row may be matched many times.

    Parameters
    ----------
    ds1_rows : pd.DataFrame
        Non-empty rows to find matches for.
    ds2_pool : pd.DataFrame
        Non-empty candidate rows to match against.
    match_features : list of str
        Columns, present in both frames, used to measure similarity.
    weights : np.ndarray
        One multiplier per entry of ``match_features``.
    extra_cols : list of str
        Columns of ``ds2_pool`` to copy onto the result.

    Returns
    -------
    pd.DataFrame
        ``ds1_rows`` with a fresh 0..n-1 index, plus ``extra_cols`` taken from
        the matched rows and a "Match Distance" column.
    """
    scaler = MinMaxScaler()
    ds1_scaled = scaler.fit_transform(ds1_rows[match_features]) * weights
    ds2_scaled = scaler.transform(ds2_pool[match_features]) * weights

    # KD-Tree for nearest-neighbor search; the returned indices are positions
    # within ds2_pool, hence .iloc below.
    distances, best_matches = KDTree(ds2_scaled).query(ds1_scaled, k=1)

    # Keep Dataset 1 values and add only the requested columns from Dataset 2.
    # .values strips the Dataset 2 index so rows line up by position.
    matched = ds1_rows.reset_index(drop=True)
    matched[extra_cols] = ds2_pool.iloc[best_matches][extra_cols].values
    matched["Match Distance"] = distances  # Lower = better match
    return matched


# Separate "Other" from gender-based matching
gender_groups = ["M", "F"]
merged_dataframes = []

for gender in gender_groups:  # Match "M" to "M" and "F" to "F"
    ds1_filtered = dataset1[dataset1["Gender"] == gender]
    ds2_filtered = dataset2[dataset2["Gender"] == gender]

    if ds2_filtered.empty or ds1_filtered.empty:
        print(f"⚠️ No matching users found in Dataset 2 for gender: {gender}")
        continue

    merged_dataframes.append(
        _match_to_nearest(ds1_filtered, ds2_filtered, features, feature_weights, ds2_unique_cols)
    )

# Handle "Other" separately (match based on closest features, ignoring gender)
ds1_other = dataset1[dataset1["Gender"] == "Other"]
if not ds1_other.empty and not dataset2.empty:
    # Match to all dataset2 users
    merged_dataframes.append(
        _match_to_nearest(ds1_other, dataset2, features, feature_weights, ds2_unique_cols)
    )

# Combine results. pd.concat raises an opaque ValueError on an empty list, so
# fail with a message that says what actually went wrong.
if not merged_dataframes:
    raise ValueError("No Dataset 1 rows could be matched to Dataset 2; nothing to save")
final_merged_dataset = pd.concat(merged_dataframes, ignore_index=True)

# Save final dataset
final_merged_dataset.to_csv(OUTPUT_PATH, index=False)

print(f"✅ Matching completed (including 'Other' category)! Data saved as {OUTPUT_PATH}")

✅ Matching completed (including 'Other' category)! Data saved as merged_dataset.csv


In [2]:
# Print the column labels of the merged frame.
print(final_merged_dataset.columns)

Index(['User ID', 'Age', 'Gender', 'Height (cm)', 'Weight (kg)',
       'Workout Type', 'Workout Duration (mins)', 'Calories Burned',
       'Heart Rate (bpm)', 'Steps Taken', 'Distance (km)', 'Workout Intensity',
       'Sleep Hours', 'Water Intake (liters)', 'Daily Calories Intake',
       'Resting Heart Rate (bpm)', 'VO2 Max', 'Body Fat (%)',
       'Mood Before Workout', 'Mood After Workout', 'BMI', 'participant_id',
       'date', 'activity_type', 'duration_minutes', 'intensity',
       'calories_burned', 'avg_heart_rate', 'hours_sleep', 'stress_level',
       'daily_steps', 'hydration_level', 'blood_pressure_systolic',
       'blood_pressure_diastolic', 'health_condition', 'smoking_status',
       'fitness_level', 'Match Distance'],
      dtype='object')


In [6]:

# Remove a set of Dataset 2 columns, plus the 'Match Distance' helper column
# added during matching.
# NOTE(review): presumably these overlap with measurements Dataset 1 already
# provides (e.g. 'Calories Burned', 'Steps Taken') — confirm before relying on it.
final_merged_dataset = final_merged_dataset.drop(
    columns=[
        'duration_minutes', 'calories_burned', 'avg_heart_rate',
        'daily_steps', 'Match Distance', 'hours_sleep',
        'hydration_level', 'intensity', 'participant_id', 'date', 'activity_type', 'fitness_level',
    ],
    # This cell rebinds its own input, so a second run would otherwise raise
    # KeyError on the already-removed columns; "ignore" makes a re-run a no-op.
    errors="ignore",
)

# Show the remaining column labels
print(final_merged_dataset.columns)

Index(['User ID', 'Age', 'Gender', 'Height (cm)', 'Weight (kg)',
       'Workout Type', 'Workout Duration (mins)', 'Calories Burned',
       'Heart Rate (bpm)', 'Steps Taken', 'Distance (km)', 'Workout Intensity',
       'Sleep Hours', 'Water Intake (liters)', 'Daily Calories Intake',
       'Resting Heart Rate (bpm)', 'VO2 Max', 'Body Fat (%)',
       'Mood Before Workout', 'Mood After Workout', 'BMI', 'stress_level',
       'blood_pressure_systolic', 'blood_pressure_diastolic',
       'health_condition', 'smoking_status'],
      dtype='object')


In [7]:
# Give the columns carried over from Dataset 2 the same Title Case naming as
# the Dataset 1 columns. Reassigning (rather than inplace=True) follows the
# same rebind pattern as the drop step; rename skips labels that are absent,
# so re-running this cell is harmless.
final_merged_dataset = final_merged_dataset.rename(columns={
    'stress_level': 'Stress Level',
    'blood_pressure_systolic': 'Blood Pressure Systolic',
    'blood_pressure_diastolic': 'Blood Pressure Diastolic',
    'health_condition': 'Health Condition',
    'smoking_status': 'Smoking Status'
})

# Show the renamed column labels
print(final_merged_dataset.columns)

Index(['User ID', 'Age', 'Gender', 'Height (cm)', 'Weight (kg)',
       'Workout Type', 'Workout Duration (mins)', 'Calories Burned',
       'Heart Rate (bpm)', 'Steps Taken', 'Distance (km)', 'Workout Intensity',
       'Sleep Hours', 'Water Intake (liters)', 'Daily Calories Intake',
       'Resting Heart Rate (bpm)', 'VO2 Max', 'Body Fat (%)',
       'Mood Before Workout', 'Mood After Workout', 'BMI', 'Stress Level',
       'Blood Pressure Systolic', 'Blood Pressure Diastolic',
       'Health Condition', 'Smoking Status'],
      dtype='object')
