In [9]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv('gym_members_exercise_tracking.csv')
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [13]:
gender_map = {'Male':0, 'Female': 1}

df['Gender'] = df['Gender'].map(gender_map)
df['Gender'].unique()

array([0, 1], dtype=int64)

In [14]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output= False)

encoded = ['Workout_Type']
encodedArr = encoder.fit_transform(df[encoded])

# Convert to DataFrame with proper column names
encoded_df = pd.DataFrame(encodedArr, columns=encoder.get_feature_names_out(encoded))

# Concatenate the encoded columns with the original DataFrame (excluding the originals)
df = pd.concat([df.drop(columns=encoded), encoded_df], axis=1)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop('Calories_Burned', axis=1)
y = df['Calories_Burned']

# 1. Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Scale X only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
from sklearn.linear_model import LinearRegression

logist = LinearRegression()

logist.fit(X_train_scaled, y_train)

In [17]:
# Predicting
y_pred = logist.predict(X_test_scaled)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("The MSE   {}".format(mse))
print("The RMAE  {}".format(np.sqrt(mse)))
print("The MAE   {}".format(mae))
print("The R2    {}".format(mae))

The MSE   1646.1760145970627
The RMAE  40.57309471308619
The MAE   30.27013984532089
The R2    30.27013984532089


K-fold

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np

# Build pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

# Use KFold for regression
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation for MSE
mse_scores = -cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')

# Cross-validation for R^2
r2_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')

# Calculate RMSE
rmse_scores = np.sqrt(mse_scores)

# Output
print("Cross-Validation MSE Scores:", mse_scores)
print("Average CV MSE:", mse_scores.mean())

print("\nCross-Validation RMSE Scores:", rmse_scores)
print("Average CV RMSE:", rmse_scores.mean())

print("\nCross-Validation R^2 Scores:", r2_scores)
print("Average CV R^2:", r2_scores.mean())

Cross-Validation MSE Scores: [1646.1760146  1543.8474902  1609.53001025 1663.81952768 1564.49935445]
Average CV MSE: 1605.5744794341726

Cross-Validation RMSE Scores: [40.57309471 39.29182472 40.11894827 40.78994395 39.55375272]
Average CV RMSE: 40.065512876570565

Cross-Validation R^2 Scores: [0.9802676  0.977628   0.97791888 0.97866037 0.97695961]
Average CV R^2: 0.9782868932188569


: 