Question: Can we predict the amount of calories burnt during a workout based on biological measures?

In [1]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import statsmodels.api as sm

### Dataset Creation and Initial Treatment

In [2]:
# loading
df = pd.read_csv("calories.csv") # source: https://www.kaggle.com/datasets/ruchikakumbhar/calories-burnt-prediction/data

In [3]:
df.head(10)

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0
5,15130815,female,36,151.0,50.0,23.0,96.0,40.7,123.0
6,19602372,female,33,158.0,56.0,22.0,95.0,40.5,112.0
7,11117088,male,41,175.0,85.0,25.0,100.0,40.7,143.0
8,12132339,male,60,186.0,94.0,21.0,97.0,40.4,134.0
9,17964668,female,26,146.0,51.0,16.0,90.0,40.2,72.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Gender      15000 non-null  object 
 2   Age         15000 non-null  int64  
 3   Height      15000 non-null  float64
 4   Weight      15000 non-null  float64
 5   Duration    15000 non-null  float64
 6   Heart_Rate  15000 non-null  float64
 7   Body_Temp   15000 non-null  float64
 8   Calories    15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB


In [5]:
# setting up and cleaning
df = df.set_index('User_ID')  # set the column 'User_ID' as the dataset index
df = df.drop_duplicates()  # remove duplicates
df = df.fillna(0)  # fill empty columns with 0

In [6]:
# changing gender to binary numbers, so we can do a better correlation
df['Gender'] = df['Gender'].map({"male" : 0, "female": 1})

### EDA (Exploratory Data Analisys)

In [7]:
df.describe()

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.5035,42.790786,174.465831,74.967931,15.531035,95.518901,40.025488,89.542836
std,0.500004,16.980401,14.258333,15.035593,8.31931,9.583542,0.779244,62.45775
min,0.0,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6,35.0
50%,1.0,39.0,175.0,74.0,16.0,96.0,40.2,79.0
75%,1.0,56.0,185.0,87.0,23.0,103.0,40.6,138.0
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [8]:
# correlation: -1 is inversely proportional and 1 is directly proportional
df.corr()
# this shows us that the top three correlated (apparently) variables are duration, heart_rate and body_temperature

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
Gender,1.0,-0.003165,-0.710522,-0.783171,-0.003388,-0.011518,-0.00722,-0.022306
Age,-0.003165,1.0,0.009512,0.090038,0.013202,0.010449,0.013136,0.154356
Height,-0.710522,0.009512,1.0,0.958452,-0.004664,0.0005,0.001167,0.017499
Weight,-0.783171,0.090038,0.958452,1.0,-0.00194,0.004271,0.004048,0.035426
Duration,-0.003388,0.013202,-0.004664,-0.00194,1.0,0.852866,0.903164,0.955419
Heart_Rate,-0.011518,0.010449,0.0005,0.004271,0.852866,1.0,0.771523,0.89788
Body_Temp,-0.00722,0.013136,0.001167,0.004048,0.903164,0.771523,1.0,0.824552
Calories,-0.022306,0.154356,0.017499,0.035426,0.955419,0.89788,0.824552,1.0


In [9]:
# undo the binary gender representation
df['Gender'] = df['Gender'].map({0 : "male", 1: "female"})

In [16]:
# average height and weight between men and women
gender_groupby = df.groupby('Gender').agg({'Height': 'mean', "Weight": "mean"})
gender_groupby

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,164.405985,63.275026
male,184.667517,86.825702


In [17]:
# attempt of representation of all the signficant variables in one scatter plot
fig = px.scatter(
    df, 
    x="Duration",
    y="Calories",
    color="Gender",
    size="Heart_Rate",
    trendline="ols",
    hover_data = ["Height", "Weight", "Heart_Rate", "Body_Temp"],
    labels={"Gender": "Gender", "Heart_Rate": "Heart Rate (bpm)"},
    color_discrete_sequence=["blue", "red"]  # used for categorical variables color
)

fig.update_layout(
    title="Calories Burnt During a Workout",
    xaxis_title="Workout Duration (minutes)",
    yaxis_title="Calories Burnt"
)

fig.update_layout(width=1000, height=700)

fig.show()

The scatter 