Question: Can we predict the amount of calories burnt during a workout based on biological measures?

In [None]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### Dataset Creation and Initial Treatment

In [None]:
# read the Kaggle "calories burnt prediction" dataset from the local CSV file
# source: https://www.kaggle.com/datasets/ruchikakumbhar/calories-burnt-prediction/data
df = pd.read_csv(filepath_or_buffer='calories.csv')

In [None]:
# preview the first 10 rows
df.head(10)

In [None]:
# print a concise summary of the frame: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# setting up and cleaning
# Guard on the column so the cell can be re-run: after the first run 'User_ID'
# is the index, and set_index('User_ID') would raise a KeyError.
if 'User_ID' in df.columns:
    # De-duplicate while 'User_ID' is still a column, so only rows repeated for the
    # same user are removed. drop_duplicates() does not look at the index, so doing
    # it after set_index would also drop distinct users with identical measurements.
    df = df.drop_duplicates().set_index('User_ID')
# Drop incomplete rows instead of filling them with 0: a weight, heart rate or body
# temperature of 0 is not a real measurement and would distort the statistics and
# the regression models built further down.
df = df.dropna()

In [None]:
# changing gender to binary numbers, so it can take part in the correlation matrix
gender_to_code = {'male': 0, 'female': 1}
# Encode only while the column still holds the text labels. Series.map turns every
# value that is missing from the mapping into NaN, so re-running this cell on the
# already encoded column would silently replace the whole column with NaN.
if df['Gender'].isin(list(gender_to_code)).any():
    df['Gender'] = df['Gender'].map(gender_to_code)

### EDA (Exploratory Data Analysis)

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Pearson correlation between every pair of columns:
# values near -1 indicate a strong inverse linear relationship, values near 1 a strong direct one
# author's reading of the output: duration, heart_rate and body_temperature appear to be
# the three most correlated variables
df.corr(method='pearson')

In [None]:
# undo the binary gender representation
code_to_gender = {0: 'male', 1: 'female'}
# Decode only while the column still holds the 0/1 codes. Series.map turns every
# value that is missing from the mapping into NaN, so re-running this cell on the
# already decoded labels would silently replace the whole column with NaN.
if df['Gender'].isin(list(code_to_gender)).any():
    df['Gender'] = df['Gender'].map(code_to_gender)

In [None]:
# mean height, weight and calories burnt for each gender
gender_groupby = df.groupby('Gender')[['Height', 'Weight', 'Calories']].mean()
gender_groupby

In [None]:
# measures of the men who burnt the most calories (highest first)
men_measure_columns = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']
is_male = df['Gender'] == 'male'
men_data = df.loc[is_male, men_measure_columns].sort_values(by='Calories', ascending=False)
men_data.head(10)

In [None]:
# measures of the women who burnt the most calories (highest first)
women_measure_columns = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']
is_female = df['Gender'] == 'female'
women_data = df.loc[is_female, women_measure_columns].sort_values(by='Calories', ascending=False)
women_data.head(10)

In [None]:
# order the rows by shortest duration first, then by highest body temperature and heart rate,
# to eyeball how duration relates to heart rate and to body temperature
sort_columns = ['Duration', 'Body_Temp', 'Heart_Rate']
sort_ascending = [True, False, False]
filter_data = df.sort_values(by=sort_columns, ascending=sort_ascending)
filter_data.head()

In [None]:
# one scatter plot that tries to show all the significant variables at once:
# duration on x, calories on y, gender as colour and heart rate as marker size
scatter_options = dict(
    x='Duration',
    y='Calories',
    color='Gender',
    size='Heart_Rate',
    trendline='ols',  # linear regression line
    hover_data=['Height', 'Weight', 'Heart_Rate', 'Body_Temp'],
    labels={'Gender': 'Gender', 'Heart_Rate': 'Heart Rate (bpm)'},
    color_discrete_sequence=['blue', 'red'],  # colours used for the categorical variable
)
fig = px.scatter(df, **scatter_options)

# titles and figure size set in a single layout update
fig.update_layout(
    title='Calories Burnt During a Workout',
    xaxis_title='Workout Duration (minutes)',
    yaxis_title='Calories Burnt',
    width=1000,
    height=700,
)

fig.show()

# Building a model

In [None]:
# checking the statistical relationship between the variables with an OLS regression

# the key variables used to predict the target
feature_columns = ['Duration', 'Body_Temp', 'Heart_Rate', 'Age', 'Height', 'Weight']

# target variable
y = df['Calories']

# feature variables plus a constant column, so the regression includes an intercept
X = sm.add_constant(df[feature_columns])

# fit the model and print the results table
model = sm.OLS(y, X).fit()
print(model.summary())

The results indicate that the model is good at predicting and does not suffer from autocorrelation, but the multicollinearity between the feature variables is a problem. I checked earlier (see the correlation matrix in the EDA section) that this multicollinearity is caused by the correlation of duration with both heart rate and body temperature. Duration appears to directly affect these two variables, which means that they add little independent information as predictors of the calories burnt once duration is included.

In [None]:
# linear regression with scikit-learn, using workout duration as the only feature

# variables: the model expects a 2-D feature matrix, hence the reshape to a single column
X = df['Duration'].to_numpy().reshape(-1, 1)
y = df['Calories']

# hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit the model on the training split
model = LinearRegression().fit(X_train, y_train)

# score on the test split, followed by the fitted coefficient and intercept
score = model.score(X_test, y_test)
print(score, model.coef_, model.intercept_, sep='\n')

In [None]:
# matplotlib figure: observed duration vs. calories, with the model's predictions drawn in red
reg_fig, reg_ax = plt.subplots()
reg_ax.scatter(X, y)
reg_ax.plot(X, model.predict(X), color="red", label='Model Prediction')
reg_ax.set_xlabel('Workout Duration')
reg_ax.set_ylabel('Calories Burnt')
reg_ax.legend()
plt.show()

# Deploy

In [None]:
# creating an interface with streamlit