## Explore the dataset

Import Pandas package

In [None]:
import pandas as pd

In [None]:
# Load the CSV at the relative path data/gym_members_exercise_tracking.csv into a DataFrame.
gym = pd.read_csv('data/gym_members_exercise_tracking.csv')

In [None]:
# Show the first 5 rows.
gym.head(5)

In [None]:
# Print a concise summary: column names, non-null counts, dtypes and memory usage.
gym.info()

In [None]:
# Tuple (number of rows, number of columns).
gym.shape

In [None]:
# The data as a NumPy array, without row/column labels
# (the pandas documentation recommends DataFrame.to_numpy() for new code).
gym.values

In [None]:
# Column labels.
gym.columns

In [None]:
# Row labels (the index).
gym.index

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
gym.describe()

## Select a specific column

In [None]:
# Single brackets with one label return the column as a Series.
gym['Age']

In [None]:
# A list of labels returns a DataFrame containing just those columns.
gym[['Age','Fat_Percentage']]

In [None]:
# Label-based selection: all rows (:), column 'Age'.
gym.loc[:,'Age']

In [None]:
# Position-based selection: all rows, the column at position 1 (the second column).
gym.iloc[:,1]

In [None]:
# Summary statistics for the 'Age' column only.
gym['Age'].describe()

In [None]:
gym[['Age']] # ---> double brackets return a one-column DataFrame, not a Series

##### -----------------> Mentimeter

### Sort a column

In [None]:
# Sort the rows by the column at position 3 of gym.columns (ascending, the sort_values default).
gym.sort_values(gym.columns[3])

##### -----------------> Mentimeter

In [None]:
# Sort by 'Height (m)' descending; ties are ordered by 'Weight (kg)' ascending.
gym.sort_values(['Height (m)','Weight (kg)'], ascending=[False,True])

## Select a row

In [None]:
# Row at position 0 with all columns, returned as a Series.
gym.iloc[0,:]

In [None]:
# Boolean mask: keep only the rows where Age equals 28.
gym[gym['Age'] == 28]

## Subsetting based on multiple conditions

In [None]:
# Boolean Series: True where Age is exactly 18.
cond1 = gym['Age'] == 18

In [None]:
# Boolean Series: True where Gender is 'Male'.
cond2 = gym['Gender'] == 'Male'

In [None]:
# Boolean Series: True where 'Height (m)' is strictly greater than 1.80.
cond3 = gym['Height (m)'] > 1.80

In [None]:
# & combines the masks element-wise (logical AND): keep rows where all three conditions hold.
gym[cond1 & cond2 & cond3]

In [None]:
# isin() tests membership in a list of values; with the single value 'Male' it matches the == 'Male' comparison.
ismale = gym['Gender'].isin(['Male'])

In [None]:
# Rows of gym for which ismale is True.
gymmale = gym[ismale]

##### -----------------> Mentimeter

## Adding columns

In [None]:
# New column: Calories_Burned divided element-wise by 'Session_Duration (hours)'.
gym['Burned_by_duration'] = gym['Calories_Burned'] / gym['Session_Duration (hours)']

In [None]:
# Display the DataFrame.
gym

## Column statistics

### Simple statistics

In [None]:
# Attribute access to the 'Age' column (same column as gym['Age']), then its mean.
gym.Age.mean()

In [None]:
# Labels of the numeric columns only, so that mean() is not attempted on non-numeric columns.
numeric_columns = gym.select_dtypes(include=['number']).columns
# Mean of each numeric column.
gym[numeric_columns].mean()

#### Count occurrences

In [None]:
# Number of rows for each distinct Gender value.
gym['Gender'].value_counts()

In [None]:
# Boolean mask: True for rows where Gender is 'Male'.
male = gym['Gender'] == 'Male'

In [None]:
# Relative frequency (normalize=True) of each Workout_Type among the rows selected by the male mask.
gym.loc[male, 'Workout_Type'].value_counts(normalize=True)

### Powerful and flexible aggregation method .agg() 

Together with the usual statistical methods .mean(), .min() and .max()

#### Apply multiple functions at once

In [None]:
# Names of the aggregations to compute.
stats = ['mean','sum','min','max']

In [None]:
# Passing a list of function names to agg() returns one result per function.
gym['Calories_Burned'].agg(stats)

#### Apply multiple functions to multiple columns

In [None]:
# Names of the aggregations to compute (re-declared for this example).
stats = ['mean','sum','min','max']

In [None]:
# Labels of the columns to aggregate.
columns = ['Calories_Burned', 'Fat_Percentage']

In [None]:
# On a DataFrame the result has one row per function and one column per selected column.
gym[columns].agg(stats)

#### Custom functions

In [None]:
def rangefun(x):
    """Return the spread of *x*: its maximum minus its minimum.

    *x* must provide ``.max()`` and ``.min()`` (for example a pandas Series).
    A named ``def`` is used instead of a lambda bound to a name so the
    function carries a real name in tracebacks and reprs.
    """
    return x.max() - x.min()

In [None]:
# agg() also accepts a callable in place of a function name.
gym['Calories_Burned'].agg(rangefun)

#### Different functions to different columns

In [None]:
# Column label -> aggregation to apply to that column.
functions = {'Calories_Burned':'mean', 'Fat_Percentage':'max'}

In [None]:
# With a dict, each listed column is aggregated with its own function.
gym.agg(functions)

In [None]:
# Named aggregation: keyword=(column, function); the keyword is used as the label of the result.
gym.agg(Mean_Calories = ('Calories_Burned', 'mean'), Max_Fat = ('Fat_Percentage', 'max'))

In [None]:
# Output label -> (column, function) pairs for named aggregation.
# 'Max_Fat' labels the maximum of Fat_Percentage (it was misspelled 'Max_Flat').
functions = {
    'Mean_calories': ('Calories_Burned', 'mean'),
    'Max_Fat': ('Fat_Percentage', 'max'),
}

In [None]:
# Named aggregation again, with the keyword arguments supplied by unpacking the dict.
gym.agg(**functions) 

The ** operator "unpacks" the 'functions' dictionary: each key becomes a keyword-argument name and each value the corresponding argument passed to the .agg() function.

## GroupBy

In [None]:
# Labels of the numeric columns, so that only they are averaged.
numeric_columns = gym.select_dtypes(include=['number']).columns
# Mean of each numeric column within each Workout_Type group.
gym.groupby(['Workout_Type'])[numeric_columns].mean()

In [None]:
# All rows, columns at positions 0, 4, 5, 6 and 12.
# NOTE(review): positional selection depends on the CSV column order; the later
# binning cell expects 'Age' and 'Resting_BPM' to be among these columns - confirm.
gymc = gym.iloc[:,[0,4,5,6,12]]

In [None]:
# Work on an independent copy so that columns added to gymc do not refer back to gym.
gymc = gymc.copy()

In [None]:
# Show the first row to check which columns were selected.
gymc.head(1)

In [None]:
# Resting heart-rate classes: bin edges and the label of each interval
bins = [49, 56, 62, 68, 74]
labels = ['50-56','57-62','63-68','69-74']

# Age classes: bin edges and the label of each interval
bins1 = [17, 29, 39, 49, 59]
labels1 = ['18-29', '30-39', '40-49', '50-59']

# pd.cut assigns every value to its interval. right=True makes each interval
# closed on the right and open on the left, e.g. (49, 56].
resting_bpm_classes = pd.cut(gymc['Resting_BPM'], bins=bins, labels=labels, right=True)
age_classes = pd.cut(gymc['Age'], bins=bins1, labels=labels1, right=True)

# Store the classes as new columns
gymc.loc[:,'BeatsRest_Class'] = resting_bpm_classes
gymc.loc[:,'Age_Class'] = age_classes

In [None]:
# Show the first 3 rows.
gymc.head(3)

In [None]:
# Labels of the numeric columns of gymc.
numeric_columns = gymc.select_dtypes(include=['number']).columns
# Mean per (Age_Class, BeatsRest_Class) group; observed=True keeps only the
# category combinations that actually occur in the data.
gymc.groupby(['Age_Class', 'BeatsRest_Class'],observed=True)[numeric_columns].mean()

##### -----------------> Mentimeter

## Pivot tables

In [None]:
# Age aggregated per Workout_Type (pivot_table uses the mean when aggfunc is not given).
gym.pivot_table(values='Age', index='Workout_Type')

In [None]:
def rangefun(x):
    """Return the text '<min> - <max>' built from x.min() and x.max()."""
    return ' - '.join([str(x.min()), str(x.max())])


# Override the function name with the label 'Range'.
rangefun.__name__ = 'Range'

In [None]:
def meanmed(x):
    """Return the text '<mean> - <median>' built from x.mean() and x.median()."""
    return ' - '.join([str(x.mean()), str(x.median())])


# Override the function name with the label 'Mean - Median'.
meanmed.__name__ = 'Mean - Median'

In [None]:
# aggfunc accepts a list of callables: each one is applied to Calories_Burned per Workout_Type.
gym.pivot_table(values='Calories_Burned', index='Workout_Type', aggfunc=[rangefun,meanmed])

Using Groupby

In [None]:
# Minimum and maximum of Calories_Burned per Workout_Type, as two separate columns.
gym.groupby(['Workout_Type'])['Calories_Burned'].agg(['min','max'])

In [None]:
# Calories_Burned aggregated with Workout_Type as rows and Experience_Level as columns.
gym.pivot_table(values='Calories_Burned', index='Workout_Type',columns='Experience_Level')

In [None]:
# margins=True adds an extra row and column holding the aggregate over each axis.
gym.pivot_table(values='Calories_Burned', index='Workout_Type',columns='Experience_Level', margins=True)

In [None]:
# The same layout for the Burned_by_duration column.
gym.pivot_table(values='Burned_by_duration', index='Workout_Type',columns='Experience_Level')

## Working with indexes

In [None]:
# Returns a new DataFrame indexed by Workout_Type; gym itself is not modified.
gym.set_index('Workout_Type')

Simpler to subset

In [None]:
# With Workout_Type as the index, .loc selects all rows labelled 'Yoga'.
gym.set_index('Workout_Type').loc['Yoga']

In [None]:
# Index by (Workout_Type, Age), keep the 'Yoga' and 'HIIT' groups, then sort by
# Workout_Type ascending and Age descending.
(
    gym.set_index(['Workout_Type','Age'])
    .loc[['Yoga','HIIT']]
    .sort_index(level=["Workout_Type", "Age"], ascending=[True, False])
)

In [None]:
# Column labels, useful for choosing the end points of a label slice.
gym.columns

In [None]:
# Rows are picked with (Workout_Type, Age) tuples; columns with a label slice.
# NOTE: unlike positional slices, a label slice includes its end label (Resting_BPM).
(
    gym.set_index(['Workout_Type','Age'])
    .loc[[('Yoga',20),('HIIT',30)], 'Weight (kg)':'Resting_BPM']
)

In [None]:
# Fat_Percentage (passed positionally as values) by Workout_Type (rows) and Experience_Level (columns).
gym.pivot_table('Fat_Percentage',index='Workout_Type',columns='Experience_Level')

In [None]:
# Label slice on the row index: both end labels, 'Cardio' and 'Strength', are included.
gym.pivot_table('Fat_Percentage',index='Workout_Type',columns='Experience_Level').loc['Cardio':'Strength']

In [None]:
# axis='index' averages down the rows: one mean per Experience_Level column.
gym.pivot_table('Fat_Percentage',index='Workout_Type',columns='Experience_Level').mean(axis='index')

In [None]:
# axis='columns' averages across the columns: one mean per Workout_Type row.
gym.pivot_table('Fat_Percentage',index='Workout_Type',columns='Experience_Level').mean(axis='columns')

## Working with missing values

In [None]:
# Keep the columns whose label contains 'BPM', take the rows at positions 1 to 19,
# and copy so that later edits do not affect gym.
gymfiltered = gym.filter(like='BPM').iloc[1:20].copy()

In [None]:
import itertools
import random
import numpy as np

# Number of NaN values to insert
n_nan = 10

# NaN cannot be stored in an integer column. Cast integer columns to float up
# front instead of relying on pandas silently upcasting during assignment,
# which recent pandas versions deprecate.
# NOTE(review): assumes the BPM columns are read as integers - confirm with gym.info().
int_cols = gymfiltered.select_dtypes(include=[np.integer]).columns
gymfiltered = gymfiltered.astype({col: 'float64' for col in int_cols})

# Row and column labels of the DataFrame
rows = gymfiltered.index
cols = gymfiltered.columns

# Draw n_nan distinct (row, column) cells at random
# (random.sample raises ValueError if n_nan exceeds the number of cells)
all_cells = list(itertools.product(rows, cols))
random_indices = random.sample(all_cells, n_nan)

# Overwrite the chosen cells with NaN
for r, c in random_indices:
    gymfiltered.loc[r, c] = np.nan

gymfiltered

In [None]:
# Number of missing values in each column.
gymfiltered.isna().sum()

In [None]:
# dropna() returns a new DataFrame without the rows containing NaN, so the counts
# here are 0; the result is not assigned, so gymfiltered keeps its NaN values.
gymfiltered.dropna().isna().sum()

In [None]:
# fillna(0) returns a new DataFrame with NaN replaced by 0; the result is not
# assigned, so gymfiltered keeps its NaN values.
gymfiltered.fillna(0)

<span style="background-color:yellow;">Be careful!</span>

In [None]:
# Count the missing values per column in gymfiltered itself.
gymfiltered.isna().sum()

In [None]:
# inplace=True modifies gymfiltered itself (and returns None): rows containing NaN are removed.
gymfiltered.dropna(inplace=True)

In [None]:
# Count the missing values per column once more.
gymfiltered.isna().sum()

In [None]:
# Display the DataFrame.
gymfiltered

#### Saving to csv

In [None]:
# Write the DataFrame to gymfiltered.csv (relative path); the index is written too by default.
gymfiltered.to_csv('gymfiltered.csv')

## Plotting

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of Age using 15 bins.
gym['Age'].hist(bins=15)

In [None]:
# Column labels.
gym.columns

In [None]:
# Mean BMI per Workout_Type, drawn as a bar chart.
gym.groupby('Workout_Type')['BMI'].mean().plot(kind='bar')

In [None]:
# Overlay the weight histograms of the two genders on the same axes;
# alpha=0.7 keeps overlapping bars visible.
for gender, colour in (("Female", 'red'), ("Male", 'blue')):
    gender_rows = gym[gym["Gender"] == gender]
    gender_rows["Weight (kg)"].hist(alpha=0.7, color=colour)
plt.legend(["Female", "Male"])

In [None]:
# Calcolo delle frequenze di 'Workout_Type' raggruppate per 'Gender'
grouped_data = gym.groupby(['Workout_Type', 'Gender']).size().unstack()

# Imposta la figura
plt.figure(figsize=(12, 6))

# Creazione del grafico a barre
x_labels = grouped_data.index
x = range(len(x_labels))
bar_width = 0.35

# Disegna le barre per ciascun genere
for i, gender in enumerate(grouped_data.columns):
    plt.bar(
        [pos + i * bar_width for pos in x],
        grouped_data[gender],
        width=bar_width,
        label=gender
    )

# Etichette e titolo
plt.xticks([pos + bar_width / 2 for pos in x], x_labels)
plt.title('Workout Type by Gender')
plt.xlabel('Workout Type')
plt.ylabel('Count')
plt.legend(title='Gender')

# Mostra il grafico
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Count the rows for every (Workout_Type, Gender) pair, with Gender as columns
counts_by_pair = gym.groupby(['Workout_Type', 'Gender']).size()
grouped_data = counts_by_pair.unstack()

# Set up the figure and take its current axes
plt.figure(figsize=(12, 6))
current_axes = plt.gca()

# Let pandas draw the grouped bars directly on those axes
grouped_data.plot(kind='bar', width=0.8, ax=current_axes)

# Title, axis labels and legend
plt.title('Workout Type by Gender')
plt.xlabel('Workout Type')
plt.ylabel('Count')
plt.legend(title='Gender')

# Show the chart
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data to plot
x = gym['Session_Duration (hours)']
y = gym['Calories_Burned']

# Least-squares straight line (degree-1 polynomial) through the points
coefficients = np.polyfit(x, y, deg=1)
regression_line = np.poly1d(coefficients)

# 100 evenly spaced x values spanning the observed range, and the fitted y values
x_vals = np.linspace(x.min(), x.max(), 100)
fitted_vals = regression_line(x_vals)

# Set up the figure
plt.figure(figsize=(12, 6))

# Data points first, then the regression line on top
plt.scatter(x, y, color='blue', alpha=0.7, label='Data points')
plt.plot(x_vals, fitted_vals, color='red', label='Regression line')

# Title, axis labels and legend
plt.title('Calories Burned by Session Duration')
plt.xlabel('Session Duration (hours)')
plt.ylabel('Calories Burned')
plt.legend()

# Show the chart
plt.tight_layout()
plt.show()


##### -----------------> Mentimeter