In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the cleaned gym-members dataset and preview its first rows.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory — confirm the notebook is launched from the project root.
df = pd.read_csv(filepath_or_buffer="data/processed/gym_members_cleaned.csv")
df.head(n=5)

In [None]:
# Summary statistics for numeric and categorical columns.
# describe() on its own only summarises numeric columns; include="all" also
# reports count/unique/top/freq for the non-numeric ones (NaN where a
# statistic does not apply to a column's dtype).
# Transposed so each row is one column of df, which reads better for wide frames.
df.describe(include="all").T

In [None]:
# Count of missing (NaN/None) entries in each column of df
df.isnull().sum(axis=0)


In [None]:
# Boxplots of every numeric column on one shared axis, as a quick visual
# outlier check.
# include="number" matches all numeric dtypes, so columns stored as e.g.
# int32/float32 are not silently left out (the previous float64/int64 list
# would have skipped them).
num = df.select_dtypes(include="number")
ax = num.boxplot(rot=45, figsize=(10, 5))
ax.set_title("Boxplot of Numeric Variables – Outlier Check")

# Make sure the output folder exists before saving; Path is already imported
# in the notebook's first cell, so no second import is needed here.
Path("reports/figures").mkdir(parents=True, exist_ok=True)
plt.savefig("reports/figures/boxplot_outliers.png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Histograms (with a KDE overlay) for the key variables, one panel each.
vars_ = ["Age", "BMI", "Session_Duration_hours", "Workout_Frequency_days_week", "Calories_Burned"]

# Only plot the requested columns that actually exist in df; absent ones are
# skipped rather than raising a KeyError.
present = [c for c in vars_ if c in df.columns]

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
axes = axes.ravel()

# zip stops at the shorter sequence, so at most one variable per panel is drawn.
for panel, col in zip(axes, present):
    sns.histplot(df[col], kde=True, bins=20, ax=panel)
    panel.set_title(f"Distribution of {col}")

# The 2x3 grid has more panels than variables; hide the leftovers so the
# saved figure does not contain an empty axes frame.
for panel in axes[len(present):]:
    panel.set_visible(False)

plt.tight_layout()
plt.savefig("reports/figures/numeric_distributions.png", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Scatter of weekly workout frequency (x) against session duration (y),
# with points coloured by the retention_status column.
scatter_ax = sns.scatterplot(
    x="Workout_Frequency_days_week",
    y="Session_Duration_hours",
    hue="retention_status",
    data=df,
)
scatter_ax.set_title("Workout Frequency vs Session Duration by Retention")
plt.savefig(
    "reports/figures/scatter_frequency_duration_retention.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()


In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated heatmap.
# include="number" selects every numeric dtype rather than only float64/int64.
corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(9, 7))
# Pin the colour scale to the full correlation range [-1, 1], centred on 0,
# so the neutral colour of the diverging "coolwarm" map always means "no
# correlation" and equal magnitudes of opposite sign get equally strong colours.
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, center=0)
plt.title("Correlation Matrix of Numeric Features")
plt.savefig("reports/figures/correlation_matrix.png", bbox_inches="tight", dpi=300)
plt.show()


In [None]:
# Bar chart of the mean retention_status for each Workout_Type, drawn without
# error bars.
# NOTE(review): the bar height is only a "rate" if retention_status is a
# numeric 0/1 flag — confirm against the cleaned dataset.
# NOTE(review): recent seaborn releases deprecate `ci=` (in favour of
# `errorbar=`) and `palette` without `hue`; check the installed version
# before switching arguments.
plt.figure(figsize=(6, 4))
bar_ax = sns.barplot(
    data=df,
    x='Workout_Type',
    y='retention_status',
    palette='muted',
    ci=None,
)
bar_ax.set(
    title="Average Retention by Workout Type",
    xlabel="Workout Type",
    ylabel="Average Retention Rate",
)
plt.savefig("reports/figures/barplot_workouttype_retention.png", bbox_inches="tight", dpi=300)
plt.show()
