In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering pipeline: load the study data, derive an efficiency metric,
# standardize the features, pick k by silhouette score, label every row with
# its cluster, print a per-cluster summary and export the labelled table.

# Tunable settings, gathered here so they are not buried in the code below.
DATA_PATH = "PMC7727026_full_data.csv"
OUTPUT_PATH = "processed_training_data.csv"
EFFICIENCY_COL = 'Efficiency (kg/g_protein)'
SEED = 42
K_MIN, K_MAX = 2, 7   # candidate cluster counts, inclusive
N_INIT = 10           # pinned explicitly: the library default differs between scikit-learn versions

# Step 1: Load the dataset
df = pd.read_csv(DATA_PATH)

# Step 2: Select relevant features
features = [
    'Duration (weeks)',
    '(times/week)',
    'Energy intake (kcal/kg/day)',
    'Protein intake (g/kg/day)',
    'LBM change (kg)'
]
missing_cols = [col for col in features if col not in df.columns]
if missing_cols:
    raise KeyError(f"Input data is missing required columns: {missing_cols}")

# Rows with a missing value in any selected feature are dropped.
df_filtered = df[features].dropna().copy()

# Step 3: Create a new efficiency metric: muscle gain per gram of protein intake
# The ratio is LBM change (kg) divided by protein intake (g/kg/day).
# A zero protein intake yields +/-inf (or NaN for 0/0); those are stored as 0
# so the exported table contains no missing values.
# NOTE(review): 0 is indistinguishable from a genuine zero efficiency and pulls
# cluster means down - confirm this convention with downstream consumers.
df_filtered[EFFICIENCY_COL] = (
    (df_filtered['LBM change (kg)'] / df_filtered['Protein intake (g/kg/day)'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Step 4: Normalize data for clustering
# Only the original features are scaled; the derived efficiency column is
# excluded from clustering.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_filtered[features])

# Step 5: Silhouette score to find optimal k
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# candidate range is capped by the number of rows that survived dropna().
n_samples = len(df_filtered)
max_k = min(K_MAX, n_samples - 1)
if max_k < K_MIN:
    raise ValueError(
        f"Need at least {K_MIN + 1} complete rows to evaluate clusterings, "
        f"got {n_samples}."
    )

silhouette_scores = {}

for k in range(K_MIN, max_k + 1):
    kmeans = KMeans(n_clusters=k, random_state=SEED, n_init=N_INIT)
    labels = kmeans.fit_predict(scaled_features)
    score = silhouette_score(scaled_features, labels)
    silhouette_scores[k] = score

# On ties, max() keeps the first (smallest) k that reached the top score.
best_k = max(silhouette_scores, key=silhouette_scores.get)
print(f"Best k (based on silhouette score): {best_k}")

# Step 6: Final clustering using best_k
kmeans_final = KMeans(n_clusters=best_k, random_state=SEED, n_init=N_INIT)
df_filtered['Cluster'] = kmeans_final.fit_predict(scaled_features)

# Step 7: Group by cluster and summarize
cluster_summary = df_filtered.groupby('Cluster').agg({
    'Duration (weeks)': 'mean',
    '(times/week)': 'mean',
    'Energy intake (kcal/kg/day)': 'mean',
    'Protein intake (g/kg/day)': 'mean',
    'LBM change (kg)': ['mean', 'max'],
    EFFICIENCY_COL: ['mean', 'max']
})

# Display cluster summary
print("\nCluster Summary:")
print(cluster_summary)

# Step 8: Export processed data for next team members
# index=False: the post-dropna row labels are not written to the file.
df_filtered.to_csv(OUTPUT_PATH, index=False)


Best k (based on silhouette score): 3

Cluster Summary:
        Duration (weeks) (times/week) Energy intake (kcal/kg/day)  \
                    mean         mean                        mean   
Cluster                                                             
0              31.750000     2.833333                   28.256667   
1              10.790323     3.112903                   28.860000   
2               8.736842     4.578947                   38.981053   

        Protein intake (g/kg/day) LBM change (kg)       \
                             mean            mean  max   
Cluster                                                  
0                        1.242500        1.329250  3.3   
1                        1.399032        1.209615  4.3   
2                        1.915263        1.635421  3.9   

        Efficiency (kg/g_protein)             
                             mean        max  
Cluster                                       
0                        1.051806   2.4

## 📊 PCA & Visual Analysis

This section focuses on visually analyzing the training data after clustering.

Using **Principal Component Analysis (PCA)**, the five features used for clustering (program duration, weekly training frequency, energy intake, protein intake, and LBM change) were reduced to two dimensions. This allows us to clearly visualize how different training programs (clusters) group together based on common behavior patterns (e.g., frequency, duration, intake).

- The **PCA Scatterplot** shows how the programs naturally cluster into 3 distinct groups.
- The **boxplots** highlight how these groups differ in:
  - **Average daily protein intake**, and
  - **Change in lean body mass (LBM)**

These visual tools help us interpret which training strategies are more intense, more effective, or more efficient — and prepare the foundation for the final insights.


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# PCA and visual analysis: project the clustered programs onto two principal
# components and compare protein intake and LBM change across clusters.

# Step 1: Load the processed dataset
df = pd.read_csv("processed_training_data.csv")

# Step 2: Select features for PCA
feature_cols = [
    'Duration (weeks)',
    '(times/week)',
    'Energy intake (kcal/kg/day)',
    'Protein intake (g/kg/day)',
    'LBM change (kg)'
]
missing_cols = [col for col in feature_cols + ['Cluster'] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Processed data is missing required columns: {missing_cols}")
X = df[feature_cols]

# Step 3: Apply PCA reduced to 2 components
# The features are on very different scales (weeks, sessions/week, kcal/kg/day,
# g/kg/day, kg). PCA follows raw variance, so without standardization the
# projection is driven by the largest-valued columns and no longer matches the
# standardized space in which the clusters were computed.
# StandardScaler comes from the notebook's first import cell, as does `pd`.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Report how much of the (standardized) variance the 2-D picture retains.
pc1_share, pc2_share = pca.explained_variance_ratio_
print(f"Variance explained - PC1: {pc1_share:.1%}, PC2: {pc2_share:.1%}")

# Step 4: Create a new DataFrame with PCA results and Cluster info
# Built on df's index so the Cluster assignment aligns row for row.
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'], index=df.index)
pca_df['Cluster'] = df['Cluster']

# Step 5: PCA Scatterplot
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Cluster', palette='Set2', s=100, ax=scatter_ax)

scatter_ax.set_title('Visual Clustering of Training Programs (Reduced Dimensions)')
scatter_ax.set_xlabel('Training Behavior Pattern (X-axis)')
scatter_ax.set_ylabel('Training Behavior Pattern (Y-axis)')
scatter_ax.legend(title='Program Cluster')
scatter_ax.grid(True)
scatter_fig.tight_layout()
plt.show()


# Step 6: Boxplots for: 1) Protein Intake and 2) LBM Change by Cluster
def _boxplot_by_cluster(ax, data, column, title, ylabel):
    """Draw one box per cluster for `column` on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame containing 'Cluster' and `column`.
    column : name of the column plotted on the y-axis.
    title : axes title.
    ylabel : y-axis label.
    """
    # Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
    # x variable is also given as hue. dodge=False keeps each box centered on
    # its tick when hue duplicates x.
    sns.boxplot(data=data, x='Cluster', y=column, hue='Cluster',
                dodge=False, palette='Set1', ax=ax)
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(title)
    ax.set_xlabel('Training Program Cluster')
    ax.set_ylabel(ylabel)


fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Step 6.1: Protein Intake
_boxplot_by_cluster(
    axes[0], df, 'Protein intake (g/kg/day)',
    'Average Daily Protein Intake by Program Cluster',
    'Protein Intake (g/kg/day)',
)

# Step 6.2: LBM Change
_boxplot_by_cluster(
    axes[1], df, 'LBM change (kg)',
    'Muscle Gain (LBM) by Program Cluster',
    'Change in Lean Body Mass (kg)',
)

# Step 6.3: Display boxplots
fig.tight_layout()
plt.show()
