# Obesity Classification using K-Means Clustering

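This notebook trains and evaluates a K-Means clustering model on the obesity dataset. K-Means is an unsupervised algorithm and never sees the class labels, so for evaluation each cluster is assigned the most frequent true class among its training samples, and that mapping is then used to turn cluster ids into class predictions.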

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from datetime import datetime
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Global matplotlib defaults; cells below override the figure size per plot where needed.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)  # (width, height)
plt.rcParams['font.size'] = 12

## 1. Load and Preprocess Data

In [None]:
def load_and_preprocess_data(file_path):
    """Load the CSV at ``file_path``, encode and scale it, and split it 75/25.

    Steps: read the file, impute missing values, label-encode the categorical
    feature columns, label-encode the ``WeightCategory`` target separately,
    standardize the features, and split with ``random_state=42``.

    Args:
        file_path: Path to a CSV containing an ``id`` column, the feature
            columns and a ``WeightCategory`` target column.

    Returns:
        A 13-tuple ``(X, X_scaled, y, y_encoded, X_train, X_test, y_train,
        y_test, categorical_cols, encoders, le_target, scaler, n_classes)``:

        * ``X`` - feature DataFrame (categoricals already integer-coded).
        * ``X_scaled`` - standardized feature array.
        * ``y`` - target Series with the original labels.
        * ``y_encoded`` - integer-coded target array.
        * ``X_train, X_test, y_train, y_test`` - split of ``X_scaled`` /
          ``y_encoded``.
        * ``categorical_cols`` - every object-dtype column of the raw file.
        * ``encoders`` - ``{feature column: fitted LabelEncoder}``.
        * ``le_target`` - LabelEncoder fitted on the original target labels,
          so ``inverse_transform`` yields real category names.
        * ``scaler`` - fitted StandardScaler.
        * ``n_classes`` - number of distinct target classes.

    Raises:
        KeyError: If ``id`` or ``WeightCategory`` is missing from the file.
    """
    target_col = 'WeightCategory'
    df = pd.read_csv(file_path)

    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    categorical_cols = df.select_dtypes(include=['object']).columns

    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(f"Missing values per column:\n{missing_values[missing_values > 0]}")
        # A label cannot be imputed meaningfully; drop rows without one.
        df = df.dropna(subset=[target_col])
        # numeric_only=True: DataFrame.mean() raises on object columns in pandas >= 2.0.
        df = df.fillna(df.mean(numeric_only=True))
        # The mean does not cover categorical features; use the most frequent value.
        for col in categorical_cols:
            if col == target_col or not df[col].isnull().any():
                continue
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    print(f"Categorical columns: {categorical_cols.tolist()}")

    encoders = {}
    for col in categorical_cols:
        # The target gets its own encoder below. Encoding it here would leave
        # le_target fitted on integer codes, unable to recover category names.
        if col == target_col:
            continue
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

    X = df.drop(['id', target_col], axis=1)
    y = df[target_col]

    print("\nTarget class distribution:")
    print(y.value_counts())

    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # NOTE: the scaler is fitted on all rows before the split, so the held-out
    # rows contribute to the scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_encoded, test_size=0.25, random_state=42
    )

    n_classes = len(np.unique(y_encoded))
    print(f"\nNumber of unique classes: {n_classes}")

    return (
        X, X_scaled, y, y_encoded,
        X_train, X_test, y_train, y_test,
        categorical_cols, encoders, le_target, scaler, n_classes,
    )

# Training CSV, resolved relative to the notebook's working directory.
file_path = "train.csv"

# Run the preprocessing once and bind every returned artefact to a
# notebook-level name used by the cells below.
(
    X, X_scaled, y, y_encoded,
    X_train, X_test, y_train, y_test,
    categorical_cols, encoders, le_target, scaler, n_classes,
) = load_and_preprocess_data(file_path)

## 2. Create Results Directory

In [None]:
# Directory that receives every figure and CSV written by this notebook.
results_dir = "kmeans_results"
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(results_dir, exist_ok=True)
print(f"Results will be saved to '{results_dir}' directory.")

## 3. Finding Optimal Number of Clusters

In [None]:
# Elbow method: fit K-Means for k = 1..14 and record the inertia
# (within-cluster sum of squared distances) of each fit.
sse = []
k_range = range(1, 15)

for k in k_range:
    # n_init is pinned because its default changed between scikit-learn
    # releases; an explicit value keeps the curve comparable across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_train)
    sse.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, sse, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.set_title('Elbow Method For Optimal k')
ax.set_xticks(list(k_range))
ax.grid(True)
fig.savefig(f"{results_dir}/elbow_method.png")
plt.show()

# The cluster count is taken from the number of target classes, not from the
# elbow, so that clusters can be mapped onto classes for evaluation.
print(f"Based on our dataset, we'll use {n_classes} clusters as we have {n_classes} unique weight categories.")

## 4. Train K-Means Clustering Model

In [None]:
# Time only the fit, so "Training Time" is not inflated by prediction,
# label mapping and scoring.
start_time = time()
# n_init is pinned because its default changed between scikit-learn releases.
model = KMeans(n_clusters=n_classes, n_init=10, random_state=42)
model.fit(X_train)
training_time = time() - start_time

train_predictions = model.predict(X_train)
y_pred = model.predict(X_test)

# Map each cluster id to the most frequent true class among its training
# members. A cluster without training members falls back to the overall
# majority class, so the mapping never contains a value outside the classes.
fallback_label = Counter(y_train).most_common(1)[0][0]
cluster_to_label = {}
for cluster in range(model.n_clusters):
    members = y_train[train_predictions == cluster]
    if members.size:
        cluster_to_label[cluster] = Counter(members).most_common(1)[0][0]
    else:
        cluster_to_label[cluster] = fallback_label

print("Cluster to Label Mapping:")
for cluster, label in cluster_to_label.items():
    print(f"Cluster {cluster} → Label {label}")

# Position i of the lookup holds the class for cluster i, so indexing it with
# an array of cluster ids translates all predictions at once.
label_lookup = np.array([cluster_to_label[c] for c in range(model.n_clusters)])
y_pred_mapped = label_lookup[y_pred]
train_pred_mapped = label_lookup[train_predictions]

train_accuracy = accuracy_score(y_train, train_pred_mapped)
test_accuracy = accuracy_score(y_test, y_pred_mapped)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Training Time: {training_time:.2f} seconds")

# Several clusters can map to the same class, leaving other classes never
# predicted; zero_division=0 reports their precision as 0 without a warning.
report = classification_report(y_test, y_pred_mapped, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report).transpose()
print("\nClassification Report:")
print(report_df)

## 5. Visualize Confusion Matrix

In [None]:
# Rows are true classes, columns are the classes predicted via the cluster mapping.
cm = confusion_matrix(y_test, y_pred_mapped)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - K-Means Clustering')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()

fig.savefig(f"{results_dir}/confusion_matrix.png")
plt.show()

## 6. Visualize Clusters (2D Projection)

In [None]:
# Project the training features onto two principal components for plotting.
# random_state is set like every other stochastic step in this notebook, so
# the projection is reproducible when a randomized SVD solver is selected.
pca_viz = PCA(n_components=2, random_state=42)
X_train_2d = pca_viz.fit_transform(X_train)

fig, axes = plt.subplots(1, 2, figsize=(12, 10))

# Same points in both panels; only the colouring differs. Cluster ids are
# arbitrary, so colours are not comparable between the two panels.
panels = [
    (y_train, 'Actual Classes'),
    (train_predictions, 'K-Means Clusters'),
]
for ax, (colors, title) in zip(axes, panels):
    points = ax.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=colors, cmap='viridis', alpha=0.7)
    fig.colorbar(points, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')

fig.tight_layout()
fig.savefig(f"{results_dir}/cluster_visualization.png")
plt.show()

## 7. Save Results

In [None]:
# Per-class metrics (index kept: it holds the class / average names).
report_path = f"{results_dir}/classification_report.csv"
report_df.to_csv(report_path)
print(f"Classification report saved to {report_path}")

# Cluster id -> assigned class, one row per cluster.
mapping_df = pd.DataFrame({
    'Cluster': list(cluster_to_label.keys()),
    'Label': list(cluster_to_label.values()),
})
mapping_path = f"{results_dir}/cluster_mapping.csv"
mapping_df.to_csv(mapping_path, index=False)
print(f"Cluster mapping saved to {mapping_path}")

# One-row summary of the headline numbers.
results = dict([
    ('Model', 'KMeans'),
    ('Training Accuracy', train_accuracy),
    ('Testing Accuracy', test_accuracy),
    ('Training Time (s)', training_time),
    ('Number of Clusters', n_classes),
])
results_df = pd.DataFrame([results])
summary_path = f"{results_dir}/results_summary.csv"
results_df.to_csv(summary_path, index=False)
print(f"Results summary saved to {summary_path}")

results_df

## 8. Process Test Data and Create Submission

In [None]:
test = pd.read_csv("test.csv")
# Drop the identifier and put the columns in the training feature order, so
# the scaler receives the layout it was fitted on.
test_encoded = test.drop(columns=["id"])[list(X.columns)]

for col in test_encoded.select_dtypes(include=['object']).columns:
    if col in encoders:
        # Reuse the encoder fitted on the training data so codes line up.
        test_encoded[col] = encoders[col].transform(test_encoded[col].astype(str))
    else:
        # NOTE(review): no training encoder exists for this column, so these
        # codes come from the test data alone and may not match the training
        # representation. Kept as a best-effort fallback.
        le = LabelEncoder()
        test_encoded[col] = le.fit_transform(test_encoded[col].astype(str))

test_scaled = scaler.transform(test_encoded)
test_preds_clusters = model.predict(test_scaled)
test_preds = np.array([cluster_to_label.get(label, -1) for label in test_preds_clusters])

# Decode with the encoder that holds the real category names. If the target
# column was label-encoded together with the features, its names live in
# encoders["WeightCategory"] and le_target only maps codes to codes.
label_decoder = encoders.get("WeightCategory", le_target)
test_preds_labels = label_decoder.inverse_transform(test_preds)

submission = pd.DataFrame({
    "id": test["id"],
    "WeightCategory": test_preds_labels
})

submission.to_csv("submission_kmeans.csv", index=False)
print("\nsubmission_kmeans.csv is ready for Kaggle!")
submission.head()