In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Logs this session in to Kaggle so private data sources can be fetched.
# The `kagglehub` module imported here is also used by the dataset download cell.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the dataset through kagglehub and keep the value it returns
# (presumably the local directory holding the downloaded files -- confirm
# against the kagglehub documentation). Requires `kagglehub` from the login cell.
shrinithiandalt_obesity_dataset1_path = kagglehub.dataset_download('shrinithiandalt/obesity-dataset1')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================================
# OBESITY RISK FACTOR ANALYSIS & MODEL PREDICTION
# ============================================
# Single script (college-project style)
# - PCA block (standalone) added BEFORE preprocessing/modeling
# - PCA is ANALYTICAL ONLY and does NOT change the model pipeline
# - Only correlation heatmap is kept from EDA (on encoded features)
# - No clustering; model logic preserved exactly
# ============================================

# -----------------------------
# 0. LIBRARIES
# -----------------------------
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# -----------------------------
# 📌 CHANGES MADE (vs original code)
# -----------------------------
# 1. Added standalone PCA analysis block BEFORE preprocessing/modeling (PCA for analysis only).
# 2. Kept all model training/tuning/cv logic unchanged.
# 3. Removed clustering and any other EDA plots; kept only correlation heatmap (on encoded features).
# 4. Reworked variable names to student-friendly style (df_train, X_scaled, best_xgb, etc.).
# 5. PCA uses its own preprocessing pipeline (imputer + scaler + one-hot) and prints explained variance.
# Note: PCA results are printed and plotted but NOT used in model training.

# -----------------------------
# 1. LOAD DATA (train1, train2, test)
# -----------------------------
# Resolve the dataset directory. Prefer the Kaggle input mount; if it does not
# exist (e.g. when running outside Kaggle), fall back to the directory returned
# by kagglehub in the download cell, when that cell has been run. If neither is
# available the original path is kept, so pd.read_csv raises the usual
# FileNotFoundError.
DATA_DIR = "/kaggle/input/obesity-dataset1"
if not os.path.isdir(DATA_DIR):
    DATA_DIR = globals().get("shrinithiandalt_obesity_dataset1_path", DATA_DIR)

df_train1 = pd.read_csv(os.path.join(DATA_DIR, "train1.csv"))
df_train2 = pd.read_csv(os.path.join(DATA_DIR, "train2.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("train1 shape:", df_train1.shape)
print("train2 shape:", df_train2.shape)
print("test shape :", df_test.shape)

# Combine the two training parts (this combined set will be used both for PCA analysis and for modeling).
# ignore_index=True gives the stacked frame a fresh 0..n-1 row index.
combined_train = pd.concat([df_train1, df_train2], axis=0, ignore_index=True)
print("combined_train shape:", combined_train.shape)

# ====================================================
# PART A — PCA ANALYSIS (Standalone, BEFORE modeling)
# Preprocessing for this analysis lives in its own pipeline;
# combined_train and the later modeling data are left untouched.
# ====================================================
print("\n=== PCA ANALYSIS (Standalone) ===")

# PCA features = every column except the row id and the target
target_col = 'WeightCategory'
non_feature_cols = {'id', target_col}
pca_feature_cols = [col for col in combined_train.columns if col not in non_feature_cols]
X_pca_raw = combined_train.loc[:, pca_feature_cols].copy()

# Split the PCA features by dtype: int64/float64 are numeric, object is categorical
numeric_frame = X_pca_raw.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_pca_raw.select_dtypes(include=['object'])
numeric_cols_pca = numeric_frame.columns.tolist()
categorical_cols_pca = categorical_frame.columns.tolist()

print("Numeric cols for PCA:", numeric_cols_pca)
print("Categorical cols for PCA:", categorical_cols_pca)

# Build preprocessing pipelines for PCA (impute + scale numeric, impute + one-hot categorical)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# One-hot encoder settings:
# - drop='first' removes one redundant (perfectly collinear) dummy column per feature
# - handle_unknown='ignore' encodes categories unseen during fit as all zeros
# - dense output is requested explicitly
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse`
# in 1.4, so try the current keyword first and fall back for older installs.
onehot_options = dict(drop='first', handle_unknown='ignore')
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, **onehot_options)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot_encoder = OneHotEncoder(sparse=False, **onehot_options)

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Route each column group through its own pipeline
preprocessor_pca = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_cols_pca),
    ('cat', cat_pipeline, categorical_cols_pca)
])

# PCA pipeline: preprocessor then PCA (n_components=None keeps all components initially)
pca_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_pca),
    ('pca', PCA(n_components=None))
])

# Fit the standalone PCA pipeline on the PCA feature matrix (separate from the model pipeline)
X_pca_transformed = pca_pipeline.fit_transform(X_pca_raw)

# Per-component explained variance and its running total
pca_model = pca_pipeline.named_steps['pca']
explained_var = pca_model.explained_variance_ratio_
cum_explained_var = explained_var.cumsum()

# Report the share of variance captured by each principal component
print("\nExplained variance ratio by principal component (PCA):")
for pc_number, ratio in enumerate(explained_var, start=1):
    print(f"PC{pc_number}: {ratio:.4f}")

# Report the running total; slicing to 20 also handles the "fewer than 20" case
print("\nCumulative explained variance (first 20 shown or all if less):")
for pc_number, running_total in enumerate(cum_explained_var[:20], start=1):
    print(f"PC{pc_number}: Cumulative = {running_total:.4f}")

# Line plot of cumulative explained variance against the number of components
fig, ax = plt.subplots(figsize=(8, 5))
component_counts = range(1, len(cum_explained_var) + 1)
ax.plot(component_counts, cum_explained_var, marker='o')
ax.set_title("PCA - Cumulative Explained Variance (Standalone Analysis)")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance Ratio")
ax.grid(True)
plt.show()

print("=== End of PCA Analysis (standalone) ===\n")

# ====================================================
# PART B — PREPROCESSING & CORRELATION (for modeling)
# ====================================================
# The following encoding is the same approach used in the original script.
# We will encode categorical features in train & test, then show correlation heatmap
# on encoded features (this heatmap is the only EDA plot requested).
# ====================================================

# Model on an independent copy so combined_train stays exactly as loaded
df_train = combined_train.copy()

# Integer-encode the target column; le_target is kept so predictions can be
# mapped back to the original labels later
le_target = LabelEncoder().fit(df_train['WeightCategory'])
df_train['WeightCategory'] = le_target.transform(df_train['WeightCategory'])

# Helper: encode object columns with LabelEncoder (keeps encoders for test mapping)
def encode_features(df, encoders=None):
    """Label-encode every object-dtype feature column of ``df`` in place.

    The ``'WeightCategory'`` column is always skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned, so pass a
        copy if the original must be preserved.
    encoders : dict or None
        Mapping of column name -> fitted ``LabelEncoder``. Columns found in the
        mapping are encoded with the stored encoder. Any other object column
        gets a new encoder fitted on ``df``, which is stored in the mapping.
        A falsy value (``None`` or an empty dict) starts a fresh mapping.

    Returns
    -------
    tuple
        ``(df, encoders)``: the encoded frame and the encoder mapping.

    Notes
    -----
    Values are compared as strings in both branches, matching how the encoders
    are fitted. Values that the stored encoder has not seen are encoded as -1.
    """
    encoders = encoders or {}
    for col in df.columns:
        if df[col].dtype == 'object' and col != 'WeightCategory':
            # Stringify once so fitting and later lookups see the same representation
            values = df[col].astype(str)
            if col not in encoders:
                le = LabelEncoder()
                df[col] = le.fit_transform(values)
                encoders[col] = le
            else:
                le = encoders[col]
                # A class's code is its position in classes_; one dict lookup per
                # value replaces a separate le.transform call for every row
                code_of = {label: code for code, label in enumerate(le.classes_)}
                # Unseen categories come back as NaN from map -> encode them as -1
                df[col] = values.map(code_of).fillna(-1).astype('int64')
    return df, encoders

# Fit the encoders on the training frame, then reuse them on a copy of the test frame
df_train, feature_encoders = encode_features(df_train)
df_test_enc, _ = encode_features(df_test.copy(), feature_encoders)

# Pairwise correlations between the encoded features (id and target excluded)
excluded_from_corr = ('id', 'WeightCategory')
feature_cols_encoded = [name for name in df_train.columns if name not in excluded_from_corr]
corr_matrix = df_train.loc[:, feature_cols_encoded].corr()

# Heatmap of the correlation matrix, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Heatmap (Encoded Features)")
plt.show()

# ====================================================
# PART C — MODEL PREPARATION (Scaling) and MODELING (unchanged logic)
# ====================================================
# Training target and feature matrix, plus the matching test feature matrix
y = df_train['WeightCategory']
X = df_train.drop(columns=['id', 'WeightCategory'])
X_test = df_test_enc.drop(columns=['id'])

# Standardize features: statistics are learned from the training matrix only
# and then applied to both training and test matrices
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# Cross-validation strategy: stratified 5-fold, repeated twice, fixed seed
cv_strategy = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=42)

# -----------------------------
# MODEL 1: Logistic Regression (tuning)
# -----------------------------
lr_param_grid = {
    'C': np.logspace(-3, 2, 10),
    'solver': ['lbfgs', 'saga'],
    'max_iter': [500, 1000]
}
# `multi_class` is deprecated since scikit-learn 1.5. With the lbfgs/saga solvers
# used here, the default already fits a multinomial model for multiclass targets,
# so the explicit argument is dropped.
lr_model = LogisticRegression(random_state=42)
lr_tuner = RandomizedSearchCV(
    lr_model,
    lr_param_grid,
    n_iter=20,
    cv=cv_strategy,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42  # seed the candidate sampling so tuning is reproducible
)

print("\n--- Tuning Logistic Regression ---")
lr_tuner.fit(X_scaled, y)
best_lr = lr_tuner.best_estimator_
# Scored with the same CV strategy and data used for tuning, so this accuracy
# is an optimistic estimate rather than a held-out one.
acc_lr = np.mean(cross_val_score(best_lr, X_scaled, y, cv=cv_strategy, scoring='accuracy'))

# -----------------------------
# MODEL 2: Random Forest (tuning)
# -----------------------------
rf_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
rf_model = RandomForestClassifier(random_state=42)
rf_tuner = RandomizedSearchCV(
    rf_model,
    rf_param_grid,
    n_iter=30,
    cv=cv_strategy,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42  # seed the candidate sampling so tuning is reproducible
)

print("\n--- Tuning Random Forest ---")
rf_tuner.fit(X_scaled, y)
best_rf = rf_tuner.best_estimator_
# Scored with the same CV strategy and data used for tuning, so this accuracy
# is an optimistic estimate rather than a held-out one.
acc_rf = np.mean(cross_val_score(best_rf, X_scaled, y, cv=cv_strategy, scoring='accuracy'))

# -----------------------------
# MODEL 3: AdaBoost (tuning)
# -----------------------------
ada_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
ada_model = AdaBoostClassifier(random_state=42)
# The grid holds 4 x 5 = 20 combinations, equal to n_iter, so every combination
# is tried. The seed fixes the evaluation order (and therefore tie-breaking).
ada_tuner = RandomizedSearchCV(
    ada_model,
    ada_param_grid,
    n_iter=20,
    cv=cv_strategy,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42  # seed the candidate sampling so tuning is reproducible
)

print("\n--- Tuning AdaBoost ---")
ada_tuner.fit(X_scaled, y)
best_ada = ada_tuner.best_estimator_
# Scored with the same CV strategy and data used for tuning, so this accuracy
# is an optimistic estimate rather than a held-out one.
acc_ada = np.mean(cross_val_score(best_ada, X_scaled, y, cv=cv_strategy, scoring='accuracy'))

# -----------------------------
# MODEL 4: XGBoost (deep tuning)
# -----------------------------
print("\n🔧 Starting Deep Hyperparameter Tuning on XGBoost (with Refined CV)...")

xgb_param_grid = {
    'n_estimators': [400, 600, 800, 1000],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.005, 0.01, 0.03, 0.05],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [1, 1.5, 2, 3],
    'reg_alpha': [0, 0.1, 0.3, 0.5]
}

# `use_label_encoder` is obsolete in current XGBoost releases and only produces
# an "unused parameter" warning on every fit. The target is already
# integer-encoded by le_target, so the argument is dropped.
xgb_base = XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    tree_method='hist'
)

xgb_tuner = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=xgb_param_grid,
    n_iter=50,
    scoring='accuracy',
    cv=cv_strategy,
    verbose=1,
    n_jobs=-1,
    random_state=42  # seed the candidate sampling so tuning is reproducible
)

print("\n--- Tuning XGBoost ---")
xgb_tuner.fit(X_scaled, y)
best_xgb = xgb_tuner.best_estimator_
# Scored with the same CV strategy and data used for tuning, so this accuracy
# is an optimistic estimate rather than a held-out one.
acc_xgb = np.mean(cross_val_score(best_xgb, X_scaled, y, cv=cv_strategy, scoring='accuracy'))

# ====================================================
# PERFORMANCE SUMMARY
# ====================================================
# Assemble the report line by line and print it in a single call
summary_lines = [
    "\n============================",
    "MODEL PERFORMANCE COMPARISON",
    "============================",
    f"Logistic Regression Accuracy : {acc_lr:.4f}",
    f"Random Forest Accuracy       : {acc_rf:.4f}",
    f"AdaBoost Accuracy            : {acc_ada:.4f}",
    f"XGBoost Accuracy             : {acc_xgb:.4f}",
    "============================",
]
print("\n".join(summary_lines))

# ====================================================
# FINAL PREDICTIONS (use best_xgb as final model)
# ====================================================
# Predict encoded class ids on the scaled test features, then map them back
# to the original target labels with the target encoder
final_preds = best_xgb.predict(X_test_scaled)
final_labels = le_target.inverse_transform(final_preds)

# Submission frame: test ids alongside the predicted labels
submission = df_test[['id']].assign(WeightCategory=final_labels)
submission.to_csv("final_xgb_predictions.csv", index=False)
print("\nSaved: final_xgb_predictions.csv")

# ====================================================
# BAR CHART: Model Accuracy Comparison (kept as original)
# ====================================================
model_names = ['Logistic Regression', 'Random Forest', 'AdaBoost', 'XGBoost']
accuracies = [acc_lr, acc_rf, acc_ada, acc_xgb]

# One bar per model on an explicit Axes; y-axis fixed to the [0, 1] accuracy range
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=model_names, y=accuracies, palette='viridis', ax=ax)
ax.set_title("Model Accuracy Comparison", fontsize=14, weight='bold')
ax.set_ylabel("Accuracy Score")
ax.set_xlabel("Model")
ax.set_ylim(0, 1)

# Write each score just above its bar
for bar_position, score in enumerate(accuracies):
    ax.text(bar_position, score + 0.01, f"{score:.3f}", ha='center', fontsize=10, weight='bold')
plt.show()

# ============================================
# END OF SCRIPT
# ============================================