# 1. Data Understanding 

The attributes related to eating habits are: Frequent consumption of high-caloric food (FAVC), Frequency of consumption of vegetables (FCVC), Number of main meals (NCP), Consumption of food between meals (CAEC), Daily consumption of water (CH2O), and Consumption of alcohol (CALC).

The attributes related to physical condition are: Calorie consumption monitoring (SCC), Physical activity frequency (FAF), Time using technology devices (TUE), and Transportation used (MTRANS).

Other variables obtained:
Family history with overweight, Gender, Age, Height and Weight

NObesity values (the target column is named NObeyesdad in the data) are:

•Underweight Less than 18.5

•Normal 18.5 to 24.9

•Overweight 25.0 to 29.9

•Obesity I 30.0 to 34.9

•Obesity II 35.0 to 39.9

•Obesity III Higher than 40



Evaluation
Submissions are evaluated using the accuracy score.

Submission File
For each id row in the test set, you must predict the class value of the target, NObeyesdad. The file should contain a header and have the following format:

id,NObeyesdad
20758,Normal_Weight
20759,Normal_Weight
20760,Normal_Weight
etc.

# 2. Import packages and data 

In [None]:
!pip install ISLP

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from ISLP import confusion_table
from ISLP.models import (ModelSpec as MS, summarize, contrast)

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)

ModuleNotFoundError: No module named 'ISLP'

In [None]:
# Print the full path of every file found under the Kaggle input directory,
# so the dataset locations used by the following cells can be checked.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, files in os.walk('/kaggle/input')
    for file_name in files
)
for path in input_paths:
    print(path)

In [None]:
# Load the competition tables; the two reads are independent of each other.
train = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

In [None]:

# Report the dimensions of both tables, then the column dtypes and non-null
# counts of the training table.
divider = "========================================="
print("Train shape:", train.shape)
print(divider)
print("Test  shape:", test.shape)
print(divider)
print("\nTrain info:")
train.info()

In [None]:
# Preview the first ten training rows.
train.head(n=10)

In [None]:
# Declare the role of every column (categorical / numeric / target) and cast
# the categorical ones to pandas' 'category' dtype.
cat_vars = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS',
]

num_vars = [
    'Age', 'Height', 'Weight', 'FCVC',
    'NCP', 'CH2O', 'FAF', 'TUE',
]

# Kept as a one-element list, so train[target] selects a single-column
# DataFrame rather than a Series.
target = ['NObeyesdad']

# Predictors are cast on both tables; the target exists only in train.
for frame in (train, test):
    frame[cat_vars] = frame[cat_vars].astype('category')
train[target] = train[target].astype('category')



In [None]:
# Calculate BMI (Weight divided by Height squared) and add it to the variables
# of both tables.
for df in (train, test):
    df['BMI'] = df['Weight'].div(df['Height'].pow(2))

# Register BMI as numeric; the membership guard keeps a re-run of this cell
# from appending it a second time.
if 'BMI' not in num_vars:
    num_vars.append('BMI')

# 3. Start EDA 

In [None]:
# Numeric summary: describe() statistics for every numeric feature in train.
numeric_summary = train[num_vars].describe()
print("\nNumeric summary (train):")
print(numeric_summary)


In [None]:
# Missingness checks: number of NaN entries per column in each table.
train_missing = train.isna().sum()
test_missing = test.isna().sum()

print("\nMissing values per column (train):")
print(train_missing)
print("\nMissing values per column (test):")
print(test_missing)

In [None]:
# Target distribution: absolute counts and relative frequencies per class.
class_counts = train[target].value_counts()
class_props = train[target].value_counts(normalize=True)

print("\nTarget distribution (counts):")
print(class_counts)
print("\nTarget distribution (proportions):")
print(class_props)

print("====================================================")
# Bar chart of target distribution (classes ordered by index, not by size)
plt.figure(figsize=(10, 4))
class_props.sort_index().plot(kind='bar')
plt.title("Class Proportions: NObeyesdad")
plt.ylabel("Proportion")
plt.xlabel("Class")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Correlations among numeric features, drawn as an image with one tick per
# feature on each axis.
corr = train[num_vars].corr()
tick_positions = range(len(num_vars))

plt.figure(figsize=(7, 6))
im = plt.imshow(corr, interpolation='nearest')
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.xticks(tick_positions, num_vars, rotation=45, ha='right')
plt.yticks(tick_positions, num_vars)
plt.title("Correlation Heatmap (Numeric Features)")
plt.tight_layout()
plt.show()

In [None]:
# Class-wise numeric means: average of each numeric feature within every
# target class.
grouped_by_class = train.groupby(target)
group_means = grouped_by_class[num_vars].mean()
print("\nClass-wise means of numeric features:")
display(group_means)

In [None]:
# Univariate histograms: one 30-bin panel per numeric feature.
# The grid is sized from len(num_vars) instead of a fixed 3x3, so adding a
# feature cannot index past the last axis and removing one does not leave an
# empty frame. With nine features this is the same 3x3, 14x10 inch figure.
n_cols = 3
n_rows = max(1, -(-len(num_vars) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(14, 10 * n_rows / 3),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
axes = axes.ravel()
for ax, col in zip(axes, num_vars):
    ax.hist(train[col].values, bins=30)
    ax.set_title(col)
# Hide any panels left over when the feature count does not fill the grid.
for ax in axes[len(num_vars):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# 4. Design Matrices 

In [None]:
# Build explicit contrasts for each categorical (treatment coding)
# NOTE(review): 'drop' is passed straight to ISLP's contrast(); presumably it
# drops one reference level per variable -- confirm against the ISLP docs.
encoded_cats = [contrast(v, 'drop') for v in cat_vars]

# 3) Assemble ModelSpec terms (categorical contrasts + numeric features)
all_terms = encoded_cats + num_vars

# 4) Fit the design on TRAIN+TEST predictors to capture ALL category levels
# Only predictor columns are concatenated; the target column is not part of
# predictor_cols, so no label information enters the fitted design.
design = MS(all_terms, intercept=True)
predictor_cols = cat_vars + num_vars
combined_predictors = pd.concat([train[predictor_cols], test[predictor_cols]], ignore_index=True)
_ = design.fit(combined_predictors)

# 5) Transform TRAIN and TEST -> fully numeric design matrices
# Both tables go through the same fitted design object.
X_full      = design.transform(train[predictor_cols]) 
X_test_full = design.transform(test[predictor_cols])
# target is a one-element list, so y_full is a single-column DataFrame
# (not a Series).
y_full      = train[target]

# 6) Train/Valid split (stratified, 80/20). Row positions are split rather
#    than the frames themselves, so X and y are sliced with the same indices
#    and stay aligned.
from sklearn.model_selection import train_test_split

row_positions = np.arange(train.shape[0])
idx_train, idx_valid = train_test_split(
    row_positions, test_size=0.20, stratify=y_full, random_state=42
)
X_train, X_valid = (X_full.iloc[rows].copy() for rows in (idx_train, idx_valid))
y_train, y_valid = (y_full.iloc[rows].copy() for rows in (idx_train, idx_valid))

# 7) Drop ISLP 'intercept'
# The presence check is made once, on X_train, and then applied to all four
# matrices; when there is no such column the originals are reused as-is.
_has_intercept = 'intercept' in X_train.columns


def _without_intercept(frame):
    """Return ``frame`` minus its 'intercept' column, or ``frame`` itself if X_train had none."""
    return frame.drop(columns=['intercept']) if _has_intercept else frame


X_train_no_int = _without_intercept(X_train)
X_valid_no_int = _without_intercept(X_valid)
X_full_no_int  = _without_intercept(X_full)
X_test_no_int  = _without_intercept(X_test_full)


# 5. Models 

In [None]:
# Standardize for Logit and SVM: each is wrapped in a Pipeline with a
# StandardScaler, so the scaler is fit only on the data the pipeline is fit on.
logit_clf = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    # `multi_class` is deliberately not passed: the parameter is deprecated in
    # recent scikit-learn releases, and with the lbfgs solver a target with
    # three or more classes is already fit as a multinomial model by default.
    ('model', LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42))
])

# Remaining candidates. LDA and Gaussian naive Bayes are used on the raw
# design matrix; the RBF SVM is scaled inside its own pipeline.
lda_clf = LDA(solver='svd', store_covariance=True)
nb_clf = GaussianNB()
svm_clf = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('model', SVC(kernel='rbf', C=3.0, gamma='scale', probability=True, random_state=42)),
])

# Name -> estimator registry, in the order the models are fitted and reported.
# The last entry is the shrinkage LDA variant (uses Ledoit–Wolf).
models = {
    'logit_multinomial': logit_clf,
    'lda': lda_clf,
    'naive_bayes': nb_clf,
    'svm_rbf': svm_clf,
    'lda_shrink': LDA(solver='lsqr', shrinkage='auto'),
}

# 6. View results 

In [None]:
# One summary dict per fitted model is appended here by the evaluation loop.
results = []


def get_estimator(m):
    """Return the underlying estimator of ``m``.

    An object exposing ``named_steps`` (as a Pipeline does) yields the step
    stored under the key 'model'; any other object is returned unchanged.
    """
    if not hasattr(m, 'named_steps'):
        return m
    return m.named_steps['model']

# Ensure 1D string labels.
# y_train / y_valid / y_full are single-column DataFrames (target is a
# one-element list), so .to_numpy() alone returns shape (n, 1). ravel()
# flattens that to the (n,) vector the estimators and metrics expect, instead
# of relying on scikit-learn to reshape it with a DataConversionWarning.
y_train_1d = y_train.astype(str).to_numpy().ravel()
y_valid_1d = y_valid.astype(str).to_numpy().ravel()
y_full_1d  = y_full.astype(str).to_numpy().ravel()

# The CV splitter is configured identically for every model, so build it once.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, mdl in models.items():
    print(f"\n=== Fitting: {name} ===")

    # Fit on the training split, then score on the held-out validation split
    mdl.fit(X_train_no_int, y_train_1d)
    y_pred = mdl.predict(X_valid_no_int)
    acc = accuracy_score(y_valid_1d, y_pred)
    print(f"Validation Accuracy: {acc:.4f}")

    # ISLP confusion table (rows=Predicted, cols=Truth)
    print("\nConfusion Table (rows=Predicted, cols=Truth):")
    C = confusion_table(y_pred, y_valid_1d)
    display(C)

    # Per-class precision / recall / F1
    print("\nClassification Report:")
    print(classification_report(y_valid_1d, y_pred, digits=3))

    # Macro AUC (One-vs-Rest); left as NaN when the estimator exposes no
    # predict_proba or when the score cannot be computed.
    macro_auc = np.nan
    est = get_estimator(mdl)
    if hasattr(est, "predict_proba"):
        y_proba = mdl.predict_proba(X_valid_no_int)
        # Binarize with the estimator's own class order so the indicator
        # columns line up with the probability columns.
        classes_est = np.array(list(map(str, est.classes_)))
        y_valid_bin = label_binarize(y_valid_1d, classes=classes_est)
        try:
            macro_auc = roc_auc_score(y_valid_bin, y_proba, average='macro')
            print(f"Macro ROC AUC (OvR): {macro_auc:.4f}")
        except Exception as e:
            print("AUC not computed:", e)

    # 5-fold CV accuracy on the full training table
    cv_scores = cross_val_score(mdl, X_full_no_int, y_full_1d, cv=cv, scoring='accuracy')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
    print(f"5-Fold CV Accuracy: mean={cv_mean:.4f}, std={cv_std:.4f}")

    results.append({
        'model': name,
        'val_accuracy': acc,
        'cv_mean_acc': cv_mean,
        'cv_std': cv_std,
        'macro_auc_ovr': macro_auc,
    })

# Tabulate the per-model summaries, best validation accuracy first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='val_accuracy', ascending=False)
print("\n=== Model Comparison (Validation) ===")
display(results_df)

# 7. Conduct a compact SVM hyperparameter sweep to increase the accuracy of the SVM model

In [None]:
# Compact SVM (RBF) hyperparameter sweep
def to_1d_str(y):
    """Convert ``y`` to a NumPy array of strings, flattening 2-D input.

    A DataFrame must have exactly one column, otherwise ValueError is raised;
    that column is then treated like a Series. A Series is converted with
    ``to_numpy()`` and anything else with ``np.asarray``. A 2-D array is
    flattened in row-major order (a single column simply becomes a vector).
    """
    if isinstance(y, pd.DataFrame):
        n_columns = y.shape[1]
        if n_columns != 1:
            raise ValueError(f"y has shape {y.shape}; expected a single column.")
        y = y.iloc[:, 0]

    values = y.to_numpy() if isinstance(y, pd.Series) else np.asarray(y)

    # Row-major flattening covers both the (n, 1) and the (n, m) case.
    if values.ndim == 2:
        values = values.ravel()
    return values.astype(str)

# Flatten the three target objects to 1-D string arrays.
y_train_1d, y_valid_1d, y_full_1d = map(to_1d_str, (y_train, y_valid, y_full))

# Pipeline: scale -> SVC(probabilities enabled)
svm_pipe = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('model', SVC(kernel='rbf', probability=True, random_state=42)),
])

# Compact grid: 5 values of C x 4 values of gamma = 20 candidates. The
# 'model__' prefix routes each parameter to the SVC step of the pipeline.
param_grid = {
    'model__C':     [0.5, 1, 2, 3, 5],
    'model__gamma': ['scale', 0.03, 0.1, 0.3],
}

# Accuracy-scored grid search over 5 stratified, shuffled folds; refit=True
# retrains the best candidate on all data passed to fit().
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
svm_gs = GridSearchCV(
    estimator=svm_pipe, param_grid=param_grid,
    scoring='accuracy', cv=cv,
    n_jobs=-1, refit=True, verbose=1,
)

# Run the search on the training split only; the validation split stays
# untouched for the checks below.
svm_gs.fit(X_train_no_int, y_train_1d)
print("Best params:", svm_gs.best_params_)
print("Best CV mean accuracy:", svm_gs.best_score_)

# Validate the refitted best candidate on the held-out split
y_pred = svm_gs.predict(X_valid_no_int)
val_acc = accuracy_score(y_valid_1d, y_pred)
print(f"\nValidation Accuracy (tuned SVM): {val_acc:.4f}")

print("\nConfusion Table (rows=Predicted, cols=Truth):")
tuned_confusion = confusion_table(y_pred, y_valid_1d)
display(tuned_confusion)

print("\nClassification Report:")
tuned_report = classification_report(y_valid_1d, y_pred, digits=3)
print(tuned_report)

# Macro AUC (OvR): binarize the truth using the fitted SVC's class order so
# the indicator columns match the probability columns.
est = svm_gs.best_estimator_.named_steps['model']
classes_est = np.array(list(map(str, est.classes_)))
y_proba = svm_gs.predict_proba(X_valid_no_int)
y_valid_bin = label_binarize(y_valid_1d, classes=classes_est)
macro_auc = roc_auc_score(y_valid_bin, y_proba, average='macro')
print(f"Macro ROC AUC (OvR): {macro_auc:.4f}")

# CV results heatmap (mean accuracy): rows are C values, columns are gamma
# values, each cell is the mean test score of that combination.
cvres = pd.DataFrame(svm_gs.cv_results_)
heat = cvres.pivot(index='param_model__C', columns='param_model__gamma', values='mean_test_score')
n_c_values, n_gamma_values = heat.shape

plt.figure(figsize=(6, 4))
plt.imshow(heat.values, aspect='auto')
plt.xticks(range(n_gamma_values), heat.columns.astype(str), rotation=45, ha='right')
plt.yticks(range(n_c_values), heat.index.astype(str))
plt.title('SVM Grid CV Mean Accuracy')
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.tight_layout()
plt.show()

# Train best on full training set and create submission.
# NOTE: best_svm is the very object held by svm_gs.best_estimator_, so this
# refit also changes what svm_gs.predict() uses from here on.
best_svm = svm_gs.best_estimator_
best_svm.fit(X_full_no_int, y_full_1d)

# Predict the test rows and write the id / NObeyesdad submission file.
test_pred = best_svm.predict(X_test_no_int)
out_path = "/kaggle/working/submission.csv"
sub_tuned = pd.DataFrame({'id': test['id'], 'NObeyesdad': test_pred})
sub_tuned.to_csv(out_path, index=False)
print("Saved:", out_path)

# (Optional) register in your models dict for later reuse
models['svm_rbf_tuned'] = best_svm