# Obesity Classification using Decision Tree

This notebook trains and evaluates a Decision Tree Classifier model on the obesity dataset.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from datetime import datetime
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Notebook-wide plot defaults: white-grid theme, 10x6 figures, 12-point text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

## 1. Load and Preprocess Data

In [None]:
def load_and_preprocess_data(file_path):
    """Load the training CSV and prepare features and labels for modelling.

    The function reads the CSV, mean-imputes missing numeric values,
    label-encodes the categorical feature columns, label-encodes the
    ``WeightCategory`` target, splits the rows 75/25 (``random_state=42``)
    and standardizes the features with a scaler fitted on the training rows.
    Progress information is printed along the way.

    Args:
        file_path: Path of a CSV file that contains an ``id`` column, a
            ``WeightCategory`` target column and the feature columns.

    Returns:
        A 12-tuple ``(X, X_scaled, y, y_encoded, X_train, X_test, y_train,
        y_test, categorical_cols, encoders, le_target, scaler)``:

        * ``X`` - feature DataFrame (``id`` and target removed, categorical
          features already label-encoded).
        * ``X_scaled`` - ``X`` transformed by ``scaler``.
        * ``y`` - target column with its original values.
        * ``y_encoded`` - ``y`` encoded by ``le_target``.
        * ``X_train, X_test, y_train, y_test`` - the scaled/encoded split.
        * ``categorical_cols`` - every object-dtype column found in the file
          (this can include the target column).
        * ``encoders`` - ``{column: LabelEncoder}`` for the encoded feature
          columns; the target is handled by ``le_target`` only.
        * ``le_target`` - the LabelEncoder fitted on ``y``.
        * ``scaler`` - the fitted StandardScaler.
    """
    target_col = 'WeightCategory'

    df = pd.read_csv(file_path)

    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(f"Missing values per column:\n{missing_values[missing_values > 0]}")
        # numeric_only=True restricts the means to numeric columns; without it
        # recent pandas versions also try to average the string columns and
        # raise. Non-numeric columns are left untouched by this fill.
        df.fillna(df.mean(numeric_only=True), inplace=True)

    categorical_cols = df.select_dtypes(include=['object']).columns
    print(f"Categorical columns: {categorical_cols.tolist()}")

    encoders = {}
    for col in categorical_cols:
        # The target must keep its original values here: encoding it in this
        # loop would make le_target learn integer codes instead of the class
        # names, so inverse_transform could never return the names.
        if col == target_col:
            continue
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

    X = df.drop(['id', target_col], axis=1)
    y = df[target_col]

    print("\nTarget class distribution:")
    print(y.value_counts())

    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # Split first, then fit the scaler on the training rows only, so that
    # statistics of the held-out rows do not influence the transform.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.25, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    X_scaled = scaler.transform(X)

    return X, X_scaled, y, y_encoded, X_train, X_test, y_train, y_test, categorical_cols, encoders, le_target, scaler

# Training data file, given as a relative path.
file_path = "train.csv"

# Run the preprocessing once and unpack every object the later cells rely on.
(
    X, X_scaled, y, y_encoded,
    X_train, X_test, y_train, y_test,
    categorical_cols, encoders, le_target, scaler,
) = load_and_preprocess_data(file_path)

## 2. Create Results Directory

In [None]:
# Directory that receives the figures and CSV files written by later cells.
results_dir = "decision_tree_results"
# exist_ok=True makes creation idempotent without a separate existence check
# (no check-then-create race), and raises immediately if the path exists but
# is not a directory instead of failing later at the first save.
os.makedirs(results_dir, exist_ok=True)
print(f"Results will be saved to '{results_dir}' directory.")

## 3. Find Optimal Tree Depth

In [None]:
# Candidate depths 1..20; one accuracy value per depth is stored for each split.
max_depths = list(range(1, 21))
train_accuracy = []
test_accuracy = []

for candidate_depth in max_depths:
    # Same seed for every candidate, so only the depth limit varies.
    candidate = DecisionTreeClassifier(max_depth=candidate_depth, random_state=42)
    candidate.fit(X_train, y_train)

    train_accuracy.append(accuracy_score(y_train, candidate.predict(X_train)))
    test_accuracy.append(accuracy_score(y_test, candidate.predict(X_test)))

# Draw both accuracy curves against depth and store the figure on disk.
plt.figure(figsize=(10, 6))
for curve, curve_label in ((train_accuracy, 'Training Accuracy'),
                           (test_accuracy, 'Testing Accuracy')):
    plt.plot(max_depths, curve, label=curve_label, marker='o')
plt.xlabel('Maximum Tree Depth')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Maximum Tree Depth')
plt.legend()
plt.grid(True)
plt.xticks(max_depths)
plt.tight_layout()
plt.savefig(f"{results_dir}/tree_depth_analysis.png")
plt.show()

# list.index returns the first match, so ties go to the shallowest depth.
best_test_accuracy = max(test_accuracy)
optimal_depth = max_depths[test_accuracy.index(best_test_accuracy)]
print(f"Optimal tree depth: {optimal_depth} with testing accuracy: {best_test_accuracy:.4f}")

## 4. Train and Evaluate Decision Tree with Optimal Depth

In [None]:
# Fit the final tree at the selected depth and time the fit.
start_time = time()
model = DecisionTreeClassifier(max_depth=optimal_depth, random_state=42)
model.fit(X_train, y_train)
# Stop the clock right after fit() so the reported "Training Time" does not
# include the predictions and metric computations below.
training_time = time() - start_time

y_pred = model.predict(X_test)
train_predictions = model.predict(X_train)

# NOTE: from here on these two names hold single scores; the depth-search
# cell used the same names for its per-depth lists.
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Training Time: {training_time:.2f} seconds")

# output_dict=True returns the report as nested dicts; transposing the
# resulting DataFrame puts one class/average per row.
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print("\nClassification Report:")
print(report_df)

## 5. Visualize Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions (true labels first, predictions second).
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with integer cell counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - Decision Tree Classifier')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()

fig.savefig(f"{results_dir}/confusion_matrix.png")
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Pair each training feature column with the importance reported by the fitted model.
feature_names = X.columns
feature_importances = model.feature_importances_

# Most important feature first.
importances_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of all importances, saved next to the other results.
plt.figure(figsize=(12, 8))
plt.barh(y=importances_df['Feature'], width=importances_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Decision Tree Feature Importances')
plt.tight_layout()
plt.savefig(f"{results_dir}/feature_importances.png")
plt.show()

print("Top 10 Features by Importance:")
print(importances_df.head(n=10))

## 7. Visualize Decision Tree (simplified)

In [None]:
# A separate depth-3 tree, fitted only so the plotted diagram stays readable;
# the evaluated model above is not modified.
simple_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
simple_tree.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
# feature_names is a pandas Index; convert it to a plain list because
# plot_tree's parameter validation has required a list in at least one
# scikit-learn release, and a list is accepted by all of them.
plot_tree(simple_tree,
          feature_names=list(feature_names),
          class_names=[str(c) for c in simple_tree.classes_],
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Simplified Decision Tree (max_depth=3)')
plt.savefig(f"{results_dir}/simplified_tree.png", bbox_inches='tight', dpi=300)
plt.show()

## 8. Save Results

In [None]:
# One-row summary of the final model run, assembled before anything is written.
results = {
    'Model': 'DecisionTreeClassifier',
    'Training Accuracy': train_accuracy,
    'Testing Accuracy': test_accuracy,
    'Training Time (s)': training_time,
    'Optimal Depth': optimal_depth
}
results_df = pd.DataFrame([results])

# The report keeps its index (the row labels); the other two tables drop theirs.
report_df.to_csv(path_or_buf=f"{results_dir}/classification_report.csv")
print(f"Classification report saved to {results_dir}/classification_report.csv")

importances_df.to_csv(path_or_buf=f"{results_dir}/feature_importances.csv", index=False)
print(f"Feature importances saved to {results_dir}/feature_importances.csv")

results_df.to_csv(path_or_buf=f"{results_dir}/results_summary.csv", index=False)
print(f"Results summary saved to {results_dir}/results_summary.csv")

# Last expression of the cell: shows the summary table in the notebook.
results_df

## 9. Process Test Data and Create Submission

In [None]:
# Load the unlabeled data; `test` itself is kept unchanged for its ids.
test = pd.read_csv("test.csv")
# drop() already returns a new DataFrame, so no explicit copy is needed.
test_encoded = test.drop(columns=["id"])

for col in test_encoded.select_dtypes(include=['object']).columns:
    if col in encoders:
        # Reuse the encoder fitted on the training data so codes match;
        # a category the encoder has not seen makes transform() raise.
        test_encoded[col] = encoders[col].transform(test_encoded[col].astype(str))
    else:
        # Best-effort fallback: no training encoder exists for this column,
        # so the codes produced here are not tied to the training data.
        print(f"Warning: no training encoder for column '{col}'; fitting a new one on the test data.")
        le = LabelEncoder()
        test_encoded[col] = le.fit_transform(test_encoded[col].astype(str))

# Present the features in the training column order before scaling.
test_encoded = test_encoded[list(X.columns)]

test_scaled = scaler.transform(test_encoded)
test_preds = model.predict(test_scaled)
test_preds_labels = le_target.inverse_transform(test_preds)
# If the target column also went through the per-column encoders, le_target
# only ever saw integer codes; undo that first encoding as well so the
# submission contains the original category names.
if "WeightCategory" in encoders:
    test_preds_labels = encoders["WeightCategory"].inverse_transform(test_preds_labels)

submission = pd.DataFrame({
    "id": test["id"],
    "WeightCategory": test_preds_labels
})

submission.to_csv("submission_decision_tree.csv", index=False)
print("\nsubmission_decision_tree.csv is ready for Kaggle!")
# Last expression of the cell: previews the first rows in the notebook.
submission.head()