In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib


# ----------------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------------
# Read the dataset from the working directory, then preview it and report its dimensions.
df = pd.read_csv("health_data.csv")

print("FIRST 5 ROWS:", df.head(), sep="\n")
print("\nDATA SHAPE:", df.shape)


# ----------------------------------------------------------
# 2. VISUALIZATION BEFORE CLEANING
# ----------------------------------------------------------
# Numeric columns to inspect, with the target left out: it is the label being
# predicted, not a measurement to examine for outliers.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
if 'disease_risk' in num_cols:
    num_cols.remove('disease_risk')

print("\nBoxplots for numeric columns (BEFORE imputation).")
if num_cols:
    # One horizontal boxplot per column, stacked vertically. squeeze=False keeps
    # `axes` two-dimensional even when there is only a single column.
    fig, axes = plt.subplots(
        len(num_cols), 1, figsize=(10, len(num_cols) * 2.2), squeeze=False
    )
    for ax, col in zip(axes[:, 0], num_cols):
        # Missing values are dropped only for plotting; df itself is unchanged.
        ax.boxplot(df[col].dropna(), vert=False)
        ax.set_title(f"Boxplot - {col} (before imputation)")
        ax.set_xlabel("")
    fig.tight_layout()
    plt.show()
else:
    # Nothing to draw; skip the figure instead of requesting a zero-height one.
    print("No numeric columns to plot.")

# ----------------------------------------------------------
# 3. HANDLE MISSING VALUES
# ----------------------------------------------------------


target_col = "disease_risk"

# A missing label cannot be imputed without inventing ground truth (and a median
# can land between class values), so rows without a target are dropped instead.
# The original row labels are kept; .copy() makes the result an independent frame.
if target_col in df.columns:
    df = df.dropna(subset=[target_col]).copy()

# Column groups used for imputation here and by the later outlier/plotting steps.
# The target is left out of both groups so it is never treated as a feature.
cat_cols = [c for c in df.select_dtypes(include=['object', 'bool']).columns if c != target_col]
num_cols = [c for c in df.select_dtypes(include=['int64', 'float64']).columns if c != target_col]

# fill numeric gaps with each column's median
if num_cols:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# fill categorical gaps with each column's most frequent value
if cat_cols:
    modes = df[cat_cols].mode()
    # mode() returns no rows when every categorical value is missing; there is
    # nothing to fill with in that case, so the columns are left as they are.
    if not modes.empty:
        df[cat_cols] = df[cat_cols].fillna(modes.iloc[0])

# ----------------------------------------------------------
# 4. REMOVE OUTLIERS USING IQR
# ----------------------------------------------------------
# Outlier filtering applies to feature columns only. Filtering on the target
# would discard rows because of their label: for a binary label whose minority
# class is under 25% of rows, Q1 == Q3, the IQR is 0 and every minority-class
# row would be removed.
target_col = "disease_risk"
outlier_cols = [c for c in num_cols if c != target_col]

# Every column's bounds are computed from the same (imputed) frame and combined
# into one row mask, so the result does not depend on the order of the columns.
keep_rows = np.ones(len(df), dtype=bool)
for col in outlier_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    if iqr == 0:
        # Degenerate spread (e.g. a 0/1 flag or a mostly-constant column): the
        # bounds collapse to a single value and would drop every other row.
        continue
    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    keep_rows &= df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr).to_numpy()

# Independent copy so later steps never write through to df.
clean_df = df.loc[keep_rows].copy()

print("\nNEW SHAPE AFTER OUTLIER REMOVAL:", clean_df.shape)


# ----------------------------------------------------------
# 5. VISUALIZATION AFTER CLEANING
# ----------------------------------------------------------
# Same feature panels as the pre-imputation figure: the target is a label, not a
# measurement, so it is left out of the boxplots.
plot_cols = [c for c in num_cols if c != "disease_risk"]

# Two figures: the imputed frame (df), then the frame that remains once outlier
# rows have been removed (clean_df), so the effect of that step is visible.
plot_stages = [
    (df, "AFTER imputation, BEFORE outlier removal", "after imputation"),
    (clean_df, "AFTER outlier removal", "after outlier removal"),
]

for frame, banner, title_tag in plot_stages:
    print(f"\nBoxplots for numeric columns ({banner}).")
    if not plot_cols:
        # No numeric features: skip rather than request a zero-height figure.
        continue
    # One horizontal boxplot per column, stacked vertically. squeeze=False keeps
    # `axes` two-dimensional even when there is only a single column.
    fig, axes = plt.subplots(
        len(plot_cols), 1, figsize=(10, len(plot_cols) * 2.2), squeeze=False
    )
    for ax, col in zip(axes[:, 0], plot_cols):
        ax.boxplot(frame[col].dropna(), vert=False)
        ax.set_title(f"Boxplot - {col} ({title_tag})")
        ax.set_xlabel("")
    fig.tight_layout()
    plt.show()


# ----------------------------------------------------------
# 6. TRAIN–TEST SPLIT
# ----------------------------------------------------------
# Features are every column except the target; the target is the label vector.
X = clean_df.drop("disease_risk", axis=1)
y = clean_df["disease_risk"]

# One-hot encode text/categorical features so the scaler and models downstream
# receive numeric input only. Frames that are already all-numeric pass through
# with their columns unchanged.
X = pd.get_dummies(X, dtype=float)

# Stratify so both splits keep the class proportions of y. Stratification needs
# at least two rows per class and a test split with room for every class, so it
# is only requested when those conditions hold; otherwise a plain random split.
test_fraction = 0.2
class_counts = y.value_counts()
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and len(class_counts) <= test_fraction * len(y)
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=42,
    stratify=y if can_stratify else None,
)


# ----------------------------------------------------------
# 7. SCALING (needed for KNN)
# ----------------------------------------------------------
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))


# ----------------------------------------------------------
# 8. MODEL 1 — RANDOM FOREST
# ----------------------------------------------------------
# Random forest with 100 trees and a fixed seed, trained on the scaled training
# split and used to predict the scaled test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

# Hold-out accuracy and F1 (f1_score is called with its default settings).
rf_acc, rf_f1 = accuracy_score(y_test, rf_pred), f1_score(y_test, rf_pred)


# ----------------------------------------------------------
# 9. MODEL 2 — KNN
# ----------------------------------------------------------
# 5-nearest-neighbours classifier, trained on the scaled training split and used
# to predict the scaled test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

# Hold-out accuracy and F1 (f1_score is called with its default settings).
knn_acc, knn_f1 = accuracy_score(y_test, knn_pred), f1_score(y_test, knn_pred)


# ----------------------------------------------------------
# 10. SIMPLE MODEL COMPARISON (bare minimum)
# ----------------------------------------------------------
# Label/value pairs in the order they are reported: random forest, then KNN.
metric_lines = [
    ("RF Accuracy:", rf_acc),
    ("RF F1:", rf_f1),
    ("KNN Accuracy:", knn_acc),
    ("KNN F1:", knn_f1),
]
for label, value in metric_lines:
    print(label, value)


# ----------------------------------------------------------
# 11. CONFUSION MATRIX (RANDOM FOREST)
# ----------------------------------------------------------
# Confusion matrix of the random-forest predictions on the test split, drawn as
# an annotated heatmap (integer cell labels).
cm = confusion_matrix(y_test, rf_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=ax)
ax.set_title("Random Forest - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


# ----------------------------------------------------------
# 12. SAVE MODELS AND CREATE SUBMISSION CSV
# ----------------------------------------------------------

# Save models and scaler
# Persist both fitted classifiers and the fitted scaler to the working directory.
for artifact, target_path in (
    (rf, "rf_model_simple.joblib"),
    (knn, "knn_model_simple.joblib"),
    (scaler, "scaler_simple.joblib"),
):
    joblib.dump(artifact, target_path)

# Submission-style table: the test split's row labels paired with the
# predictions made for those rows.
test_ids = X_test.index
test_pred = rf_pred  # swap in knn_pred to export the KNN predictions instead
sub = pd.DataFrame({'id': test_ids, 'Prediction': test_pred})
sub.to_csv("submission.csv", index=False)

print("Saved: submission.csv, rf_model_simple.joblib, knn_model_simple.joblib, scaler_simple.joblib")