In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Show every file available under the Kaggle input directory, one full path
# per line, so the exact dataset locations are visible before loading anything.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score   # CHANGED FOR REGRESSION
from sklearn.ensemble import RandomForestRegressor                              # CHANGED FOR REGRESSION
from sklearn.neighbors import KNeighborsRegressor                               # CHANGED FOR REGRESSION
import joblib

# Load Data
# The CSV is looked up in the working directory first (the original behaviour).
# If it is not there, fall back to searching the Kaggle input directory, which
# is where attached datasets are mounted; the first match is used.
DATA_FILE = "healthcare_dataset.csv"
data_path = DATA_FILE
if not os.path.exists(data_path):
    for folder, _, names in os.walk("/kaggle/input"):
        if DATA_FILE in names:
            data_path = os.path.join(folder, DATA_FILE)
            break

# If the file was found nowhere, data_path is still the bare file name and
# read_csv raises the same FileNotFoundError as before.
df = pd.read_csv(data_path)

# ---------------------------------------------------
# IDENTIFY NUMERIC & CATEGORICAL COLUMNS
# ---------------------------------------------------

# Numeric feature columns: every numeric-dtype column except the regression
# target "disease_risk", which must not be treated as a model input.
num_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name != "disease_risk"
]

# Categorical feature columns: every column pandas loaded with object dtype.
cat_cols = list(df.select_dtypes(include="object").columns)

# ---------------------------------------------------
# BOX PLOTS BEFORE IMPUTATION
# ---------------------------------------------------

# One horizontal boxplot per numeric feature, stacked vertically in a single
# figure whose height grows with the number of features.
print("\nBoxplots for numeric columns (BEFORE imputation).")
n_numeric = len(num_cols)
fig = plt.figure(figsize=(10, n_numeric * 2.2))
for position, column in enumerate(num_cols, start=1):
    ax = fig.add_subplot(n_numeric, 1, position)
    # Missing values are dropped for plotting only; df itself is not modified.
    ax.boxplot(df[column].dropna(), vert=False)
    ax.set_title(f"Boxplot - {column} (before imputation)")
fig.tight_layout()
plt.show()

# ---------------------------------------------------
# HANDLE MISSING VALUES
# ---------------------------------------------------

# Target → drop
# The target is deliberately excluded from num_cols, so nothing below fills it.
# Rows without a "disease_risk" value cannot be used for supervised training
# and would reach the regressors as NaN targets, so they are removed here.
if "disease_risk" in df.columns:
    df = df.dropna(subset=["disease_risk"])

# NOTE(review): the medians and modes below are computed on the full dataset,
# before the train/test split, so test rows influence the fill values.
# Consider fitting the imputation on the training split only.

# Numeric → median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical → mode
for col in cat_cols:
    mode_val = df[col].mode()
    # mode() is empty when the column has no non-missing values at all; use a
    # literal placeholder in that case instead of indexing into nothing.
    fill_value = mode_val.iloc[0] if len(mode_val) > 0 else "Unknown"
    df[col] = df[col].fillna(fill_value)

# ---------------------------------------------------
# OUTLIER REMOVAL (IQR METHOD)
# ---------------------------------------------------

# Keep a row only if every numeric feature lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its column.
#
# All bounds are computed on the same rows (df as it is on entry) and combined
# into one mask that is applied once at the end. Filtering inside the loop
# would recompute the quartiles of later columns on already-trimmed data and
# make the result depend on column order.
keep_rows = pd.Series(True, index=df.index)
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # A zero IQR (e.g. a 0/1 flag or a mostly-constant column) collapses both
    # bounds to a single value, which would drop every row holding any other
    # value. A NaN IQR (column with no data) would drop every row. Such columns
    # carry no usable spread for this rule, so they are skipped.
    if not IQR > 0:
        continue
    low = Q1 - 1.5 * IQR
    high = Q3 + 1.5 * IQR
    keep_rows = keep_rows & (df[col] >= low) & (df[col] <= high)

df = df[keep_rows]

# ---------------------------------------------------
# BOX PLOTS AFTER IMPUTATION & OUTLIER REMOVAL
# ---------------------------------------------------

# Same stacked horizontal boxplots as before cleaning, drawn from the current
# (imputed and outlier-filtered) frame so the two figures can be compared.
print("\nBoxplots for numeric columns (AFTER cleaning).")
n_numeric = len(num_cols)
fig = plt.figure(figsize=(10, n_numeric * 2.2))
for position, column in enumerate(num_cols, start=1):
    ax = fig.add_subplot(n_numeric, 1, position)
    ax.boxplot(df[column], vert=False)
    ax.set_title(f"Boxplot - {column} (after cleaning)")
fig.tight_layout()
plt.show()

# ---------------------------------------------------
# ENCODE CATEGORICAL DATA
# ---------------------------------------------------

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so a column with k levels becomes k - 1 indicators.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

# ---------------------------------------------------
# SPLIT DATA
# ---------------------------------------------------

# Separate the continuous regression target from the predictor columns.
y = df["disease_risk"]
X = df.drop(columns="disease_risk")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# ---------------------------------------------------
# SCALE DATA
# ---------------------------------------------------

# Learn the standardization parameters from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ---------------------------------------------------
# MODEL 1: RANDOM FOREST REGRESSOR
# ---------------------------------------------------

# Train a random forest with a fixed seed and predict the held-out rows.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Report the three regression metrics on the held-out rows.
print("\nRandomForestRegressor Performance:")
rf_metrics = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2 Score:", r2_score),
)
for label, metric in rf_metrics:
    print(label, metric(y_test, rf_pred))

# ---------------------------------------------------
# MODEL 2: KNN REGRESSOR
# ---------------------------------------------------

# Train a 5-nearest-neighbours regressor and predict the held-out rows.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Report the three regression metrics on the held-out rows.
print("\nKNeighborsRegressor Performance:")
knn_metrics = (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2 Score:", r2_score),
)
for label, metric in knn_metrics:
    print(label, metric(y_test, knn_pred))

# ---------------------------------------------------
# SAVE BEST MODEL
# ---------------------------------------------------

# Pick the model with the higher R2 on the held-out rows; KNN is kept when the
# random forest is not strictly better.
rf_r2 = r2_score(y_test, rf_pred)
knn_r2 = r2_score(y_test, knn_pred)
best_model = rf if rf_r2 > knn_r2 else knn
joblib.dump(best_model, "best_regression_model.pkl")  # CHANGED (model name)

print("\nModel saved as best_regression_model.pkl")

# The model was trained on standardized, one-hot-encoded features, so it cannot
# score new data without the fitted scaler and the exact encoded column order.
# They are stored in a companion file; the model file itself is unchanged.
# NOTE(review): the imputation fill values (medians/modes) are not retained by
# this notebook and therefore are not part of this artifact.
preprocessing = {
    "scaler": scaler,
    "feature_columns": list(X.columns),
}
joblib.dump(preprocessing, "best_regression_preprocessing.pkl")

print("Preprocessing saved as best_regression_preprocessing.pkl")