In [22]:
import os
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.impute import SimpleImputer
import xgboost as xgb

In [8]:
# Load and preprocess data
X = pd.read_csv("final_df.csv")

# Drop correlated values to FUTURE_OUT_OF_POCKET
X.drop(["TOTAL_FUTURE_COVERAGE", "TOTAL_FUTURE_COST"], axis=1, inplace=True)
X.dropna(axis=0, subset=["FUTURE_OUT_OF_POCKET"], inplace=True)

y = X.FUTURE_OUT_OF_POCKET

In [9]:
# Handle outliers in target variable
y = X.FUTURE_OUT_OF_POCKET
p1, p99 = y.quantile(0.01), y.quantile(0.99)
print(f"Original number of samples: {len(X)}")
mask = (y >= p1) & (y <= p99)
X = X[mask]
y = y[mask]
print(f"Number of samples after removing outliers: {len(X)}")
print(f"Kept values in range: ${p1:.2f} - ${p99:.2f}")

X.drop(["FUTURE_OUT_OF_POCKET"], axis=1, inplace=True)

Original number of samples: 170859
Number of samples after removing outliers: 167643
Kept values in range: $78.95 - $3697348.69


In [10]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [18]:
# Handle missing values
imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Handle outliers in numerical columns
numerical_cols = [
    "Body Weight",
    "Body Height",
    "Body Mass Index",
    "Systolic Blood Pressure",
    "Diastolic Blood Pressure",
    "Heart rate",
    "Respiratory rate",
    "AGE",
    "TOTAL_FUTURE_COST",
    "TOTAL_FUTURE_COVERAGE",
    "FUTURE_OUT_OF_POCKET"
]

for col in X.columns:
  print(col) 
  if col in numerical_cols:
      # Convert to nullable float type if needed, but float already handles NaN
      X[col] = X[col].astype(pd.Float64Dtype())  # or simply float
  else:
      # Convert to nullable integer type
      X[col] = X[col].astype("category")

RACE
ETHNICITY
GENDER
Body Height
Body Weight
Body Mass Index
Diastolic Blood Pressure
Systolic Blood Pressure
Heart rate
Respiratory rate
Acquired coagulation disorder (disorder)
Acute Cholecystitis
Acute allergic reaction
Acute bacterial sinusitis (disorder)
Acute bronchitis (disorder)
Acute deep venous thrombosis (disorder)
Acute pulmonary embolism (disorder)
Acute respiratory distress syndrome (disorder)
Acute respiratory failure (disorder)
Acute viral pharyngitis (disorder)
Alcoholism
Allergy to latex (finding)
Alzheimer's disease (disorder)
Anemia (disorder)
Antepartum eclampsia
Appendicitis
Asthma
At risk for suicide (finding)
Atopic dermatitis
Atrial Fibrillation
Bacterial infectious disease (disorder)
Bleeding from anus
Blighted ovum
Blindness due to type 2 diabetes mellitus (disorder)
Body mass index 30+ - obesity (finding)
Body mass index 40+ - severely obese (finding)
Brain damage - traumatic
Bullet wound
Burn injury(morphologic abnormality)
COVID-19
Carcinoma in situ of pr

# XGBoost Model

In [20]:
# Use RobustScaler instead of StandardScaler for better handling of outliers
scaler = RobustScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Initialize model with carefully tuned parameters
model = xgb.XGBRegressor(
    max_depth=15,  # Reduced to prevent overfitting
    learning_rate=0.03,  # Smaller learning rate for better generalization
    n_estimators=300,  # Increased due to smaller learning rate
    min_child_weight=5,  # Increased to reduce overfitting
    subsample=0.9,  # Increased for better stability
    colsample_bytree=0.9,  # Increased for better stability
    gamma=1,  # Added to control tree growth
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=1,  # L2 regularization
    objective="reg:squarederror",
    eval_metric="mae",
    early_stopping_rounds=20,  # Increased patience
    random_state=42,
    enable_categorical=True,
)

In [None]:
# Fit the model with more evaluation sets
model.fit(
    X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False
)

# Make predictions
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mad = median_absolute_error(y_test, predictions)


print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {mad}")


# Feature importance analysis
feature_importance = pd.DataFrame(
    {"feature": X_train.columns, "importance": model.feature_importances_}
)
feature_importance = feature_importance.sort_values("importance", ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

Mean Absolute Error: 68613.31193358707
Median Absolute Error: 23815.87609374997

Top 10 Most Important Features:
                                feature  importance
235   Suspected lung cancer (situation)    0.115707
3                           Body Height    0.048754
4                           Body Weight    0.046356
5                       Body Mass Index    0.041117
0                                  RACE    0.032764
2                                GENDER    0.032140
251                                 AGE    0.029520
158                Neoplasm of prostate    0.029392
19   Acute viral pharyngitis (disorder)    0.028715
1                             ETHNICITY    0.024072
