In [None]:
# 03_modeling.ipynb
# Modeling for Patient Readmission Risk

import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier


# 1. Load Data
#
# Each split is read from its own pair of CSVs: the train_* files are used for
# fitting and the test_* files are held out for evaluation only. Feeding the
# test files into the training variables would fit the models on the held-out
# split and score them on the training split.
# The label frames are flattened to 1-D arrays with .values.ravel().
# NOTE(review): assumes each label CSV contains a single column -- confirm.

X_train = pd.read_csv("/content/train_X.csv")
y_train = pd.read_csv("/content/train_y.csv").values.ravel()
X_test = pd.read_csv("/content/test_X.csv")
y_test = pd.read_csv("/content/test_y.csv").values.ravel()


# 2. Preprocessing
#
# Columns are routed by the dtypes observed in X_train: object-dtype columns
# are handled as categorical, every other column as numeric.

cat_cols = X_train.select_dtypes(include='object').columns.tolist()
num_cols = X_train.select_dtypes(exclude='object').columns.tolist()

# Numeric branch: fill missing values with the column median, then
# standardize. The scaling step is there for the linear baseline: scikit-learn's
# LogisticRegression convergence warning recommends scaling the data when the
# solver runs out of iterations.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fit.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])


# 3. Baseline Model
#
# Logistic regression chained behind the shared preprocessor, so raw feature
# frames can be passed straight to fit() / predict_proba().

baseline_clf = LogisticRegression(class_weight="balanced", max_iter=1000)
baseline_steps = [
    ("prep", preprocessor),
    ("clf", baseline_clf),
]
baseline_model = Pipeline(steps=baseline_steps)
baseline_model.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the second class label;
# ROC-AUC is computed from those scores on the evaluation split.
pred_prob = baseline_model.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, pred_prob)
print("Baseline ROC-AUC:", baseline_auc)


# 4. XGBoost Model
#
# The booster is trained on matrices produced by calling the preprocessor
# directly (not through a Pipeline): it is fit on the training features and
# then only applied to the evaluation features.

X_train_t = preprocessor.fit_transform(X_train)
X_test_t = preprocessor.transform(X_test)

# Ratio of the remaining labels to the label sum, passed as scale_pos_weight.
# NOTE(review): this is the negative/positive ratio only if y_train is a 0/1
# array -- confirm against the label file.
n_pos = sum(y_train)
n_neg = len(y_train) - n_pos

xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "eval_metric": "auc",
    "scale_pos_weight": n_neg / n_pos,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train_t, y_train)

# Score the evaluation split with the second predict_proba column.
proba = xgb.predict_proba(X_test_t)[:, 1]
xgb_auc = roc_auc_score(y_test, proba)
print("XGBoost ROC-AUC:", xgb_auc)

# 5. Save Artifacts
#
# Persist the fitted preprocessor and the XGBoost model. Both pickles are
# written inside the models/ directory that this section creates, so the
# directory is actually used; code that loads the artifacts must read them
# from models/.

MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(preprocessor, os.path.join(MODEL_DIR, "preprocessor.pkl"))
joblib.dump(xgb, os.path.join(MODEL_DIR, "xgb_readmit.pkl"))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Baseline ROC-AUC: 0.6153920494220323
XGBoost ROC-AUC: 0.6518326815845654


['xgb_readmit.pkl']