# IMPORTING THE LIBRARIES

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# IMPORTING THE DATASET

In [52]:
dataset=pd.read_csv('diabetic_data.csv')
import pandas as pd

dataset = pd.read_csv("diabetic_data.csv")

drop_cols = [

    "encounter_id", "patient_nbr",

    "race", "weight", "payer_code",

    "discharge_disposition_id",
    "admission_source_id",
    "admission_type_id",


    "max_glu_serum", "A1Cresult",

    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "acetohexamide", "glipizide", "glyburide",
    "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose",
    "miglitol", "troglitazone", "tolazamide", "examide",
    "citoglipton",


    "glipizide-metformin", "glyburide-metformin",
    "metformin-pioglitazone", "metformin-rosiglitazone",
    "glimepiride-pioglitazone"
]

dataset = dataset.drop(columns=drop_cols)



In [53]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]


In [54]:
dataset["readmitted"].value_counts()


Unnamed: 0_level_0,count
readmitted,Unnamed: 1_level_1
NO,54864
>30,35545
<30,11357


# CONVERTING DEPENDENT VARIABLE TO BINARY

In [55]:
y = y.apply(lambda x: 1 if x == '<30' else 0)
print(y)

0         0
1         0
2         0
3         0
4         0
         ..
101761    0
101762    0
101763    0
101764    0
101765    0
Name: readmitted, Length: 101766, dtype: int64


# ENCODING CATEGORICAL DATA

In [56]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Re-initialize X to ensure it's a pandas DataFrame before transformation
X = dataset.iloc[:, :-1]

categorical_cols = [
    "age",
    "gender",
    "medical_specialty",
    "diag_1",
    "diag_2",
    "diag_3",
    "insulin",
    "change",
    "diabetesMed"
]

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)], remainder='passthrough')

X = ct.fit_transform(X)

# SPLITTING THE DATASET

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# TRAINING THE MODEL LOGISTIC REGRESSION

In [58]:
from sklearn.linear_model import LogisticRegression


log_reg = LogisticRegression(max_iter=1000,class_weight="balanced",n_jobs=-1)

log_reg.fit(X_train, y_train)


In [59]:
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]


# METRICS

In [60]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC-AUC:")
print(roc_auc_score(y_test, y_pred_proba))


Confusion Matrix:
[[11876  6193]
 [ 1057  1228]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     18069
           1       0.17      0.54      0.25      2285

    accuracy                           0.64     20354
   macro avg       0.54      0.60      0.51     20354
weighted avg       0.83      0.64      0.71     20354


ROC-AUC:
0.6395745073013938


# Catboost

In [61]:
!pip install catboost




In [62]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

X_cat = dataset.drop(columns=["readmitted"])
y_cat = y


In [63]:
cat_features = [
    "age",
    "gender",
    "medical_specialty",
    "diag_1",
    "diag_2",
    "diag_3",
    "insulin",
    "change",
    "diabetesMed"
]
XC_train, XC_test, yC_train, yC_test = train_test_split(
    X_cat,
    y_cat,
    test_size=0.2,
    random_state=42,
    stratify=y_cat
)


In [64]:
cat_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    eval_metric="AUC",
    random_seed=42,
    verbose=100
)
cat_model.fit(
    XC_train,
    yC_train,
    cat_features=cat_features,
    eval_set=(XC_test, yC_test)
)

0:	test: 0.5907879	best: 0.5907879 (0)	total: 208ms	remaining: 1m 2s
100:	test: 0.6581542	best: 0.6581542 (100)	total: 35.1s	remaining: 1m 9s
200:	test: 0.6615767	best: 0.6615767 (200)	total: 56.7s	remaining: 27.9s
299:	test: 0.6622362	best: 0.6626277 (239)	total: 1m 18s	remaining: 0us

bestTest = 0.6626277291
bestIteration = 239

Shrink model to first 240 iterations.


<catboost.core.CatBoostClassifier at 0x7e66dc4c3110>

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

yC_pred = cat_model.predict(XC_test)
yC_pred_proba = cat_model.predict_proba(XC_test)[:, 1]

print(confusion_matrix(yC_test, yC_pred))
print(classification_report(yC_test, yC_pred))
print("ROC-AUC:", roc_auc_score(yC_test, yC_pred_proba))


[[18061    22]
 [ 2238    33]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.60      0.01      0.03      2271

    accuracy                           0.89     20354
   macro avg       0.74      0.51      0.48     20354
weighted avg       0.86      0.89      0.84     20354

ROC-AUC: 0.662627729131874
