# AMGEN

In [64]:
!pip install -q ucimlrepo

In [65]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, PrecisionRecallDisplay
import matplotlib.pyplot as plt

In [66]:
pd.set_option('display.max_columns', None)

In [67]:
diabetes_130_us_hospitals = fetch_ucirepo(id=296)

df = pd.concat([diabetes_130_us_hospitals.data.features, diabetes_130_us_hospitals.data.targets], axis=1)

df.head()

  df = pd.read_csv(data_url)


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,51,0,8,0,0,0,197.0,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      99493 non-null   object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   weight                    3197 non-null    object
 4   admission_type_id         101766 non-null  int64 
 5   discharge_disposition_id  101766 non-null  int64 
 6   admission_source_id       101766 non-null  int64 
 7   time_in_hospital          101766 non-null  int64 
 8   payer_code                61510 non-null   object
 9   medical_specialty         51817 non-null   object
 10  num_lab_procedures        101766 non-null  int64 
 11  num_procedures            101766 non-null  int64 
 12  num_medications           101766 non-null  int64 
 13  number_outpatient         101766 non-null  int64 
 14  numb

In [None]:
medication_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone'
]

In [69]:
for col in medication_columns:
        if col in df.columns:
            no_percentage = (df[col] == 'No').mean() * 100
            print(f"{col:<26} {no_percentage:>6.2f}% 'No'")
        else:
            print(f"Column {col} not found.")

metformin                   80.36% 'No'
repaglinide                 98.49% 'No'
nateglinide                 99.31% 'No'
chlorpropamide              99.92% 'No'
glimepiride                 94.90% 'No'
acetohexamide              100.00% 'No'
glipizide                   87.53% 'No'
glyburide                   89.53% 'No'
tolbutamide                 99.98% 'No'
pioglitazone                92.80% 'No'
rosiglitazone               93.75% 'No'
acarbose                    99.70% 'No'
miglitol                    99.96% 'No'
troglitazone               100.00% 'No'
tolazamide                  99.96% 'No'
examide                    100.00% 'No'
citoglipton                100.00% 'No'
insulin                     46.56% 'No'
glyburide-metformin         99.31% 'No'
glipizide-metformin         99.99% 'No'
glimepiride-pioglitazone   100.00% 'No'
metformin-rosiglitazone    100.00% 'No'
metformin-pioglitazone     100.00% 'No'


## Feature Engineering

In [70]:
# Drop high null columns
df.drop(columns = ["weight","max_glu_serum","A1Cresult","medical_specialty","payer_code"],inplace = True)


In [71]:
df['readmitted_binary'] = (df['readmitted'] == '<30').astype(int)

## These are selected based on the entropy (No value percentage < 90%)
df['metformin_changed'] = df['metformin'].isin(['Up', 'Down']).astype(int)
df['metformin_taken'] = (df['metformin'] != 'No').astype(int)
df['insulin_changed'] = df['insulin'].isin(['Up', 'Down']).astype(int)
df['insulin_taken'] = (df['insulin'] != 'No').astype(int)

df['glipizide_taken'] = (df['glipizide'] != 'No').astype(int)
df['glyburide_taken'] = (df['glyburide'] != 'No').astype(int)


In [72]:
def group_diagnosis(diag_code):
    if pd.isna(diag_code):
        return 'Missing'
    if '250' in diag_code: # ICD-9 code for Diabetes
        return 'Diabetes'
    if diag_code.startswith(('39', '40', '41', '42', '43', '44', '45')): # Circulatory
        return 'Circulatory'
    if diag_code.startswith(('46', '47', '48', '49', '50', '51')): # Respiratory
        return 'Respiratory'
    if diag_code.startswith(('58', '59')): # Genitourinary
        return 'Genitourinary'
    if diag_code.startswith(('8', '9')): # Injury
        return 'Injury'
    return 'Other'

df['diag_group'] = df['diag_1'].apply(group_diagnosis)

In [73]:
df.drop(columns = medication_columns+["diag_1","diag_2","diag_3","readmitted"],inplace= True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      99493 non-null   object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   admission_type_id         101766 non-null  int64 
 4   discharge_disposition_id  101766 non-null  int64 
 5   admission_source_id       101766 non-null  int64 
 6   time_in_hospital          101766 non-null  int64 
 7   num_lab_procedures        101766 non-null  int64 
 8   num_procedures            101766 non-null  int64 
 9   num_medications           101766 non-null  int64 
 10  number_outpatient         101766 non-null  int64 
 11  number_emergency          101766 non-null  int64 
 12  number_inpatient          101766 non-null  int64 
 13  number_diagnoses          101766 non-null  int64 
 14  chan

## Pre-processing

In [74]:
y = df['readmitted_binary']
X = df.drop('readmitted_binary', axis=1)

numeric_features = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses'
]

categorical_features = [
    'race', 'gender', 'age', 'change', 'diabetesMed', 'diag_group',
    'admission_type_id', 'admission_source_id'
]

binary_features = [
    'metformin_taken', 'insulin_taken',
    'glipizide_taken', 'glyburide_taken'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])


## Logistic Regression (Baseline)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Data split into {X_train.shape[0]} train and {X_test.shape[0]} test samples.")

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

# Train the model
model_pipeline.fit(X_train, y_train)

print("Model training complete.")

Data split into 81412 train and 20354 test samples.
Model training complete.


In [76]:
y_pred_default = model_pipeline.predict(X_test)
print("\nClassification Report (Default Threshold):")
print(classification_report(y_test, y_pred_default, target_names=['Not Readmitted (0)', 'Readmitted (1)']))


Classification Report (Default Threshold):
                    precision    recall  f1-score   support

Not Readmitted (0)       0.92      0.69      0.79     18083
    Readmitted (1)       0.17      0.52      0.26      2271

          accuracy                           0.67     20354
         macro avg       0.55      0.60      0.52     20354
      weighted avg       0.84      0.67      0.73     20354



## Threshold Tuning

In [77]:
y_prob = model_pipeline.predict_proba(X_test)[:, 1]

new_threshold = 0.4
y_pred_thresh = (y_prob > new_threshold).astype(int)

print(f"\nClassification Report (Tuned Threshold = {new_threshold}):")
print(classification_report(y_test, y_pred_thresh, target_names=['Not Readmitted (0)', 'Readmitted (1)']))

print(f"\nRecall for 'Readmitted (1)' improved from 52% to 89%.")


Classification Report (Tuned Threshold = 0.4):
                    precision    recall  f1-score   support

Not Readmitted (0)       0.95      0.24      0.38     18083
    Readmitted (1)       0.13      0.89      0.22      2271

          accuracy                           0.31     20354
         macro avg       0.54      0.56      0.30     20354
      weighted avg       0.85      0.31      0.36     20354


Recall for 'Readmitted (1)' improved from 52% to 89%.


## Interpretation

In [78]:
print("Final Model Interpretation")

final_log_reg = model_pipeline.named_steps['classifier']

ohe = model_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
categorical_features_out = ohe.get_feature_names_out(input_features=categorical_features)
all_feature_names = list(numeric_features) + list(categorical_features_out) + list(binary_features)

coefficients = final_log_reg.coef_[0]
coef_df = pd.DataFrame({'feature': all_feature_names, 'coefficient': coefficients})

coef_df['abs_coefficient'] = np.abs(coef_df['coefficient'])
coef_df = coef_df.sort_values(by='abs_coefficient', ascending=False)

print("\nTop 15 Drivers of Readmission (Honest Model):")
print(coef_df.head(15))

Final Model Interpretation

Top 15 Drivers of Readmission (Honest Model):
                   feature  coefficient  abs_coefficient
16              age_[0-10)    -0.804746         0.804746
47   admission_source_id_3     0.615140         0.615140
17             age_[10-20)    -0.375897         0.375897
6         number_inpatient     0.366521         0.366521
34      diag_group_Missing     0.323846         0.323846
43     admission_type_id_7    -0.265489         0.265489
36  diag_group_Respiratory    -0.264890         0.264890
44     admission_type_id_8    -0.251184         0.251184
24             age_[80-90)     0.242983         0.242983
58  admission_source_id_20     0.225908         0.225908
23             age_[70-80)     0.222097         0.222097
49   admission_source_id_5    -0.167317         0.167317
48   admission_source_id_4    -0.166521         0.166521
42     admission_type_id_6     0.155528         0.155528
25            age_[90-100)     0.149645         0.149645


## Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier

rf_clean_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42,
                                          class_weight='balanced_subsample',
                                          n_jobs=-1))
])

rf_clean_pipeline.fit(X_train, y_train)

print("\nClassification Report (Random Forest on Clean Data):")
y_pred_rf_clean = rf_clean_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_rf_clean, target_names=['Not Readmitted (0)', 'Readmitted (1)']))


Classification Report (Random Forest on Clean Data):
                    precision    recall  f1-score   support

Not Readmitted (0)       0.89      1.00      0.94     18083
    Readmitted (1)       0.58      0.01      0.02      2271

          accuracy                           0.89     20354
         macro avg       0.74      0.51      0.48     20354
      weighted avg       0.86      0.89      0.84     20354

