In [None]:
# 02_feature_engineering.ipynb
# Feature engineering for Patient Readmission

import pandas as pd
import numpy as np
import os

# Load the cleaned dataset written by the EDA step.
df = pd.read_csv("/content/eda_cleaned.csv")

print("Shape:", df.shape)
# Only rendered by Jupyter when this is the last expression of a cell.
df.head()


# 1. Drop identifier columns and the raw 'readmitted' column

# errors='ignore' lets this run even if some of these columns are already gone.
# NOTE(review): 'readmitted' is presumably superseded by the 'target' column
# used for the split further down — confirm against the EDA notebook.
drop_cols = ['encounter_id', 'patient_nbr', 'readmitted']
df = df.drop(columns=drop_cols, errors='ignore')


# 2. Handle Missing Values

# '?' is treated as the missing-value marker and replaced with NaN.
df = df.replace('?', np.nan)
# Only 'race' is imputed here (with its most frequent value); NaNs in every
# other column are left as they are.
most_frequent_race = df['race'].mode()[0]
df['race'] = df['race'].fillna(most_frequent_race)


# 3. Age Bucket Conversion

def convert_age(x):
    """Convert an age-bucket label such as ``[0-10)`` to a number.

    The result is the bucket's lower bound plus 5, e.g. ``"[0-10)"`` -> 5 and
    ``"[70-80)"`` -> 75 (the midpoint when buckets are ten years wide).

    Args:
        x: Bucket label. Non-string values are converted with ``str`` first.

    Returns:
        The lower bound plus 5 as an ``int``, or ``float('nan')`` when the
        label is a missing-value marker (empty, ``nan``, ``none``, ``<NA>``
        or ``?``, case-insensitive).

    Raises:
        ValueError: If the label is not a missing-value marker and its lower
            bound is not an integer.
    """
    # Trim whitespace on both sides of the brackets so " [0-10) " parses too.
    label = str(x).strip().strip('[]()').strip()

    # Callers pass str(value), so a missing age arrives as the text 'nan';
    # map such markers to NaN instead of letting int() raise.
    if label.lower() in {'', 'nan', 'none', '<na>', '?'}:
        return float('nan')

    lower_bound = label.split('-')[0]
    return int(lower_bound) + 5

# Derive a numeric age feature; every value is stringified before parsing.
df['age_num'] = df['age'].map(lambda bucket: convert_age(str(bucket)))


# 4. Encoding Categorical Variables

# Names of all object-dtype columns.
cat_cols = list(df.select_dtypes(include='object').columns)
# 'age' is left out of the list because age_num now represents it.
# NOTE(review): the raw 'age' column itself is still present in df and will be
# written out with the features — confirm the next notebook drops or ignores it.
cat_cols.remove('age')

# The actual encoding is deferred to the pipeline in the next notebook.
print("Categorical columns:", cat_cols)


# 5. Save train/test split

from sklearn.model_selection import train_test_split

# Separate the label column from the feature matrix.
X = df.drop(columns=['target'])
y = df['target']

# 80/20 split, stratified on the label, with a fixed seed so reruns reproduce
# the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Write each piece to its own CSV (without the index) under ./processed.
os.makedirs("processed", exist_ok=True)
split_outputs = {
    "processed/train_X.csv": X_train,
    "processed/train_y.csv": y_train,
    "processed/test_X.csv": X_test,
    "processed/test_y.csv": y_test,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, index=False)

print("Data split saved!")


Shape: (101766, 51)
Categorical columns: ['race', 'gender', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']
Data split saved!
