In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pathlib import Path

In [3]:
# Load the cleaned dataset
file_path = "../data/processed/cleaned_data.csv"
df = pd.read_csv(file_path)

# Confirm structure. df.info() prints its summary and returns None, so it is
# called as its own statement; df.head() is left as the cell's last expression
# so the notebook renders it as a table instead of a "(None, ...)" tuple repr.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 48 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      99492 non-null  object
 1   gender                    99492 non-null  object
 2   age                       99492 non-null  object
 3   admission_type_id         99492 non-null  int64 
 4   discharge_disposition_id  99492 non-null  int64 
 5   admission_source_id       99492 non-null  int64 
 6   time_in_hospital          99492 non-null  int64 
 7   num_lab_procedures        99492 non-null  int64 
 8   num_procedures            99492 non-null  int64 
 9   num_medications           99492 non-null  int64 
 10  number_outpatient         99492 non-null  int64 
 11  number_emergency          99492 non-null  int64 
 12  number_inpatient          99492 non-null  int64 
 13  diag_1                    99492 non-null  object
 14  diag_2                

(None,
               race  gender      age  admission_type_id  \
 0        Caucasian  Female   [0-10)                  6   
 1        Caucasian  Female  [10-20)                  1   
 2  AfricanAmerican  Female  [20-30)                  1   
 3        Caucasian    Male  [30-40)                  1   
 4        Caucasian    Male  [40-50)                  1   
 
    discharge_disposition_id  admission_source_id  time_in_hospital  \
 0                        25                    1                 1   
 1                         1                    7                 3   
 2                         1                    7                 2   
 3                         1                    7                 2   
 4                         1                    7                 1   
 
    num_lab_procedures  num_procedures  num_medications  ...  \
 0                  41               0                1  ...   
 1                  59               0               18  ...   
 2               

In [4]:
# 1. Remove target leakage rows based on discharge_disposition_id
# NOTE(review): the listed IDs are presumably discharge outcomes after which a
# readmission cannot occur -- confirm against the ID mapping file.
leak_ids = [11, 13, 14, 19, 20, 21]

# .copy() detaches the filtered frame from the original so that the column
# assignments in the following cells do not trigger SettingWithCopyWarning
# on pandas versions without copy-on-write.
df = df[~df['discharge_disposition_id'].isin(leak_ids)].copy()

In [5]:
# 2. Limit 'num_medications' to its own 95th percentile (values above are set to the cap)
med_counts = df['num_medications']
cap = med_counts.quantile(0.95)
df['num_medications'] = med_counts.clip(upper=cap)

In [6]:
# 3. Create binned feature 'number_inpatient_bin'
# Right-closed buckets: (-1, 0] -> "0", (0, 2] -> "1-2", (2, inf) -> "3+".
# The top edge is open-ended (np.inf) so that a count above any fixed limit
# still lands in "3+" instead of silently becoming NaN.
# (This step previously appeared in two identical consecutive cells; one
# definition is enough.)
df['number_inpatient_bin'] = pd.cut(
    df['number_inpatient'],
    bins=[-1, 0, 2, np.inf],
    labels=["0", "1-2", "3+"]
)

In [8]:
# 4. Composite feature 'total_visits': row-wise sum of the inpatient,
#    outpatient and emergency visit counts
inpatient_counts = df['number_inpatient']
df['total_visits'] = inpatient_counts.add(df['number_outpatient']).add(df['number_emergency'])

In [9]:
# Preview the first five rows, including the newly added bin and total columns
df.head(n=5)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_30,diag_1_category,diag_2_category,diag_3_category,number_inpatient_bin,total_visits
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,0,Diabetes,Unknown,Unknown,0,0
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,Ch,Yes,0,Other,Diabetes,Other,0,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,Yes,0,Other,Diabetes,Supplemental,1-2,3
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,Ch,Yes,0,Other,Diabetes,Circulatory,0,0
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,Ch,Yes,0,Neoplasms,Neoplasms,Diabetes,0,0


In [10]:
# 5. Group rare categories (<1%) as 'Other' for categorical variables
def group_rare_categories(series, threshold=0.01, other_label='Other'):
    """Replace infrequent values of a Series with a single catch-all label.

    Parameters
    ----------
    series : pandas.Series
        Column whose values are to be grouped.
    threshold : float, default 0.01
        Values whose relative frequency (computed over non-missing entries)
        is strictly below this fraction are replaced.
    other_label : object, default 'Other'
        Label written in place of every rare value.

    Returns
    -------
    pandas.Series
        A new Series with the same index; the input is not modified.
        Missing values are left as they are.
    """
    freq = series.value_counts(normalize=True)
    rare_labels = freq[freq < threshold].index

    # Nothing to group: hand back an independent copy.
    if len(rare_labels) == 0:
        return series.copy()

    # A categorical column cannot hold a label outside its categories,
    # so fall back to plain objects before writing the catch-all label.
    if isinstance(series.dtype, pd.CategoricalDtype):
        series = series.astype(object)

    # Vectorized replacement instead of a per-element Python lambda.
    return series.mask(series.isin(rare_labels), other_label)

In [12]:
# List the columns available at this point, before rare categories are grouped
column_index = df.columns
print(column_index)

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_30',
       'diag_1_category', 'diag_2_category', 'diag_3_category',
       'number_inpatient_bin', 'total_visits'],
      dtype='object')


In [13]:
# Columns whose infrequent levels are collapsed by group_rare_categories
cat_cols = ['race', 'diag_1_category', 'diag_2_category', 'diag_3_category']

# Only touch the columns that are actually present in df; absent names are skipped
present_cat_cols = [name for name in cat_cols if name in df.columns]
for col in present_cat_cols:
    df[col] = group_rare_categories(df[col])

In [14]:
# 6. Interaction features: join two columns' string forms with an underscore
age_text = df['age'].astype(str)
inpatient_bin_text = df['number_inpatient_bin'].astype(str)
df['age_inpatient_interaction'] = age_text.str.cat(inpatient_bin_text, sep='_')

diabetes_med_text = df['diabetesMed'].astype(str)
insulin_text = df['insulin'].astype(str)
df['diabetes_insulin_combo'] = diabetes_med_text.str.cat(insulin_text, sep='_')

In [20]:
# List the columns again now that the interaction features have been added
column_index = df.columns
print(column_index)

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_30',
       'diag_1_category', 'diag_2_category', 'diag_3_category',
       'number_inpatient_bin', 'total_visits', 'age_inpatient_interaction',
       'diabetes_insulin_combo']

In [21]:
# Preview the first rows. Left as the cell's last expression so the notebook
# renders a table; print() would fall back to the wrapped plain-text repr.
df.head()

              race  gender      age  admission_type_id  \
0        Caucasian  Female   [0-10)                  6   
1        Caucasian  Female  [10-20)                  1   
2  AfricanAmerican  Female  [20-30)                  1   
3        Caucasian    Male  [30-40)                  1   
4        Caucasian    Male  [40-50)                  1   

   discharge_disposition_id  admission_source_id  time_in_hospital  \
0                        25                    1                 1   
1                         1                    7                 3   
2                         1                    7                 2   
3                         1                    7                 2   
4                         1                    7                 1   

   num_lab_procedures  num_procedures  num_medications  ...  change  \
0                  41               0                1  ...      No   
1                  59               0               18  ...      Ch   
2                

In [24]:
# 7. One-hot encode categorical features, then list the resulting columns.
#
# NOTE(review): the cell that performed this encoding is missing from the
# notebook, so a fresh "Restart & Run All" would save un-encoded data. The
# step below is reconstructed from the column list recorded for this cell:
# 'age' and 'number_inpatient_bin' appear as dummy columns with their first
# level dropped, 'race' dummies follow, and the two interaction columns no
# longer appear in their original position. Confirm the exact column list and
# options against the intended pipeline.
encode_cols = [
    'age',
    'number_inpatient_bin',
    'race',
    'age_inpatient_interaction',
    'diabetes_insulin_combo',
]

# Encode only columns that are still present, so re-running the cell after the
# encoding has already been applied is a no-op instead of a KeyError.
pending_cols = [col for col in encode_cols if col in df.columns]
if pending_cols:
    df = pd.get_dummies(df, columns=pending_cols, drop_first=True)

print(df.columns.tolist())

['gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_30', 'diag_1_category', 'diag_2_category', 'diag_3_category', 'total_visits', 'age_[10-20)', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'age_[90-100)', 'number_inpatient_bin_1-2', 'number_inpatient_bin_3+', 'race_

In [25]:
# List of numerical columns to scale
num_cols = [
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'time_in_hospital',
    'number_inpatient',
    'number_outpatient',
    'number_emergency',
    'total_visits'
]

# Scale the listed columns that are present in df; report any that are not,
# so a renamed or dropped column does not go unscaled unnoticed.
existing_num_cols = [col for col in num_cols if col in df.columns]
missing_num_cols = [col for col in num_cols if col not in df.columns]
if missing_num_cols:
    print(f"Numeric columns not found, left unscaled: {missing_num_cols}")

# Initialize the scaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row of df. If a train/test split
# happens downstream, consider fitting on the training rows only -- confirm.
# fit_transform raises on a frame with zero columns, hence the guard.
if existing_num_cols:
    df[existing_num_cols] = scaler.fit_transform(df[existing_num_cols])

In [27]:
# Destination of the feature-engineered CSV, relative to the notebook's directory
output_path = Path("..") / "data" / "processed" / "feature_engineered_data.csv"

In [28]:
# Create the target directory if needed, then write df without its index column
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# The file written here is the feature-engineered dataset, not the cleaned one
print(f"Feature-engineered data saved to: {output_path}")
print(f"Final shape: {df.shape}")

Cleaned data saved to: ..\data\processed\feature_engineered_data.csv
Final shape: (97108, 93)
