In [1]:
import pandas as pd
import numpy as np
import warnings
import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
warnings.filterwarnings("ignore")

def set_seed(seed):
    """Set the random seed for reproducibility."""
    np.random.seed(seed)


set_seed(42)
pd.set_option("display.max_columns", None)

In [2]:
df = pd.read_csv("data/diabetic_cleaned.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source,readmitted_binary
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Unknown,59,0,18,0,0,0,276,250.01,255,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral,Discharged to home,Emergency Room,1
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Unknown,11,5,13,2,0,1,648,250.0,V27,6,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,Unknown,44,1,16,0,0,0,8,250.43,403,7,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,Unknown,51,0,8,0,0,0,197,157.0,250,5,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Unknown,31,6,16,0,0,0,414,411.0,250,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Clinic Referral,Discharged to home,Clinic Referral,1


In [3]:
df.shape

(100241, 52)

In [4]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'discharge_disposition', 'admission_source',
       'readmitted_binary'],
      dtype='obje

In [5]:
features_description = pd.read_csv("data/features_description.csv", sep="\t")
features_description.head()

Unnamed: 0,Variable Name,Role,Type,Demographic,Description,Units,Missing Values
0,encounter_id,ID,,,Unique identifier of an encounter,,no
1,patient_nbr,ID,,,Unique identifier of a patient,,no
2,race,Feature,Categorical,Race,"Values: Caucasian, Asian, African American, Hi...",,yes
3,gender,Feature,Categorical,Gender,"Values: male, female, and unknown/invalid",,no
4,age,Feature,Categorical,Age,"Grouped in 10-year intervals: [0, 10), [10, 20...",,no


In [6]:
features_lookup = {features_description["Variable Name"].iloc[i]: features_description["Description"].iloc[i] for i in range(len(features_description))}
features_lookup

{'encounter_id': 'Unique identifier of an encounter',
 'patient_nbr': 'Unique identifier of a patient',
 'race': 'Values: Caucasian, Asian, African American, Hispanic, and other',
 'gender': 'Values: male, female, and unknown/invalid',
 'age': 'Grouped in 10-year intervals: [0, 10), [10, 20),..., [90, 100)',
 'weight': 'Weight in pounds.',
 'admission_type_id': 'Integer identifier corresponding to 9 distinct values, for example, emergency, urgent, elective, newborn, and not available',
 'discharge_disposition_id': 'Integer identifier corresponding to 29 distinct values, for example, discharged to home, expired, and not available',
 'admission_source_id': 'Integer identifier corresponding to 21 distinct values, for example, physician referral, emergency room, and transfer from a hospital',
 'time_in_hospital': 'Integer number of days between admission and discharge',
 'payer_code': 'Integer identifier corresponding to 23 distinct values, for example, Blue Cross/Blue Shield, Medicare, an

In [7]:
drugs = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone'
]
df[drugs].head()

Unnamed: 0,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone
0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No
1,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No
2,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No
3,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No
4,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No


## 1. Historical Admission Features

In [97]:
# Group by patient_nbr to calculate historical stats
historical_features = df.groupby('patient_nbr').agg({
    'encounter_id': 'count',  
    'time_in_hospital': ['mean', 'max'],  
    'number_inpatient': 'sum',    
    'number_emergency': 'sum',                
    'readmitted': lambda x: (x == '<30').sum()   
}).reset_index()

In [98]:
historical_features

Unnamed: 0_level_0,patient_nbr,encounter_id,time_in_hospital,time_in_hospital,number_inpatient,number_emergency,readmitted
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,sum,sum,<lambda>
0,135,2,5.5,8,1,0,1
1,378,1,2.0,2,0,0,0
2,729,1,4.0,4,0,0,0
3,774,1,3.0,3,0,0,0
4,927,1,5.0,5,0,0,0
...,...,...,...,...,...,...,...
70408,189351095,1,1.0,1,0,0,0
70409,189365864,1,3.0,3,0,0,0
70410,189445127,1,3.0,3,0,0,0
70411,189481478,1,14.0,14,0,0,0


In [65]:
historical_features.columns = [
    'patient_nbr', 
    'total_prior_admissions',
    'avg_stay_duration', 
    'max_stay_duration',
    'lifetime_inpatient_visits',
    'lifetime_emergency_visits',
    'prior_30day_readmits'
]
historical_features.head()

Unnamed: 0,patient_nbr,total_prior_admissions,avg_stay_duration,max_stay_duration,lifetime_inpatient_visits,lifetime_emergency_visits,prior_30day_readmits
0,135,2,5.5,8,1,0,1
1,378,1,2.0,2,0,0,0
2,729,1,4.0,4,0,0,0
3,774,1,3.0,3,0,0,0
4,927,1,5.0,5,0,0,0


In [66]:
df = df.merge(historical_features, on='patient_nbr', how='left')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source,readmitted_binary,total_prior_admissions,avg_stay_duration,max_stay_duration,lifetime_inpatient_visits,lifetime_emergency_visits,prior_30day_readmits
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Unknown,59,0,18,0,0,0,276,250.01,255,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral,Discharged to home,Emergency Room,1,1,3.0,3,0,0,0
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Unknown,11,5,13,2,0,1,648,250.0,V27,6,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,1,0,0
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,Unknown,44,1,16,0,0,0,8,250.43,403,7,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,0,0,0
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,Unknown,51,0,8,0,0,0,197,157.0,250,5,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,1.0,1,0,0,0
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Unknown,31,6,16,0,0,0,414,411.0,250,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Clinic Referral,Discharged to home,Clinic Referral,1,1,3.0,3,0,0,0


In [67]:
df['expiration_ind'] = df['discharge_disposition_id'].isin([11,13,14,19,20,21]).astype('int')

In [68]:
df['readmitted_lt30_ind'] = ( df['readmitted']=='<30' ).astype(int)
df['readmitted_gt30_ind'] = ( df['readmitted']=='>30' ).astype(int)
df['readmitted_no_ind'] = ( df['readmitted']=='NO' ).astype(int)
df['readmitted_ind'] = df['readmitted_lt30_ind'] + df['readmitted_gt30_ind']

In [69]:
# Add patient-level features
df['encounter_ct'] = df['patient_nbr'].map(df.groupby('patient_nbr')['encounter_id'].nunique())
df['mb_time_in_hospital'] = df.groupby('patient_nbr')['time_in_hospital'].transform('sum')
df['mb_readmitted_lt30_ct'] = df.groupby('patient_nbr')['readmitted_lt30_ind'].transform('sum')
df['mb_readmitted_gt30_ct'] = df.groupby('patient_nbr')['readmitted_gt30_ind'].transform('sum')
df['mb_readmitted_no_ct'] = df.groupby('patient_nbr')['readmitted_no_ind'].transform('sum')
df['mb_num_lab_procedures_ct'] = df.groupby('patient_nbr')['num_lab_procedures'].transform('sum')
df['mb_num_procedures_ct'] = df.groupby('patient_nbr')['num_procedures'].transform('sum')
df['mb_num_medications_ct'] = df.groupby('patient_nbr')['num_medications'].transform('sum')
df['mb_number_outpatient_ct'] = df.groupby('patient_nbr')['number_outpatient'].transform('sum')
df['mb_number_emergency_ct'] = df.groupby('patient_nbr')['number_emergency'].transform('sum')
df['mb_number_inpatient_ct'] = df.groupby('patient_nbr')['number_inpatient'].transform('sum')
df['mb_number_diagnoses_ct'] = df.groupby('patient_nbr')['number_diagnoses'].transform('sum')

to_drop = ['readmitted_lt30_ind', 'readmitted_gt30_ind', 'readmitted_no_ind']

In [70]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source,readmitted_binary,total_prior_admissions,avg_stay_duration,max_stay_duration,lifetime_inpatient_visits,lifetime_emergency_visits,prior_30day_readmits,expiration_ind,readmitted_lt30_ind,readmitted_gt30_ind,readmitted_no_ind,readmitted_ind,encounter_ct,mb_time_in_hospital,mb_readmitted_lt30_ct,mb_readmitted_gt30_ct,mb_readmitted_no_ct,mb_num_lab_procedures_ct,mb_num_procedures_ct,mb_num_medications_ct,mb_number_outpatient_ct,mb_number_emergency_ct,mb_number_inpatient_ct,mb_number_diagnoses_ct
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Unknown,59,0,18,0,0,0,276,250.01,255,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral,Discharged to home,Emergency Room,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,59,0,18,0,0,0,9
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Unknown,11,5,13,2,0,1,648,250.0,V27,6,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,1,0,0,0,0,0,1,0,1,2,0,0,1,11,5,13,2,0,1,6
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,Unknown,44,1,16,0,0,0,8,250.43,403,7,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,0,0,0,0,0,0,1,0,1,2,0,0,1,44,1,16,0,0,0,7
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,Unknown,51,0,8,0,0,0,197,157.0,250,5,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,1.0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,51,0,8,0,0,0,5
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Unknown,31,6,16,0,0,0,414,411.0,250,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Clinic Referral,Discharged to home,Clinic Referral,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,31,6,16,0,0,0,9


## 2. Diagnosis Code Intelligence

In [71]:
df["diag_1"].unique()

array(['276', '648', '8', '197', '414', '428', '398', '434', '250.7',
       '157', '518', '999', '410', '682', '402', '737', '572', 'V57',
       '189', '786', '427', '996', '277', '584', '462', '473', '411',
       '174', '486', '998', '511', '432', '626', '295', '196', '250.6',
       '618', '182', '845', '423', '808', '250.4', '722', '403', '784',
       '707', '440', '151', '715', '997', '198', '564', '812', '38',
       '590', '556', '578', '250.32', '433', 'V58', '569', '185', '536',
       '255', '250.13', '599', '558', '574', '491', '560', '244',
       '250.03', '577', '730', '188', '824', '250.8', '332', '562', '291',
       '296', '510', '401', '263', '438', '70', '642', '625', '571',
       '738', '593', '250.42', '807', '456', '446', '575', '250.41',
       '250.02', '820', '515', '780', '250.22', '995', '235', '250.82',
       '721', '787', '162', '724', '282', '250.83', '514', 'V55', '281',
       '250.33', '530', '466', '435', '250.12', 'V53', '789', '566',
       '822

In [72]:
def map_diag_code(diag):
    code = float(diag) if str(diag).isdigit() else np.nan
    if pd.isna(code): return 'other'
    
    if 250 <= code < 251: return 'diabetes'
    elif 390 <= code <= 459: return 'cardiovascular'
    elif code in [428, 414, 410]: return 'high_risk_cardiac'
    elif 580 <= code <= 629: return 'renal'
    elif code in [3051, 303, 305]: return 'substance_abuse'
    else: return 'other'


for col in ['diag_1', 'diag_2', 'diag_3']:
    df[f'{col}_group'] = df[col].apply(map_diag_code)
    
df['comorbidity_count'] = df[['diag_1_group', 'diag_2_group', 'diag_3_group']].nunique(axis=1)

In [73]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source,readmitted_binary,total_prior_admissions,avg_stay_duration,max_stay_duration,lifetime_inpatient_visits,lifetime_emergency_visits,prior_30day_readmits,expiration_ind,readmitted_lt30_ind,readmitted_gt30_ind,readmitted_no_ind,readmitted_ind,encounter_ct,mb_time_in_hospital,mb_readmitted_lt30_ct,mb_readmitted_gt30_ct,mb_readmitted_no_ct,mb_num_lab_procedures_ct,mb_num_procedures_ct,mb_num_medications_ct,mb_number_outpatient_ct,mb_number_emergency_ct,mb_number_inpatient_ct,mb_number_diagnoses_ct,diag_1_group,diag_2_group,diag_3_group,comorbidity_count
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Unknown,59,0,18,0,0,0,276,250.01,255,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral,Discharged to home,Emergency Room,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,59,0,18,0,0,0,9,other,other,other,1
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Unknown,11,5,13,2,0,1,648,250.0,V27,6,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,1,0,0,0,0,0,1,0,1,2,0,0,1,11,5,13,2,0,1,6,other,diabetes,other,2
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,Unknown,44,1,16,0,0,0,8,250.43,403,7,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,0,0,0,0,0,0,1,0,1,2,0,0,1,44,1,16,0,0,0,7,other,other,cardiovascular,2
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,Unknown,51,0,8,0,0,0,197,157.0,250,5,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,1.0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,51,0,8,0,0,0,5,other,other,diabetes,2
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Unknown,31,6,16,0,0,0,414,411.0,250,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Clinic Referral,Discharged to home,Clinic Referral,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,31,6,16,0,0,0,9,cardiovascular,cardiovascular,diabetes,2


## 3. Medication Patterns

In [74]:
# Medication complexity features
df['active_med_count'] = df[drugs].apply(lambda x: sum(x != 'No'), axis=1)
df['med_intensity'] = df[drugs].apply(
    lambda x: sum(1 for med in x if med in ['Up', 'Down']), axis=1)

# New: Therapy adherence proxy
df['therapy_instability'] = df[drugs].apply(
    lambda x: sum(1 for med in x if med in ['Up', 'Down', 'Steady']), axis=1) / len(drugs)

In [75]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source,readmitted_binary,total_prior_admissions,avg_stay_duration,max_stay_duration,lifetime_inpatient_visits,lifetime_emergency_visits,prior_30day_readmits,expiration_ind,readmitted_lt30_ind,readmitted_gt30_ind,readmitted_no_ind,readmitted_ind,encounter_ct,mb_time_in_hospital,mb_readmitted_lt30_ct,mb_readmitted_gt30_ct,mb_readmitted_no_ct,mb_num_lab_procedures_ct,mb_num_procedures_ct,mb_num_medications_ct,mb_number_outpatient_ct,mb_number_emergency_ct,mb_number_inpatient_ct,mb_number_diagnoses_ct,diag_1_group,diag_2_group,diag_3_group,comorbidity_count,active_med_count,med_intensity,therapy_instability
0,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,Unknown,59,0,18,0,0,0,276,250.01,255,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Physician Referral,Discharged to home,Emergency Room,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,59,0,18,0,0,0,9,other,other,other,1,1,1,0.043478
1,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,Unknown,11,5,13,2,0,1,648,250.0,V27,6,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,1,0,0,0,0,0,1,0,1,2,0,0,1,11,5,13,2,0,1,6,other,diabetes,other,2,1,0,0.043478
2,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,Unknown,44,1,16,0,0,0,8,250.43,403,7,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,2.0,2,0,0,0,0,0,0,1,0,1,2,0,0,1,44,1,16,0,0,0,7,other,other,cardiovascular,2,1,1,0.043478
3,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,Unknown,51,0,8,0,0,0,197,157.0,250,5,Not Measured,Not Measured,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral,Discharged to home,Emergency Room,0,1,1.0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,51,0,8,0,0,0,5,other,other,diabetes,2,2,0,0.086957
4,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,Unknown,31,6,16,0,0,0,414,411.0,250,9,Not Measured,Not Measured,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Clinic Referral,Discharged to home,Clinic Referral,1,1,3.0,3,0,0,0,0,0,1,0,1,1,3,0,1,0,31,6,16,0,0,0,9,cardiovascular,cardiovascular,diabetes,2,1,0,0.043478


In [76]:
df.shape

(100241, 82)

In [77]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'discharge_disposition', 'admission_source',
       'readmitted_binary', 'total_prior_admis

In [78]:
# Score de complexité médicamenteuse
df['med_complexity_score'] = (
    df['num_medications'] * 0.5 + 
    df['active_med_count'] * 0.3 + 
    df['med_intensity'] * 0.2)


In [79]:
# Indicateur de diagnostic chronique majeur
chronic_conditions = ['250', '401', '428', '496']  # Codes ICD9 pour diabète, hypertension, etc.
df['major_chronic_diag'] = (
    df['diag_1'].isin(chronic_conditions).astype(int) + 
    df['diag_2'].isin(chronic_conditions).astype(int) + 
    df['diag_3'].isin(chronic_conditions).astype(int))

# Variabilité des diagnostics
df['diag_variability'] = df.apply(
    lambda row: len(set([row['diag_1_group'], row['diag_2_group'], row['diag_3_group']])), axis=1)

In [80]:
# Tendance des visites aux urgences
df['emergency_visit_trend'] = df.groupby('patient_nbr')['number_emergency'].transform(
    lambda x: x.diff().mean())

# Ratio visites urgentes/planifiées
df['urgent_care_ratio'] = (df['number_emergency'] + 1) / (df['number_outpatient'] + 1)

In [81]:
# Ratio durée séjour/nombre procédures
df['stay_procedure_ratio'] = df['time_in_hospital'] / (df['num_procedures'] + 1)

# Indicateur de séjour anormalement court/long
df['abnormal_stay'] = pd.cut(df['time_in_hospital'], 
                            bins=[0, 2, 5, 10, float('inf')], 
                            labels=['very_short', 'short', 'normal', 'long'])

In [82]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'discharge_disposition', 'admission_source',
       'readmitted_binary', 'total_prior_admis

In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100241 entries, 0 to 100240
Data columns (total 89 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   encounter_id               100241 non-null  int64   
 1   patient_nbr                100241 non-null  int64   
 2   race                       100241 non-null  object  
 3   gender                     100241 non-null  object  
 4   age                        100241 non-null  object  
 5   admission_type_id          100241 non-null  int64   
 6   discharge_disposition_id   100241 non-null  int64   
 7   admission_source_id        100241 non-null  int64   
 8   time_in_hospital           100241 non-null  int64   
 9   medical_specialty          100241 non-null  object  
 10  num_lab_procedures         100241 non-null  int64   
 11  num_procedures             100241 non-null  int64   
 12  num_medications            100241 non-null  int64   
 13  number_outpati

In [84]:
to_drop.extend([
    'encounter_id',
    'patient_nbr',      
    'discharge_disposition',       
    'diag_1',                
    'diag_2',                
    'diag_3',               
    # 4. Rarely used medications (<1% prevalence)
    'nateglinide',
    'chlorpropamide',
    'acetohexamide',
    'tolbutamide',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
    
    # 6. Low-variance administrative features
    'medical_specialty'      # Too many categories (84+), often missing
])

In [85]:
to_drop

['readmitted_lt30_ind',
 'readmitted_gt30_ind',
 'readmitted_no_ind',
 'encounter_id',
 'patient_nbr',
 'discharge_disposition',
 'diag_1',
 'diag_2',
 'diag_3',
 'nateglinide',
 'chlorpropamide',
 'acetohexamide',
 'tolbutamide',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'medical_specialty']

In [86]:
df.drop(columns=to_drop, inplace=True)
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'admission_source', 'readmitted_binary',
       'total_prior_admissions', 'avg_stay_duration', 'max_stay_duration',
       'lifetime_inpatient_visits', 'lifetime_emergency_visits',
       'prior_30day_readmits', 'expiration_ind', 'readmitted_ind',
       'encounter_ct', 'mb_time_in_hospital', 'mb_readmitted_lt30_ct',
       'mb_readmitted_gt30_ct', 'mb_readmitted_no_ct',
       'mb_num_lab_procedures_ct', 'mb_num_procedures_ct',
       'mb_num_medications_ct', 'mb_number_outpatient_

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100241 entries, 0 to 100240
Data columns (total 64 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   race                       100241 non-null  object  
 1   gender                     100241 non-null  object  
 2   age                        100241 non-null  object  
 3   admission_type_id          100241 non-null  int64   
 4   discharge_disposition_id   100241 non-null  int64   
 5   admission_source_id        100241 non-null  int64   
 6   time_in_hospital           100241 non-null  int64   
 7   num_lab_procedures         100241 non-null  int64   
 8   num_procedures             100241 non-null  int64   
 9   num_medications            100241 non-null  int64   
 10  number_outpatient          100241 non-null  int64   
 11  number_emergency           100241 non-null  int64   
 12  number_inpatient           100241 non-null  int64   
 13  number_diagnos

In [88]:
df.drop(columns=['readmitted_binary','readmitted_ind'], inplace=True)

In [89]:
df.to_csv("data/diabetic_preprocess_2.csv", index=False)

In [90]:
df.shape

(100241, 62)