In [1]:

import pandas as pd
import numpy as np
pd.set_option('display.max_info_columns', 200)

In [2]:

# https://archive.ics.uci.edu/dataset/296/medical_data+130-us+hospitals+for+years+1999-2008
medical_data=pd.read_csv('/Users/helen/Documents/Rob - Humber/Capstone/diabetic_data.csv')

print(medical_data.shape)
print(medical_data.columns)

(101766, 50)
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


In [3]:

#dropping features considered in exploratory work (separate ipynb files)
drop=['race', 'weight', 'medical_specialty', 'max_glu_serum']

In [4]:

admission_type_id = { 1 : 'Emergency'
, 2 : 'Urgent'
, 3 : 'Elective'
, 4 : 'Newborn'
, 5 : 'Not Available'
, 6 : 'NULL'
, 7 : 'Trauma Center'
, 8 : 'Not Mapped' }

In [5]:

discharge_disposition_id = { 1 : 'Discharged to home'
, 2 : 'Discharged/transferred to another short term hospital'
, 3 : 'Discharged/transferred to SNF'
, 4 : 'Discharged/transferred to ICF'
, 5 : 'Discharged/transferred to another type of inpatient care institution'
, 6 : 'Discharged/transferred to home with home health service'
, 7 : 'Left AMA'
, 8 : 'Discharged/transferred to home under care of Home IV provider'
, 9 : 'Admitted as an inpatient to this hospital'
, 10 : 'Neonate discharged to another hospital for neonatal aftercare'
, 11 : 'Expired'
, 12 : 'Still patient or expected to return for outpatient services'
, 13 : 'Hospice / home'
, 14 : 'Hospice / medical facility'
, 15 : 'Discharged/transferred within this institution to Medicare approved swing bed'
, 16 : 'Discharged/transferred/referred another institution for outpatient services'
, 17 : 'Discharged/transferred/referred to this institution for outpatient services'
, 18 : 'NULL'
, 19 : 'Expired at home. Medicaid only, hospice'
, 20 : 'Expired in a medical facility. Medicaid only, hospice'
, 21 : 'Expired, place unknown. Medicaid only, hospice'
, 22 : 'Discharged/transferred to another rehab fac including rehab units of a hospital'
, 23 : 'Discharged/transferred to a long term care hospital'
, 24 : 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare'
, 25 : 'Not Mapped'
, 26 : 'Unknown/Invalid'
, 30 : 'Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere'
, 27 : 'Discharged/transferred to a federal health care facility'
, 28 : 'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital'
, 29 : 'Discharged/transferred to a Critical Access Hospital (CAH)' }

In [6]:

admission_source_id = { 1 : 'Physician Referral'
, 2 : 'Clinic Referral'
, 3 : 'HMO Referral'
, 4 : 'Transfer from a hospital'
, 5 : 'Transfer from a Skilled Nursing Facility (SNF)'
, 6 : 'Transfer from another health care facility'
, 7 : 'Emergency Room'
, 8 : 'Court/Law Enforcement'
, 9 :  'Not Available'
, 10 : 'Transfer from critial access hospital'
, 11 : 'Normal Delivery'
, 12 : 'Premature Delivery'
, 13 : 'Sick Baby'
, 14 : 'Extramural Birth'
, 15 : 'Not Available'
, 17 : 'NULL'
, 18 : 'Transfer From Another Home Health Agency'
, 19 : 'Readmission to Same Home Health Agency'
, 20 : 'Not Mapped'
, 21 : 'Unknown/Invalid'
, 22 : 'Transfer from hospital inpt/same fac reslt in a sep claim'
, 23 : 'Born inside this hospital'
, 24 : 'Born outside this hospital'
, 25 : 'Transfer from Ambulatory Surgery Center'
, 26 : 'Transfer from Hospice'
                      }

In [7]:

medical_data['expiration_ind'] = medical_data['discharge_disposition_id'].isin([11,13,14,19,20,21]).astype('int')

In [8]:

#install values from lookup dictionaries
medical_data['admission_type'] = medical_data['admission_type_id'].map(admission_type_id)
medical_data['discharge_disposition'] = medical_data['discharge_disposition_id'].map(discharge_disposition_id)
medical_data['admission_source'] = medical_data['admission_source_id'].map(admission_source_id)


del admission_type_id
del discharge_disposition_id 
del admission_source_id

medical_data['admission_grp_1_ind'] = ( medical_data['admission_type'].isin(['NULL','Emergency'])).astype(int)
medical_data['admission_grp_2_ind'] = ( medical_data['admission_type'].isin(['Elective','Not Mapped'])).astype(int)

medical_data['discharge_grp_1_ind'] = ( medical_data['discharge_disposition'].isin(['Discharged/transferred to a long term care hospital'
                                                                           ,'NULL'
                                                                           ,'Discharged to home'])).astype(int)

medical_data['discharge_grp_2_ind'] = ( medical_data['discharge_disposition'].isin(['Left AMA'
                                                                            ,'Discharged/transferred to another type of inpatient care institution'
                                                                            ,'Discharged/transferred to SNF'
                                                                            ,'Discharged/transferred to home with home health service'
                                                                            ,'Discharged/transferred to another rehab fac including rehab units of a hospital'])).astype(int)

medical_data['admission_type_ind'] = ( medical_data['admission_source'].isin(['Clinic Referral'
                                                                     ,'Transfer from a hospital'
                                                                     ,'Transfer from another health care facility'])).astype(int)

medical_data['mb_admission_grp_1_ct'] = medical_data.groupby('patient_nbr')['admission_grp_1_ind'].transform('sum')
medical_data['mb_admission_grp_2_ct'] = medical_data.groupby('patient_nbr')['admission_grp_2_ind'].transform('sum')
medical_data['mb_discharge_grp_1_ct'] = medical_data.groupby('patient_nbr')['discharge_grp_1_ind'].transform('sum')
medical_data['mb_discharge_grp_2_ct'] = medical_data.groupby('patient_nbr')['discharge_grp_2_ind'].transform('sum')
medical_data['mb_admission_type_ct']  = medical_data.groupby('patient_nbr')['admission_type_ind'].transform('sum')

drop.extend(['payer_code', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id'])
drop.extend(['admission_grp_1_ind','admission_grp_2_ind','discharge_grp_1_ind', 'discharge_grp_2_ind','admission_type_ind'])


In [9]:

#clean dx codes and replace missing values with ZZZ
medical_data['diag_1'] = medical_data['diag_1'].astype(str).str[ :3]
medical_data['diag_2'] = medical_data['diag_2'].astype(str).str[ :3]
medical_data['diag_3'] = medical_data['diag_3'].astype(str).str[ :3]

medical_data['diag_1'] = medical_data['diag_1'].replace('?', 'ZZZ')
medical_data['diag_2'] = medical_data['diag_2'].replace('?', 'ZZZ')
medical_data['diag_3'] = medical_data['diag_3'].replace('?', 'ZZZ')

In [10]:

#add patient-level count of unique diagnoses codes 
# Melt the dataset to have all diagnoses in a single column
diagnosis_melted = medical_data.melt(id_vars=['patient_nbr'], value_vars=['diag_1', 'diag_2', 'diag_3'])

# Group by patient_nbr and count distinct diagnosis codes
distinct_counts = diagnosis_melted.groupby('patient_nbr')['value'].nunique().reset_index()

# Rename columns for clarity
distinct_counts.columns = ['patient_nbr', 'distinct_diag_count']

medical_data = medical_data.merge(distinct_counts, on='patient_nbr', how='left')

del diagnosis_melted, distinct_counts

In [11]:

# Read the dx code lookup into dataframe 
df = pd.read_csv("unique_diag_df_edit.csv")

# Convert DataFrame to a dictionary with 'diagnosis_cd' as keys and 'diagnosis' as values
diag_dict = dict(zip(df['diagnosis_cd'], df['diagnosis']))

diag_dict['ZZZ'] = 'No diag'  # Adding a new key-value pair

del df

#install descriptive diagnoses
medical_data['diagnosis_1'] = medical_data['diag_1'].map(diag_dict)
medical_data['diagnosis_2'] = medical_data['diag_2'].map(diag_dict)
medical_data['diagnosis_3'] = medical_data['diag_3'].map(diag_dict)

In [12]:

medical_data['diag_1_freq'] = medical_data.groupby('diag_1')['diag_1'].transform('count')
medical_data['diag_2_freq'] = medical_data.groupby('diag_2')['diag_2'].transform('count')
medical_data['diag_3_freq'] = medical_data.groupby('diag_3')['diag_3'].transform('count')

In [13]:

# Major ordered drivers of readmit
# medical_data['diag_1_428_ind'] = ( medical_data['diag_1']=='428' ).astype(int) #CHF NOS
# medical_data['diag_1_491_ind'] = ( medical_data['diag_1']=='491' ).astype(int) #SIMPLE CHR BRONCHITIS
# medical_data['diag_1_493_ind'] = ( medical_data['diag_1']=='493' ).astype(int) #EXTRINSIC ASTHMA NOS

# medical_data['diag_2_403_ind'] = ( medical_data['diag_2']=='403' ).astype(int) #MAL HY KID W CR KID I-IV
# medical_data['diag_2_707_ind'] = ( medical_data['diag_2']=='707' ).astype(int) #DECUBITUS ULCER
# medical_data['diag_2_585_ind'] = ( medical_data['diag_2']=='585' ).astype(int) #CHRONIC RENAL FAILURE
# medical_data['diag_2_491_ind'] = ( medical_data['diag_2']=='491' ).astype(int) #SIMPLE CHR BRONCHITIS

# medical_data['diag_3_403_ind'] = ( medical_data['diag_3']=='403' ).astype(int) #MAL HY KID W CR KID I-IV
# medical_data['diag_3_585_ind'] = ( medical_data['diag_3']=='585' ).astype(int) #CHRONIC RENAL FAILURE
# medical_data['diag_3_707_ind'] = ( medical_data['diag_3']=='707' ).astype(int) #DECUBITUS ULCER

# Indicator variables for Life-Threatening Infections & Sepsis
for code in ['38', '40', '36', '320', '324']:
    medical_data[f'LTIS_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'LTIS_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'LTIS_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Cardiovascular Emergencies
for code in ['410', '430', '431', '415', '428']:
    medical_data[f'CE_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'CE_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'CE_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Cancer (Malignant Neoplasms)
for code in ['155', '162', '191', '197', '199']:
    medical_data[f'CMN_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'CMN_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'CMN_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Organ Failure
for code in ['570', '584', '585', '277']:
    medical_data[f'OF_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'OF_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'OF_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Neurological & Brain Disorders
for code in ['331', '340', '780', '852']:
    medical_data[f'NBD_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'NBD_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'NBD_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Severe Trauma & Injuries
for code in ['806', '861', '864', '958']:
    medical_data[f'STI_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'STI_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'STI_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)

# Indicator variables for Other Critical Conditions
for code in ['250', '995', '986', '989']:
    medical_data[f'OCC_{code}_ind'] = (medical_data['diag_1'] == code).astype(int)
    medical_data[f'OCC_{code}_ind'] = (medical_data['diag_2'] == code).astype(int)
    medical_data[f'OCC_{code}_ind'] = (medical_data['diag_3'] == code).astype(int)


In [14]:

# medical_data['diag_1_driver_ind'] = medical_data['diag_1'].isin([ '403' #MAL HY KID W CR KID I-IV
# , '787' #NAUSEA WITH VOMITING
# , '404' #MAL HY HT/KD I-IV W/O HF
# , '707' #DECUBITUS ULCER
# , '572' #ABSCESS OF LIVER
# , '730' #AC OSTEOMYELITIS-UNSPEC
# , 'V58' #RADIOTHERAPY ENCOUNTER
# , '537' #ACQ PYLORIC STENOSIS
# , '443' #RAYNAUD'S SYNDROME
# , '292' #DRUG WITHDRAWAL
# , '496' #CHR AIRWAY OBSTRUCT NEC
# , '585' #CHRONIC RENAL FAILURE
# , '282' #HEREDITARY SPHEROCYTOSIS
# , '799' #ASPHYXIA
# , '284' #CONGEN APLASTIC ANEMIA
# , '567' #PERITONITIS IN INFEC DIS
# , '293' #DELIRIUM D/T OTHER COND
# , '924' #CONTUSION OF THIGH
# , '340' #MULTIPLE SCLEROSIS
# , '514' #PULM CONGEST/HYPOSTASIS
# , '485' #BRONCHOPNEUMONIA ORG NOS
# , '714' #RHEUMATOID ARTHRITIS
# , '277' #CYSTIC FIBROS W/O ILEUS
# , '150' #MAL NEO CERVICAL ESOPHAG
# , '135' #SARCOIDOSIS
# , '522']).astype(int) #PULPITIS


In [15]:

# medical_data['diag_2_driver_ind'] = medical_data['diag_2'].isin([ '571' #ALCOHOLIC FATTY LIVER
# , '404' #MAL HY HT/KD I-IV W/O HF
# , '536' #ACHLORHYDRIA
# , '202' #NDLR LYM UNSP XTRNDL ORG
# , '396' #MITRAL/AORTIC STENOSIS
# , '304' #OPIOID DEPENDENCE-UNSPEC
# , '444' #ABD AORTIC EMBOLISM
# , '581' #NEPHROTIC SYN, PROLIFER
# , '731' #OSTEITIS DEFORMANS NOS
# , 'E94' #ADV EFF ANALEPTICS
# , '397' #TRICUSPID VALVE DISEASE
# , '595' #ACUTE CYSTITIS
# , '205' #AC MYL LEUK WO ACHV RMSN
# , '490' #BRONCHITIS NOS
# , '459' #HEMORRHAGE NOS
# , '189' #MALIG NEOPL KIDNEY
# , '154' #MAL NEO RECTOSIGMOID JCT
# , '332' #PARALYSIS AGITANS
# , 'V49' #DEFICIENCIES OF LIMBS
# , '681' #CELLULITIS, FINGER NOS
# , '150' #MAL NEO CERVICAL ESOPHAG
# , '537' #ACQ PYLORIC STENOSIS
# , '094' ]).astype(int) #TABES DORSALIS



In [16]:

# medical_data['diag_3_driver_ind'] = medical_data['diag_3'].isin([  '682' #CELLULITIS OF FACE
# , '070' #HEPATITIS A WITH COMA
# , '536' #ACHLORHYDRIA
# , 'V42' #KIDNEY TRANSPLANT STATUS
# , '443' #RAYNAUD'S SYNDROME
# , '304' #OPIOID DEPENDENCE-UNSPEC
# , '284' #CONGEN APLASTIC ANEMIA
# , '466' #ACUTE BRONCHITIS
# , '459' #HEMORRHAGE NOS
# , '581' #NEPHROTIC SYN, PROLIFER
# , '337' #IDIOPATH AUTO NEUROPATHY
# , '583' #PROLIFERAT NEPHRITIS NOS
# , '203' #MULT MYE W/O ACHV RMSON
# , 'V46' #DEPENDENCE ON ASPIRATOR
# , '482' #K. PNEUMONIAE PNEUMONIA
# , 'V49' #DEFICIENCIES OF LIMBS
# , '444' #ABD AORTIC EMBOLISM
# , '174' #MALIG NEO NIPPLE
# , '456' #ESOPHAG VARICES W BLEED
# , '519' #TRACHEOSTOMY COMP NOS
# , 'E92' #ACC-POWERED LAWN MOWER
# , 'V62' #UNEMPLOYMENT
# , '356' #HERED PERIPH NEUROPATHY
# , '711' #PYOGEN ARTHRITIS-UNSPEC
# , '053' ]).astype(int)  #HERPES ZOSTER MENINGITIS


In [17]:
import pandas as pd

# --- Pre-computation Setup ---------------------------------------------------
# This script assumes you have a pandas DataFrame named 'medical_data'.
# It also assumes you have a dictionary named 'diag_dict' that maps
# diagnosis codes (like '428') to their descriptions (like 'Congestive Heart Failure').
#
# Example:
# medical_data = pd.read_csv('your_data.csv')
# diag_dict = {'428': 'Congestive Heart Failure', '250': 'Diabetes Mellitus', ...}
# -----------------------------------------------------------------------------


# --- Step 1: Combine and Sort Diagnosis Codes --------------------------------
# The goal here is to treat the three diagnosis columns as a single entity.
# We sort them so that the order doesn't matter when we count combinations.
# For example, a patient with diagnoses (428, 250, 401) will be treated the same
# as a patient with (250, 401, 428).

print("Step 1: Combining and sorting diagnoses for each patient...")

# Create a new column 'sorted_diagnoses_list' that contains a sorted list
# of the three diagnosis codes for each patient.
diagnosis_columns = ['diag_1', 'diag_2', 'diag_3']
medical_data['sorted_diagnoses_list'] = medical_data[diagnosis_columns].values.tolist()
medical_data['sorted_diagnoses_list'] = medical_data['sorted_diagnoses_list'].apply(sorted)

print(" -> Done. Created 'sorted_diagnoses_list' column.\n")


# --- Step 2: Calculate the Frequency of Each Diagnosis Combination ----------
# Now, we count how many times each unique *combination* of three diagnoses appears
# in the entire dataset.

print("Step 2: Calculating frequency of diagnosis combinations...")

# To use groupby, we need a hashable type, so we convert the list to a tuple.
medical_data['sorted_diagnoses_tuple'] = medical_data['sorted_diagnoses_list'].apply(tuple)

# Use groupby().transform('count') to count occurrences of each tuple and
# assign that count to every row with that same tuple.
medical_data['diagnosis_combo_frequency'] = medical_data.groupby('sorted_diagnoses_tuple')['sorted_diagnoses_tuple'].transform('count')

# We no longer need the temporary tuple column, so we can drop it.
medical_data.drop(columns=['sorted_diagnoses_tuple'], inplace=True)

print(" -> Done. Created 'diagnosis_combo_frequency' column.\n")


# --- Step 3: Analyze Frequencies of Individual Sorted Diagnoses -------------
# This step creates three new columns containing the sorted diagnoses. This makes it
# easy to see the most common primary, secondary, and tertiary diagnoses
# *after* they have been put in a consistent order.

print("Step 3: Analyzing frequencies of individual diagnoses in their sorted positions...")

# Split the sorted list back into three separate columns.
medical_data[['sorted_diag_1', 'sorted_diag_2', 'sorted_diag_3']] = pd.DataFrame(
    medical_data['sorted_diagnoses_list'].tolist(),
    index=medical_data.index
)

# Calculate the frequency of each diagnosis code in each of the three sorted positions.
medical_data['sorted_diag_1_frequency'] = medical_data.groupby('sorted_diag_1')['sorted_diag_1'].transform('count')
medical_data['sorted_diag_2_frequency'] = medical_data.groupby('sorted_diag_2')['sorted_diag_2'].transform('count')
medical_data['sorted_diag_3_frequency'] = medical_data.groupby('sorted_diag_3')['sorted_diag_3'].transform('count')

print(" -> Done. Created sorted diagnosis columns and their frequency columns.\n")


# --- Step 4: Final Formatting and Cleanup ------------------------------------
# The final steps are to create a clean string representation of the diagnosis
# combination and to map the diagnosis codes to their human-readable descriptions.

print("Step 4: Formatting and cleaning up...")

# Create a user-friendly string representation of the sorted diagnosis combination.
# e.g., ['250', '401', '428'] becomes '(250 401 428)'
medical_data['diagnosis_combo_string'] = medical_data['sorted_diagnoses_list'].apply(lambda x: f"({' '.join(map(str, x))})")

# Now that we're done with all calculations, drop the list column.
medical_data.drop(columns=['sorted_diagnoses_list'], inplace=True)

# Use the pre-defined 'diag_dict' to replace codes with full text descriptions.
# This makes the final DataFrame much easier to read.
# We apply this to the sorted columns for clarity.
# Note: Ensure 'diag_dict' is defined before this step!
if 'diag_dict' in locals() or 'diag_dict' in globals():
    medical_data['sorted_diag_1'] = medical_data['sorted_diag_1'].map(diag_dict)
    medical_data['sorted_diag_2'] = medical_data['sorted_diag_2'].map(diag_dict)
    medical_data['sorted_diag_3'] = medical_data['sorted_diag_3'].map(diag_dict)
    del diag_dict # Clean up the dictionary from memory
else:
    print("Warning: 'diag_dict' not found. Skipping diagnosis description mapping.")

print(" -> Done. Final formatting complete.\n")


# --- Step 5: Create Indicator for High-Risk Diagnosis Combinations ---------
# This step creates a binary flag for patients with diagnosis combinations
# that are known to be clinically significant and may lead to higher readmission rates.

print("Step 5: Creating a high-risk patient indicator...")

# Define a custom list of high-risk diagnosis combinations.
# These combinations often represent severe comorbidity scenarios for diabetic patients.
# Note: Codes within each tuple are pre-sorted to match 'diagnosis_combo_string'.
high_risk_combinations = [
    ' (250 401 428)', # Diabetes, Hypertension, Congestive Heart Failure
    ' (250 410 428)', # Diabetes, Heart Attack, Congestive Heart Failure
    ' (250 403 585)', # Diabetes, Hypertensive Chronic Kidney Disease, Chronic Kidney Disease
    ' (250 428 585)', # Diabetes, Congestive Heart Failure, Chronic Kidney Disease
    ' (250 486 496)', # Diabetes, Pneumonia, Chronic Airway Obstruction
    ' (250 682 707)', # Diabetes, Cellulitis, Decubitus Ulcer
    ' (414 427 428)', # Coronary Atherosclerosis, Cardiac Dysrhythmias, Heart Failure
]

# Create the indicator column. It will be 1 if the patient's diagnosis combo
# is in our high-risk list, and 0 otherwise.
medical_data['is_high_risk_combo'] = medical_data['diagnosis_combo_string'].isin(high_risk_combinations).astype(int)

print(" -> Done. Created 'is_high_risk_combo' indicator column.\n")
print("Data processing finished. Check the 'medical_data' DataFrame for new columns.")


Step 1: Combining and sorting diagnoses for each patient...
 -> Done. Created 'sorted_diagnoses_list' column.

Step 2: Calculating frequency of diagnosis combinations...
 -> Done. Created 'diagnosis_combo_frequency' column.

Step 3: Analyzing frequencies of individual diagnoses in their sorted positions...
 -> Done. Created sorted diagnosis columns and their frequency columns.

Step 4: Formatting and cleaning up...
 -> Done. Final formatting complete.

Step 5: Creating a high-risk patient indicator...
 -> Done. Created 'is_high_risk_combo' indicator column.

Data processing finished. Check the 'medical_data' DataFrame for new columns.


In [18]:

dx_list=[ '428'
, '403'
, '707'
, '585'
, '491'
, '396'
, '440'
, '453'
, '571'
, '284'
, '304'
, '482'
, '150'
, '282'
, '332'
, '443'
, '719'
, '423'
, '281'
, '536'
, '368'
, '515'
, '595'
, '572'
, '681'
, '581'
, '537'
, '490'
, '583'
, 'V46'
, '519'
, '300'
, '567'
, 'E92'
, 'V49'
, '094'
, '514'
, '494'
, '042'
, '404'
, '346'
, '792'
, '398'
, '753'
, '577'
, '730'
, '444'
, '459'
, '790'
, '337'
, '397'
, '292'
, 'V42'
, '289']


In [19]:

# Create dx_events with the selected columns
dx_events = medical_data[['patient_nbr', 'encounter_id', 'diag_1', 'diag_2', 'diag_3']].copy(deep=True)

# Add new empty columns for each element in dx_list
for dx in dx_list:
    dx_events[f'dx_{dx}_ind'] = pd.NA  # Creates empty columns with missing values (best for Pandas)

print(dx_events.shape)

(101766, 59)


In [20]:

# populate each column
for dx in dx_list:
    dx_events[f'dx_{dx}_ind'] = ((dx_events['diag_1'] == dx) |
                                 (dx_events['diag_2'] == dx) |
                                 (dx_events['diag_3'] == dx) ).astype(int)
print('done')   

done


In [21]:

# Group by patient_nbr and compute max and sum for each diagnosis indicator
dx_aggregated = dx_events.groupby('patient_nbr').agg(
    {f'dx_{dx}_ind': ['max', 'sum'] for dx in dx_list}
)

# Rename columns to match SQL-style naming
dx_aggregated.columns = [f'{col[0]}_{col[1]}' for col in dx_aggregated.columns]

# Reset index to bring patient_nbr back as a column
dx_aggregated = dx_aggregated.reset_index()

dx_aggregated.to_csv('dx_aggregated_ck.csv',index=False)
del dx_events

In [22]:

print(medical_data.shape)
medical_data = medical_data.merge(dx_aggregated, on='patient_nbr', how='left')
print(medical_data.shape)
del dx_aggregated

(101766, 111)
(101766, 219)


In [23]:

medical_data['alcohol_ind'] = medical_data[['diagnosis_1', 'diagnosis_2', 'diagnosis_3']].apply(
    lambda row: int(any('ALCOHOL' in str(val) for val in row)), axis=1
)

medical_data['obesity_ind'] = medical_data[['diagnosis_1', 'diagnosis_2', 'diagnosis_3']].apply(
    lambda row: int(any('OBESITY' in str(val) for val in row)), axis=1
)

medical_data['mh_ind'] = medical_data[['diagnosis_1', 'diagnosis_2', 'diagnosis_3']].apply(
    lambda row: int(any('MALIGNANT HYPERTENSION' in str(val) for val in row)), axis=1
)

In [24]:

medical_data['alcohol_history_ind'] = medical_data.groupby('patient_nbr')['alcohol_ind'].transform('max')
medical_data['obesity_history_ind'] = medical_data.groupby('patient_nbr')['obesity_ind'].transform('max')
medical_data['mh_history_ind'] = medical_data.groupby('patient_nbr')['mh_ind'].transform('max')

drop.extend(['alcohol_ind', 'obesity_ind', 'mh_ind'])

In [25]:

medical_data['readmitted_lt30_ind'] = ( medical_data['readmitted']=='<30' ).astype(int)
medical_data['readmitted_gt30_ind'] = ( medical_data['readmitted']=='>30' ).astype(int)
medical_data['readmitted_no_ind'] = ( medical_data['readmitted']=='NO' ).astype(int)
medical_data['readmitted_ind'] = medical_data['readmitted_lt30_ind'] + medical_data['readmitted_gt30_ind']

In [26]:

#add patient-level some features
medical_data['encounter_ct'] = medical_data['patient_nbr'].map(medical_data.groupby('patient_nbr')['encounter_id'].nunique())
medical_data['mb_time_in_hospital'] = medical_data.groupby('patient_nbr')['time_in_hospital'].transform('sum')
medical_data['mb_readmitted_lt30_ct'] = medical_data.groupby('patient_nbr')['readmitted_lt30_ind'].transform('sum')
medical_data['mb_readmitted_gt30_ct'] = medical_data.groupby('patient_nbr')['readmitted_gt30_ind'].transform('sum')
medical_data['mb_readmitted_no_ct'] = medical_data.groupby('patient_nbr')['readmitted_no_ind'].transform('sum')
medical_data['mb_num_lab_procedures_ct'] = medical_data.groupby('patient_nbr')['num_lab_procedures'].transform('sum')
medical_data['mb_num_procedures_ct'] = medical_data.groupby('patient_nbr')['num_procedures'].transform('sum')
medical_data['mb_num_medications_ct'] = medical_data.groupby('patient_nbr')['num_medications'].transform('sum')
medical_data['mb_number_outpatient_ct'] = medical_data.groupby('patient_nbr')['number_outpatient'].transform('sum')
medical_data['mb_number_emergency_ct'] = medical_data.groupby('patient_nbr')['number_emergency'].transform('sum')
medical_data['mb_number_inpatient_ct'] = medical_data.groupby('patient_nbr')['number_inpatient'].transform('sum')
medical_data['mb_number_diagnoses_ct'] = medical_data.groupby('patient_nbr')['number_diagnoses'].transform('sum')

drop.extend(['readmitted_lt30_ind', 'readmitted_gt30_ind', 'readmitted_no_ind'])

In [27]:

# delete patients whose history includes childhood or old age

medical_data['under20_ind'] = medical_data['age'].isin(['[0-10)','[10-20)']).astype(int)
medical_data['over80_ind']  = medical_data['age'].isin(['[80-90)', '[90-100)']).astype(int)

medical_data['mb_under20_ind'] = medical_data.groupby('patient_nbr')['under20_ind'].transform('max')
medical_data['mb_over80_ind'] = medical_data.groupby('patient_nbr')['over80_ind'].transform('max')

drop.extend(['under20_ind', 'over80_ind'])

In [28]:
drop.extend([ 'citoglipton'
, 'examide'
, 'acetohexamide'
, 'glimepiride-pioglitazone'
, 'metformin-pioglitazone'
, 'metformin-rosiglitazone'
, 'troglitazone'
, 'glipizide-metformin'
, 'tolbutamide'
, 'miglitol'
, 'tolazamide'
, 'chlorpropamide'
, 'acarbose'
, 'nateglinide'
, 'glyburide-metformin'
, 'repaglinide'
 ])

In [29]:

print(medical_data.groupby('encounter_ct')['patient_nbr'].nunique().reset_index().sort_values('encounter_ct'))

    encounter_ct  patient_nbr
0              1        54745
1              2        10434
2              3         3328
3              4         1421
4              5          717
5              6          346
6              7          207
7              8          111
8              9           70
9             10           42
10            11           20
11            12           19
12            13           14
13            14            5
14            15            9
15            16            4
16            17            3
17            18            6
18            19            3
19            20            6
20            21            1
21            22            2
22            23            3
23            28            1
24            40            1


In [30]:

print(medical_data.shape)
#remove very young and very old members, members who died, unknown gender
medical_data = medical_data[
    (medical_data['expiration_ind'] != 1 ) &
    (medical_data['mb_under20_ind'] != 1 ) &
    (medical_data['mb_over80_ind']  != 1 ) &
    (medical_data['gender'] != 'Unknown/Invalid') &  # Exclude records with gender 'Unknown/Invalid'
    (medical_data['encounter_ct'] < 14 )  # Exclude records with gender 'Unknown/Invalid'
]
print(medical_data.shape)

drop.extend(['mb_under20_ind', 'mb_over80_ind', 'expiration_ind'])

(101766, 245)
(78049, 245)


In [31]:

medical_data.drop(columns=drop, inplace=True)
medical_data['dummy']=1
medical_data.to_csv('p004_medical_data_MLprep.csv',index=False)
print(medical_data.shape)
print(medical_data.info())

(78049, 206)
<class 'pandas.core.frame.DataFrame'>
Index: 78049 entries, 2 to 101765
Columns: 206 entries, encounter_id to dummy
dtypes: int32(149), int64(31), object(26)
memory usage: 78.9+ MB
None


In [32]:
# Define the list of columns to drop after feature engineering is complete.
# These are the original diagnosis columns and the intermediate columns used
# for calculation and analysis.

drop.extend([
    # --- Original Diagnosis Columns (now redundant) ---
    'diag_1',
    'diag_2',
    'diag_3',

    # --- Sorted Diagnosis Description Columns (now replaced by features) ---
    'sorted_diag_1',
    'sorted_diag_2',
    'sorted_diag_3',

    # --- Intermediate Frequency Calculation Columns ---
    'diagnosis_combo_frequency',
    'sorted_diag_1_frequency',
    'sorted_diag_2_frequency',
    'sorted_diag_3_frequency',

    # --- Intermediate Combination String (optional to keep for inspection) ---
    # This column was used to create the 'is_high_risk_combo' feature.
    # You can comment it out if you want to keep it for debugging or analysis.
    'diagnosis_combo_string',
        
    'A1cresult',
    'metformin',
    'glimepiride',
    'glipizide',    
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
    'insulin',
])


# Example of how to use the list to drop the columns from your DataFrame
# medical_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# print("Dropped intermediate and redundant columns.")

In [33]:
medical_data.drop(columns=drop, inplace=True, errors='ignore')
medical_data.to_pickle("p004_medical_data_MLprep.pkl")
print(medical_data.shape)
print(medical_data.info())

(78049, 188)
<class 'pandas.core.frame.DataFrame'>
Index: 78049 entries, 2 to 101765
Data columns (total 188 columns):
 #    Column                    Non-Null Count  Dtype 
---   ------                    --------------  ----- 
 0    encounter_id              78049 non-null  int64 
 1    patient_nbr               78049 non-null  int64 
 2    gender                    78049 non-null  object
 3    age                       78049 non-null  object
 4    time_in_hospital          78049 non-null  int64 
 5    num_lab_procedures        78049 non-null  int64 
 6    num_procedures            78049 non-null  int64 
 7    num_medications           78049 non-null  int64 
 8    number_outpatient         78049 non-null  int64 
 9    number_emergency          78049 non-null  int64 
 10   number_inpatient          78049 non-null  int64 
 11   number_diagnoses          78049 non-null  int64 
 12   A1Cresult                 13734 non-null  object
 13   change                    78049 non-null  object
 

In [34]:
for _ in medical_data.columns:
    print("column names :", _)

column names : encounter_id
column names : patient_nbr
column names : gender
column names : age
column names : time_in_hospital
column names : num_lab_procedures
column names : num_procedures
column names : num_medications
column names : number_outpatient
column names : number_emergency
column names : number_inpatient
column names : number_diagnoses
column names : A1Cresult
column names : change
column names : diabetesMed
column names : readmitted
column names : admission_type
column names : discharge_disposition
column names : admission_source
column names : mb_admission_grp_1_ct
column names : mb_admission_grp_2_ct
column names : mb_discharge_grp_1_ct
column names : mb_discharge_grp_2_ct
column names : mb_admission_type_ct
column names : distinct_diag_count
column names : diagnosis_1
column names : diagnosis_2
column names : diagnosis_3
column names : diag_1_freq
column names : diag_2_freq
column names : diag_3_freq
column names : LTIS_38_ind
column names : LTIS_40_ind
column names :

In [35]:
import pandas as pd

# Set display options to show all columns and rows without limits
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Now, when you print the head of your DataFrame (or the whole thing),
# all columns will be visible.
print(medical_data.head())

   encounter_id  patient_nbr  gender      age  time_in_hospital  \
2         64410     86047875  Female  [20-30)                 2   
3        500364     82442376    Male  [30-40)                 2   
4         16680     42519267    Male  [40-50)                 1   
5         35754     82637451    Male  [50-60)                 3   
6         55842     84259809    Male  [60-70)                 4   

   num_lab_procedures  num_procedures  num_medications  number_outpatient  \
2                  11               5               13                  2   
3                  44               1               16                  0   
4                  51               0                8                  0   
5                  31               6               16                  0   
6                  70               1               21                  0   

   number_emergency  number_inpatient  number_diagnoses A1Cresult change  \
2                 0                 1                 6   

In [36]:
#show the data types of the columns
print(medical_data.dtypes)



encounter_id                 int64
patient_nbr                  int64
gender                      object
age                         object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
A1Cresult                   object
change                      object
diabetesMed                 object
readmitted                  object
admission_type              object
discharge_disposition       object
admission_source            object
mb_admission_grp_1_ct        int32
mb_admission_grp_2_ct        int32
mb_discharge_grp_1_ct        int32
mb_discharge_grp_2_ct        int32
mb_admission_type_ct         int32
distinct_diag_count          int64
diagnosis_1                 object
diagnosis_2                 object
diagnosis_3                 object
diag_1_freq         

In [37]:
# how many mb_readmitted_lt30_ct > 1
print("Number of patients with more than one readmission within 30 days:")
print(medical_data[medical_data['mb_readmitted_lt30_ct'] > 1].shape[0])
print("Number of patients with more than one readmission within 30 days (as percentage):")
print(medical_data[medical_data['mb_readmitted_lt30_ct'] > 1].shape[0] / medical_data.shape[0] * 100)
# how many mb_readmitted_gt30_ct > 1
print("Number of patients with more than one readmission after 30 days:")
print(medical_data[medical_data['mb_readmitted_gt30_ct'] > 1].shape[0])
print("Number of patients with more than one readmission after 30 days (as percentage):")
print(medical_data[medical_data['mb_readmitted_gt30_ct'] > 1].shape[0] / medical_data.shape[0] * 100)

Number of patients with more than one readmission within 30 days:
5783
Number of patients with more than one readmission within 30 days (as percentage):
7.409447910927751
Number of patients with more than one readmission after 30 days:
18186
Number of patients with more than one readmission after 30 days (as percentage):
23.300746966649154


In [38]:
#here we deal with some of the object columns


#print just the object columns
print(medical_data.select_dtypes(include=['object']).columns)

Index(['gender', 'age', 'A1Cresult', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'discharge_disposition', 'admission_source',
       'diagnosis_1', 'diagnosis_2', 'diagnosis_3'],
      dtype='object')


In [39]:
# show the number of null values in each object column
print(medical_data.select_dtypes(include=['object']).isnull().sum())

gender                       0
age                          0
A1Cresult                64315
change                       0
diabetesMed                  0
readmitted                   0
admission_type               0
discharge_disposition        0
admission_source             0
diagnosis_1                  0
diagnosis_2                  0
diagnosis_3                  0
dtype: int64


In [40]:
#drop A1Cresult column - way too many nulls
medical_data.drop(columns=['A1cresult'], inplace=True, errors='ignore')


In [41]:
# convert the remaining object columns using pd.get_dummies
medical_data = pd.get_dummies(medical_data, drop_first=True, dtype=int)
                            

In [42]:
#for indicator, we are using mb_readmitted_lt30_ct and we need to convert it to a binary indicator
medical_data['readmitted_lt30_ind'] = (medical_data['mb_readmitted_lt30_ct'] > 0).astype(int)
medical_data['readmitted_gt30_ind'] = (medical_data['mb_readmitted_gt30_ct'] > 0).astype(int)
medical_data['readmitted_no_ind'] = (medical_data['mb_readmitted_no_ct'] > 0).astype(int)   

In [50]:
# check the balance of the readmission indicators
print("Balance of readmission indicators:") 
print(medical_data['readmitted_lt30_ind'].value_counts(normalize=False))
print(medical_data['readmitted_gt30_ind'].value_counts(normalize=False))
print(medical_data['readmitted_no_ind'].value_counts(normalize=False))


Balance of readmission indicators:
readmitted_lt30_ind
0    59722
1    18327
Name: count, dtype: int64
readmitted_gt30_ind
0    39308
1    38741
Name: count, dtype: int64
readmitted_no_ind
1    57843
0    20206
Name: count, dtype: int64


In [46]:
# create a logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Define the target variable and features
target = medical_data['readmitted_lt30_ind'] 
# Define the features to be used in the model
excluded_columns = [ 'patient_nbr', 'encounter_id', 'dummy','mb_readmitted_lt30_ct',
                                             'mb_readmitted_gt30_ct', 'mb_readmitted_no_ct','readmitted',
                                             'readmitted_gt30_ind', 'readmitted_no_ind','readmitted_lt30_ind']
features = [col for col in medical_data.columns if col not in excluded_columns]
# Split the data into training and testing sets
X = medical_data[features]
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)  
model.fit(X_train, y_train)
# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")
# create a confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
# Generate predictions
y_pred = model.predict(X_test)
# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print confusion matrix
print("Confusion Matrix:")
print(cm)
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model accuracy: 0.85
Confusion Matrix:
[[11317   630]
 [ 1742  1921]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     11947
           1       0.75      0.52      0.62      3663

    accuracy                           0.85     15610
   macro avg       0.81      0.74      0.76     15610
weighted avg       0.84      0.85      0.84     15610



In [48]:

#do another logistic regression but use validation data
from sklearn.model_selection import train_test_split
# use the same features and target variable from before
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Create and fit the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
# Evaluate the model on the validation set
val_accuracy = model.score(X_val, y_val)
print(f"Validation Model accuracy: {val_accuracy:.2f}")
# Generate predictions on the validation set
y_val_pred = model.predict(X_val)
# Create confusion matrix for validation set
from sklearn.metrics import confusion_matrix, classification_report
val_cm = confusion_matrix(y_val, y_val_pred)
# Print confusion matrix for validation set
print("Validation Confusion Matrix:")
print(val_cm)
# Print classification report for validation set
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
# Evaluate the model on the test set
test_accuracy = model.score(X_test, y_test)
print(f"Test Model accuracy: {test_accuracy:.2f}")
# Generate predictions on the test set
y_test_pred = model.predict(X_test)
# Create confusion matrix for test set
test_cm = confusion_matrix(y_test, y_test_pred)
# Print confusion matrix for test set
print("Test Confusion Matrix:")
print(test_cm)
# Print classification report for test set
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Model accuracy: 0.85
Validation Confusion Matrix:
[[8518  429]
 [1353 1407]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.91      8947
           1       0.77      0.51      0.61      2760

    accuracy                           0.85     11707
   macro avg       0.81      0.73      0.76     11707
weighted avg       0.84      0.85      0.84     11707

Test Model accuracy: 0.84
Test Confusion Matrix:
[[8487  472]
 [1346 1403]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      8959
           1       0.75      0.51      0.61      2749

    accuracy                           0.84     11708
   macro avg       0.81      0.73      0.76     11708
weighted avg       0.84      0.84      0.83     11708



In [51]:
# set up an autoencoder to reduce the dimensionality of the data
from sklearn.decomposition import PCA
# Initialize PCA with the number of components you want to keep
pca = PCA(n_components=0.95)  # Keep 95% of the variance
# Fit PCA on the training data and transform it
X_train_pca = pca.fit_transform(X_train)
# Transform the validation and test data using the same PCA model
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)
# Create and fit the logistic regression model using the PCA-transformed data
model_pca = LogisticRegression(max_iter=1000, random_state=42)
model_pca.fit(X_train_pca, y_train)
# Evaluate the model on the validation set
val_accuracy_pca = model_pca.score(X_val_pca, y_val)
print(f"Validation Model accuracy with PCA: {val_accuracy_pca:.2f}")
# Generate predictions on the validation set
y_val_pred_pca = model_pca.predict(X_val_pca)
# Create confusion matrix for validation set with PCA
val_cm_pca = confusion_matrix(y_val, y_val_pred_pca)
# Print confusion matrix for validation set with PCA
print("Validation Confusion Matrix with PCA:")
print(val_cm_pca)
# Print classification report for validation set with PCA
print("Validation Classification Report with PCA:")
print(classification_report(y_val, y_val_pred_pca))
# Evaluate the model on the test set
test_accuracy_pca = model_pca.score(X_test_pca, y_test)
print(f"Test Model accuracy with PCA: {test_accuracy_pca:.2f}")
# Generate predictions on the test set
y_test_pred_pca = model_pca.predict(X_test_pca)
# Create confusion matrix for test set with PCA
test_cm_pca = confusion_matrix(y_test, y_test_pred_pca)
# Print confusion matrix for test set with PCA
print("Test Confusion Matrix with PCA:")
print(test_cm_pca)
# Print classification report for test set with PCA
print("Test Classification Report with PCA:")
print(classification_report(y_test, y_test_pred_pca))


Validation Model accuracy with PCA: 0.76
Validation Confusion Matrix with PCA:
[[8947    0]
 [2760    0]]
Validation Classification Report with PCA:
              precision    recall  f1-score   support

           0       0.76      1.00      0.87      8947
           1       0.00      0.00      0.00      2760

    accuracy                           0.76     11707
   macro avg       0.38      0.50      0.43     11707
weighted avg       0.58      0.76      0.66     11707

Test Model accuracy with PCA: 0.77
Test Confusion Matrix with PCA:
[[8959    0]
 [2749    0]]
Test Classification Report with PCA:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      8959
           1       0.00      0.00      0.00      2749

    accuracy                           0.77     11708
   macro avg       0.38      0.50      0.43     11708
weighted avg       0.59      0.77      0.66     11708



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
# set up an autoencoder using keras to reduce the dimensionality of the data
from keras.models import Model
from keras.layers import Input, Dense
# Define the size of the input layer
input_size = X_train.shape[1]
# Define the size of the encoding layer
encoding_size = 50  # You can adjust this size based on your needs
# Define the input layer
input_layer = Input(shape=(input_size,))
# Define the encoding layer
encoded = Dense(encoding_size, activation='relu')(input_layer)
# Define the decoding layer
decoded = Dense(input_size, activation='sigmoid')(encoded)
# Create the autoencoder model  
autoencoder = Model(inputs=input_layer, outputs=decoded)
# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# Fit the autoencoder model on the training data
autoencoder.fit(X_train, X_train, epochs=50, batch_size=256, shuffle=True, validation_data=(X_val, X_val), verbose=1)
# Use the encoder part of the autoencoder to transform the data
encoder = Model(inputs=input_layer, outputs=encoded)
# Transform the training, validation, and test data using the encoder
X_train_encoded = encoder.predict(X_train)
X_val_encoded = encoder.predict(X_val)
X_test_encoded = encoder.predict(X_test)
# Create and fit the logistic regression model using the encoded data
model_autoencoder = LogisticRegression(max_iter=1000, random_state=42)
model_autoencoder.fit(X_train_encoded, y_train)
# Evaluate the model on the validation set
val_accuracy_autoencoder = model_autoencoder.score(X_val_encoded, y_val)
print(f"Validation Model accuracy with Autoencoder: {val_accuracy_autoencoder:.2f}")
# Generate predictions on the validation set
y_val_pred_autoencoder = model_autoencoder.predict(X_val_encoded)
# Create confusion matrix for validation set with Autoencoder
val_cm_autoencoder = confusion_matrix(y_val, y_val_pred_autoencoder)
# Print confusion matrix for validation set with Autoencoder
print("Validation Confusion Matrix with Autoencoder:")
print(val_cm_autoencoder)
# Print classification report for validation set with Autoencoder
print("Validation Classification Report with Autoencoder:")
print(classification_report(y_val, y_val_pred_autoencoder))
# Evaluate the model on the test set
test_accuracy_autoencoder = model_autoencoder.score(X_test_encoded, y_test)
print(f"Test Model accuracy with Autoencoder: {test_accuracy_autoencoder:.2f}")
# Generate predictions on the test set
y_test_pred_autoencoder = model_autoencoder.predict(X_test_encoded)
# Create confusion matrix for test set with Autoencoder
test_cm_autoencoder = confusion_matrix(y_test, y_test_pred_autoencoder)
# Print confusion matrix for test set with Autoencoder
print("Test Confusion Matrix with Autoencoder:")
print(test_cm_autoencoder)
# Print classification report for test set with Autoencoder
print("Test Classification Report with Autoencoder:")
print(classification_report(y_test, y_test_pred_autoencoder))


Epoch 1/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - loss: 49870.7383 - val_loss: 49837.3477
Epoch 2/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 49712.6562 - val_loss: 49837.3477
Epoch 3/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 50002.7695 - val_loss: 49837.3477
Epoch 4/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 49807.8633 - val_loss: 49837.3477
Epoch 5/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 49496.5664 - val_loss: 49837.3477
Epoch 6/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 49863.0781 - val_loss: 49837.3477
Epoch 7/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 49653.3945 - val_loss: 49837.3477
Epoch 8/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 4963

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
