In [90]:
import pandas as pd 
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [110]:
# Load the cleaned dataset and discard the 'Unnamed: 0' column in one step
# (presumably the row index written by an earlier to_csv call -- confirm).
df = pd.read_csv("../data/clean/cleaned.csv").drop(columns=['Unnamed: 0'])

In [111]:
# Split the frame by dtype. Despite the names, both variables hold DataFrames
# (sub-frames of df), not lists of column labels.
categorical_columns = df.select_dtypes(include='object')
numerical_columns = df.select_dtypes(include='number')

In [112]:
# Copy the patient identifier, demographic fields and the readmission column
# into their own frame and write it to disk.
columns_to_select = ['patient_nbr', 'race', 'age', 'gender', 'readmitted']
demographic_df = df.loc[:, columns_to_select].copy()
# index=True is the to_csv default; it is spelled out because the row index
# ends up as the first (unnamed) column of the CSV.
demographic_df.to_csv("../data/clean/demographic.csv", index=True)

In [113]:
def automate_one_hot_encoding(df):
    """One-hot encode every object-dtype column of ``df``.

    Each column selected by ``df.select_dtypes('object')`` is fitted with its
    own ``OneHotEncoder`` and replaced by one indicator column per category,
    named ``<column>_<category>``. Columns of other dtypes are kept unchanged
    and come first in the result; the indicator columns follow, grouped by
    source column in the order those columns appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``df``.
    """
    object_columns = df.select_dtypes('object').columns

    # Nothing to encode: return a copy so the result never aliases the input.
    if len(object_columns) == 0:
        return df.copy()

    encoded_frames = []
    for col in object_columns:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        transformed_data = encoder.fit_transform(df[[col]])

        # One output column per category learned for this single input column.
        new_columns = [f"{col}_{category}" for category in encoder.categories_[0]]

        # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex here would misalign rows (and pad with NaN) whenever df
        # does not have the default 0..n-1 index.
        encoded_frames.append(
            pd.DataFrame(transformed_data, columns=new_columns, index=df.index)
        )

    # Concatenate once instead of re-copying the growing frame on every pass.
    return pd.concat([df.drop(columns=object_columns), *encoded_frames], axis=1)

In [114]:
#df_encoded = automate_one_hot_encoding(df)
#df_encoded.to_csv(path_or_buf="../data/clean/df_encoded.csv")

In [115]:
# The fully encoded data frame is saved to disk and this notebook is left as it is,
# since one-hot encoding takes a long time
# due to the nature of the diag_1, diag_2 and diag_3 columns.

In [116]:
# Work on a copy of the data without the three diagnosis-code columns.
df2 = df.drop(columns=['diag_1', 'diag_2', 'diag_3'])

In [117]:
# Show the distinct values stored in the metformin column.
df2.loc[:, 'metformin'].unique()

array(['no', 'steady', 'up', 'down'], dtype=object)

In [118]:
# One-hot encode every object-dtype column of the reduced frame.
df_encoded2 = automate_one_hot_encoding(df=df2)

In [119]:
# Every indicator column ending in '_no' except 'readmitted_no', followed by
# the patient identifier and three further indicator columns listed by name.
columns_to_drop = [
    name
    for name in df_encoded2.columns
    if name.endswith('_no') and name != 'readmitted_no'
] + ['patient_nbr', 'race_nan', 'weight_nan', 'gender_unknown/invalid']

In [120]:
# Remove the identified columns, rebinding the name rather than mutating in place.
df_encoded2 = df_encoded2.drop(columns=columns_to_drop)

In [121]:
# Persist the encoded frame. index=True is the to_csv default; it is spelled
# out because the row index is written as the first (unnamed) CSV column.
df_encoded2.to_csv("../data/clean/df_encoded2.csv", index=True)

In [122]:
# Print every column label of df_encoded2, one per line.
# Note: `col` stays bound to the last label in the notebook namespace afterwards.
for col in df_encoded2.columns:
    print(col)

time_in_hospital
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
number_diagnoses
race_africanamerican
race_asian
race_caucasian
race_hispanic
race_other
gender_female
gender_male
age_[0-10)
age_[10-20)
age_[20-30)
age_[30-40)
age_[40-50)
age_[50-60)
age_[60-70)
age_[70-80)
age_[80-90)
age_[90-100)
weight_>200
weight_[0-25)
weight_[100-125)
weight_[125-150)
weight_[150-175)
weight_[175-200)
weight_[25-50)
weight_[50-75)
weight_[75-100)
metformin_down
metformin_steady
metformin_up
repaglinide_down
repaglinide_steady
repaglinide_up
nateglinide_down
nateglinide_steady
nateglinide_up
chlorpropamide_down
chlorpropamide_steady
chlorpropamide_up
glimepiride_down
glimepiride_steady
glimepiride_up
acetohexamide_steady
glipizide_down
glipizide_steady
glipizide_up
glyburide_down
glyburide_steady
glyburide_up
tolbutamide_steady
pioglitazone_down
pioglitazone_steady
pioglitazone_up
rosiglitazone_down
rosiglitazone_steady
rosiglitazone_up
acarbos