In [19]:
import pandas as pd

# Load the bronze-layer parquet file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable project/data directory so the notebook runs elsewhere.
parquet_path = r"D:\Geral\Projetos\hospital-readmission-data-engineering\data\bronze\hospital_readmissions.parquet"

df_bronze = pd.read_parquet(path=parquet_path)


In [20]:
# Count fully duplicated rows (every column equal to an earlier row).
print(
    "Duplicados:",
    df_bronze.duplicated(keep="first").sum(),
)

Duplicados: 0


In [21]:
# Remove duplicated rows, if any, keeping the first occurrence of each.
df_bronze = df_bronze.drop_duplicates(subset=None, keep="first")

In [22]:
# Inspect non-null counts and dtypes (looking for NaN values and columns still
# stored as object), then preview the first five rows.
df_bronze.info()
df_bronze.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   patient_id             30000 non-null  int64  
 1   age                    30000 non-null  int64  
 2   gender                 30000 non-null  object 
 3   blood_pressure         30000 non-null  object 
 4   cholesterol            30000 non-null  int64  
 5   bmi                    30000 non-null  float64
 6   diabetes               30000 non-null  object 
 7   hypertension           30000 non-null  object 
 8   medication_count       30000 non-null  int64  
 9   length_of_stay         30000 non-null  int64  
 10  discharge_destination  30000 non-null  object 
 11  readmitted_30_days     30000 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 2.7+ MB


Unnamed: 0,patient_id,age,gender,blood_pressure,cholesterol,bmi,diabetes,hypertension,medication_count,length_of_stay,discharge_destination,readmitted_30_days
0,1,74,Other,130/72,240,31.5,Yes,No,5,1,Nursing_Facility,Yes
1,2,46,Female,120/92,292,36.3,No,No,4,3,Nursing_Facility,No
2,3,89,Other,135/78,153,30.3,No,Yes,1,1,Home,No
3,4,84,Female,123/80,153,31.5,No,Yes,3,10,Home,No
4,5,32,Other,135/84,205,18.4,No,Yes,6,4,Nursing_Facility,No


In [None]:
# Data transformation: normalise column types on the bronze frame.
#
# Integer columns are cast to an explicit 64-bit dtype. A bare `int` resolves to
# the platform default integer, which is 32-bit on Windows with NumPy < 2, so
# the resulting schema would otherwise depend on the machine running the cell.
df_bronze = df_bronze.astype(
    {
        "patient_id": "int64",
        "age": "int64",
        "gender": str,
        "cholesterol": "int64",
        "medication_count": "int64",
        "length_of_stay": "int64",
        "discharge_destination": str,
    }
)

# blood_pressure holds "systolic/diastolic" text: split it once and reuse both
# halves instead of splitting the column separately for each new column.
pressure_parts = df_bronze["blood_pressure"].str.split("/", expand=True)
df_bronze["systolic_pressure"] = pressure_parts[0].astype("int64")
df_bronze["diastolic_pressure"] = pressure_parts[1].astype("int64")
# Drop blood_pressure now that both components have been extracted.
# NOTE: this makes the cell non-idempotent; re-running it requires reloading
# df_bronze first, because blood_pressure no longer exists afterwards.
df_bronze = df_bronze.drop(columns=["blood_pressure"])

# bmi needs no conversion here: it is already float64 in the info() output.

# Yes/No flags become nullable booleans. Any value other than the exact strings
# "Yes" / "No" is not in the mapping and therefore ends up as <NA>.
yes_no_to_bool = {"Yes": True, "No": False}
for flag_column in ("diabetes", "hypertension", "readmitted_30_days"):
    df_bronze[flag_column] = df_bronze[flag_column].map(yes_no_to_bool).astype("boolean")

In [24]:
# Validation: keep only rows whose values fall inside plausible ranges.
#
# Each comparison is wrapped in its own parentheses because `&` binds tighter
# than `>=` / `<=`. Written as `(age) >= 0 & (age <= 120)` the expression is
# parsed as `age >= (0 & (age <= 120))`, which never enforces the upper bound.
df_bronze = df_bronze[
    # Age between 0 and 120, inclusive.
    (df_bronze["age"] >= 0)
    & (df_bronze["age"] <= 120)
    # Systolic pressure between 70 and 250, inclusive.
    & (df_bronze["systolic_pressure"] >= 70)
    & (df_bronze["systolic_pressure"] <= 250)
    # Length of stay must not be negative.
    & (df_bronze["length_of_stay"] >= 0)
].copy()  # independent frame, so later column assignments do not act on a slice

In [25]:
# Standardise text columns: trim surrounding whitespace, then upper-case gender
# and title-case discharge_destination.
df_bronze = df_bronze.assign(
    gender=df_bronze["gender"].str.strip().str.upper(),
    discharge_destination=df_bronze["discharge_destination"].str.strip().str.title(),
)

In [None]:
# Age brackets: bucket age into labelled groups.
# Bins are right-closed: [0, 18], (18, 40], (40, 60], (60, 75], (75, 120].
# include_lowest=True closes the first bin on the left so that age 0, which the
# validation step accepts (age >= 0), is labelled "child" instead of NaN.
# NOTE(review): the column name "agre_group" looks like a typo for "age_group";
# it is kept unchanged here in case other code already reads it - confirm and rename.
df_bronze["agre_group"] = pd.cut(
    df_bronze["age"],
    bins=[0, 18, 40, 60, 75, 120],
    labels=["child", "young_adult", "adult", "senior", "very_senior"],
    include_lowest=True,
)