# Análise e Tratamento dos Dados

## Importando libs

In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
import os

## Leitura do dataset

In [46]:
# Path to the raw CSV, relative to the notebook's working directory.
caminho_csv = '../data/csv/heart.csv'

# Read the file as a comma-separated, UTF-8 encoded table.
# If decoding fails, switch the encoding to 'iso-8859-1'.
df = pd.read_csv(caminho_csv, sep=',', encoding='utf-8')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Visão geral dos dados

In [47]:
# Quick structural overview of the raw frame: shape, dtypes and the
# per-column count of missing values.
print("Formato do dataset (linhas, colunas):", df.shape)

# `sep="\n"` prints each heading on its own line, directly above the Series it labels.
print("\nTipos de dados:", df.dtypes, sep="\n")
print("\nValores faltantes (NaN) por coluna:", df.isna().sum(), sep="\n")

Formato do dataset (linhas, colunas): (918, 12)

Tipos de dados:
Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

Valores faltantes (NaN) por coluna:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


## Estatísticas descritivas básicas

In [48]:
# Summary statistics for every column; `include='all'` asks pandas to cover
# the object-typed columns as well as the numeric ones.
resumo_descritivo = df.describe(include='all')
display(resumo_descritivo)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


## Medidas de posição para idade e pressão em repouso

In [49]:
# The two columns whose position measures are reported below.
df_idade, df_pressao = df["Age"], df["RestingBP"]

# Q1, median and Q3 of each column, stored as lists indexed 0..2.
quartis_idade, quartis_pressao = (
    list(serie.quantile([0.25, 0.5, 0.75])) for serie in (df_idade, df_pressao)
)

# Report for age: heading followed by the three quartiles, one per line.
print(
    "MEDIDAS DE POSIÇÃO PARA IDADE",
    f"Q1: {quartis_idade[0]:.1f}",
    f"Q2 (mediana): {quartis_idade[1]:.1f}",
    f"Q3: {quartis_idade[2]:.1f}",
    sep="\n",
)
print()  # blank line separating the two reports

# Report for resting blood pressure, same layout.
print(
    "MEDIDAS DE POSIÇÃO PARA PRESSÃO EM REPOUSO",
    f"Q1: {quartis_pressao[0]:.1f}",
    f"Q2 (mediana): {quartis_pressao[1]:.1f}",
    f"Q3: {quartis_pressao[2]:.1f}",
    sep="\n",
)

MEDIDAS DE POSIÇÃO PARA IDADE
Q1: 47.0
Q2 (mediana): 54.0
Q3: 60.0

MEDIDAS DE POSIÇÃO PARA PRESSÃO EM REPOUSO
Q1: 120.0
Q2 (mediana): 130.0
Q3: 140.0


## Verificando moda, média e mediana da idade

In [50]:
# Central-tendency measures of the Age column.
# `mode()` returns a Series (there may be ties); the entry labelled 0 is used.
moda_idade = df["Age"].mode()[0]
media_idade, mediana_idade = df["Age"].mean(), df["Age"].median()

# Single summary line; each value is formatted with no decimal places.
print(
    f"Moda: {moda_idade:.0f} | ",
    f"Média: {media_idade:.0f} | ",
    f"Mediana: {mediana_idade:.0f}",
    sep="",
)

Moda: 54 | Média: 54 | Mediana: 54


## Tratando as variáveis

### Pressão em repouso

#### Igual a zero

In [51]:
# Count the rows whose resting blood pressure was recorded as 0.
qtd_zero_pressao = df["RestingBP"].eq(0).sum()
print("Quantidade de linhas com RestingBP = 0:", qtd_zero_pressao)

# Treated copy without those rows; the original `df` is left untouched.
df_tratado = df[df["RestingBP"].ne(0)].copy()

print("Formato após remover RestingBP = 0:", df_tratado.shape)

# Cell output: summary of the column after the removal.
df_tratado["RestingBP"].describe()

Quantidade de linhas com RestingBP = 0: 1
Formato após remover RestingBP = 0: (917, 12)


count    917.000000
mean     132.540894
std       17.999749
min       80.000000
25%      120.000000
50%      130.000000
75%      140.000000
max      200.000000
Name: RestingBP, dtype: float64

### Colesterol

#### Igual a zero

In [52]:
# Show the ten lowest cholesterol values and how many rows carry each one.
print("Valores únicos de colesterol antes do tratamento:")
print(df_tratado["Cholesterol"].value_counts().sort_index().head(10))

# Number of rows whose cholesterol was recorded as 0.
qtd_zero_col = df_tratado["Cholesterol"].eq(0).sum()
print("\nQuantidade de linhas com Cholesterol = 0:", qtd_zero_col)

# New frame in which every zero cholesterol reading is turned into NaN,
# so it is counted as missing; `df_tratado` itself is not modified.
df_tratado2 = df_tratado.assign(
    Cholesterol=df_tratado["Cholesterol"].replace(0, np.nan)
)

print("\nValores missing por coluna após transformar 0 em NaN no colesterol:")
print(df_tratado2.isna().sum())

Valores únicos de colesterol antes do tratamento:
Cholesterol
0      171
85       1
100      2
110      1
113      1
117      1
123      1
126      2
129      1
131      1
Name: count, dtype: int64

Quantidade de linhas com Cholesterol = 0: 171

Valores missing por coluna após transformar 0 em NaN no colesterol:
Age                 0
Sex                 0
ChestPainType       0
RestingBP           0
Cholesterol       171
FastingBS           0
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak             0
ST_Slope            0
HeartDisease        0
dtype: int64


In [53]:
# Mean of the cholesterol values that are present (the former zeros are NaN
# at this point and are skipped by `mean()`).
media_colesterol = df_tratado2["Cholesterol"].mean()
print("Média do colesterol (sem zeros incoerentes):", media_colesterol)

# Keep every present value and put the mean wherever the value is missing.
df_tratado2["Cholesterol"] = df_tratado2["Cholesterol"].where(
    df_tratado2["Cholesterol"].notna(),
    media_colesterol,
)

# Confirm that no missing values remain.
print("\nValores missing após preenchimento:")
print(df_tratado2.isna().sum())

# Cell output: numeric summary of the treated frame.
df_tratado2.describe()

Média do colesterol (sem zeros incoerentes): 244.6353887399464

Valores missing após preenchimento:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,132.540894,244.635389,0.23337,136.789531,0.886696,0.55289
std,9.437636,17.999749,53.347125,0.423206,25.467129,1.06696,0.497466
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,214.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,244.635389,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [54]:
# Histogram plus density (KDE) curve of Cholesterol, drawn with Plotly.
# plotly.figure_factory.create_distplot renders the histogram and the
# distribution line in a single figure.
group_labels = ['Cholesterol']
hist_data = [df_tratado2['Cholesterol'].dropna().tolist()]

fig_chol = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=20,
    colors=['orange'],
    show_hist=True,
    show_rug=False,
)
fig_chol.update_layout(
    title='Distribuição do Colesterol após tratamento',
    width=800,
    height=400,
)
fig_chol.show()

## Visão geral rápida das variáveis principais

In [55]:
# Columns shown in the quick overview, in display order.
colunas_principais = [
    "Age",
    "Sex",
    "ChestPainType",
    "RestingBP",
    "Cholesterol",
    "FastingBS",
    "RestingECG",
    "MaxHR",
    "ExerciseAngina",
    "Oldpeak",
    "ST_Slope",
    "HeartDisease",
]

# Cell output: first rows of the treated frame restricted to those columns.
df_tratado2.loc[:, colunas_principais].head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [56]:
def _mostrar_histograma(dados, coluna, nbins, titulo):
    """Build, size and display a Plotly histogram of one column.

    Args:
        dados: DataFrame that holds the column to plot.
        coluna: Name of the column placed on the x axis.
        nbins: Value forwarded to ``px.histogram`` as ``nbins``.
        titulo: Figure title.

    Returns:
        The Plotly figure, after it has been laid out at 600x350 and shown.
    """
    fig = px.histogram(dados, x=coluna, nbins=nbins, title=titulo)
    fig.update_layout(width=600, height=350)
    fig.show()
    return fig


# The three histograms differ only in column, bin count and title, so they
# share one helper; the figure names are kept for any later use.
fig_age = _mostrar_histograma(df_tratado2, "Age", 60, "Distribuição por idade")
fig_restingbp = _mostrar_histograma(
    df_tratado2, "RestingBP", 40, "Distribuição da pressão em repouso"
)
fig_maxhr = _mostrar_histograma(
    df_tratado2, "MaxHR", 40, "Distribuição da frequência cardíaca máxima"
)


In [57]:
# Interactive Plotly histograms used in place of seaborn countplots.
# `update_layout` returns the figure itself, so it is chained onto construction.
fig_sex = px.histogram(
    df_tratado2,
    x="Sex",
    color="Sex",
    color_discrete_map={"M": "blue", "F": "pink"},
    title="Distribuição por sexo",
).update_layout(width=600, height=350)
fig_sex.show()

fig_chest = px.histogram(
    df_tratado2,
    x="ChestPainType",
    title="Tipos de dor no peito",
).update_layout(width=600, height=350)
fig_chest.show()

fig_restingecg = px.histogram(
    df_tratado2,
    x="RestingECG",
    title="Distribuição do eletrocardiograma em repouso",
).update_layout(width=600, height=350)
fig_restingecg.show()


In [58]:
# Pie charts with the share of each category in four categorical columns.
# NOTE: `fig_sex` is also the name of the sex histogram built in an earlier
# cell; the assignment below rebinds it to the pie chart.
fig_sex = px.pie(
    df_tratado2,
    names="Sex",
    title="Proporção por sexo",
)
fig_sex.show()

fig_ex_ang = px.pie(
    df_tratado2,
    names="ExerciseAngina",
    title="ExerciseAngina",
)
fig_ex_ang.show()

fig_st_slope = px.pie(
    df_tratado2,
    names="ST_Slope",
    title="Inclinação do segmento ST",
)
fig_st_slope.show()

fig_hd = px.pie(
    df_tratado2,
    names="HeartDisease",
    title="Presença de doença cardíaca",
)
fig_hd.show()


In [59]:
# Box plots of four numeric columns of the treated frame, one figure each.
fig_box_age = px.box(df_tratado2, y="Age", title="Boxplot Idade")
fig_box_age.show()

fig_box_pressao = px.box(df_tratado2, y="RestingBP", title="Boxplot Pressão em repouso")
fig_box_pressao.show()

# Title spelling corrected: "tratado" (it previously read "tratato").
fig_box_col = px.box(df_tratado2, y="Cholesterol", title="Boxplot Colesterol tratado")
fig_box_col.show()

fig_box_maxhr = px.box(df_tratado2, y="MaxHR", title="Boxplot Frequência cardíaca máxima")
fig_box_maxhr.show()


In [60]:
# Output folder and file for the treated dataset.
caminho_pasta_processed = "../data/csv/processed"
caminho_saida = os.path.join(caminho_pasta_processed, "heart_tratado.csv")

# Create the folder if needed; an already existing folder is not an error.
os.makedirs(caminho_pasta_processed, exist_ok=True)

# Write the treated frame without the index column.
df_tratado2.to_csv(
    caminho_saida,
    index=False,
    encoding="utf-8",
)

print(f"Arquivo salvo em: {caminho_saida}")

Arquivo salvo em: ../data/csv/processed\heart_tratado.csv
