In [20]:
import pandas as pd

In [21]:
import os
import kagglehub

# Kaggle dataset identifier and the CSV file expected inside the download.
DATASET_HANDLE = "uom190346a/sleep-health-and-lifestyle-dataset"
CSV_NAME = "Sleep_health_and_lifestyle_dataset.csv"

# Download the dataset (the notebook output shows a cache being used when
# available), then read the CSV located under the returned path.
path = kagglehub.dataset_download(DATASET_HANDLE)
file_path = os.path.join(path, CSV_NAME)
df = pd.read_csv(file_path)

Using Colab cache for faster access to the 'sleep-health-and-lifestyle-dataset' dataset.


## Checagem do Dataset

In [22]:
# Display the first rows of the freshly loaded dataset.
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [23]:
# Count how many records share each distinct 'Blood Pressure' string.
df['Blood Pressure'].value_counts()

Unnamed: 0_level_0,count
Blood Pressure,Unnamed: 1_level_1
130/85,99
140/95,65
125/80,65
120/80,45
115/75,32
135/90,27
140/90,4
125/82,4
128/85,3
132/87,3


In [24]:
# Print, in order: the frame's shape, the dtype of every column, and the
# number of missing values per column. Each summary is followed by a blank line.
for summary in (df.shape, df.dtypes, df.isnull().sum()):
    print(f"{summary} \n")

(374, 13) 

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
dtype: object 

Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64 



In [25]:
# Dataset format validation:
# check that every 'Blood Pressure' value has the form 'int/int'.

import re

pattern = r'^\d+/\d+$'

# Cast to str first so missing values become text and are reported as
# non-matching instead of propagating NaN into the boolean mask.
bp_as_text = df['Blood Pressure'].astype(str)
bp_matches_pattern = bp_as_text.str.match(pattern)
invalid_bp = df[~bp_matches_pattern]

print(f"Valores fora do padrão esperado: {len(invalid_bp)}")
invalid_bp['Blood Pressure'].value_counts().head(10)


Valores fora do padrão esperado: 0


Unnamed: 0_level_0,count
Blood Pressure,Unnamed: 1_level_1


Após validação do formato da variável Blood Pressure via regex, verificou-se que todos os registros seguem o padrão numérico ‘sistólica/diastólica’, permitindo o parsing direto sem perda de informação.

## Dataset Base para previsão





In [26]:
def base_dataset(df):
    """Build the base modelling dataset from the raw sleep-health frame.

    Steps, applied to a copy so the input frame is left untouched:

    1. Drop the ``Person ID`` identifier column (skipped if it is absent).
    2. Fill missing ``Sleep Disorder`` values with ``"No Disorder"``.
    3. Parse ``Blood Pressure`` strings of the form ``"systolic/diastolic"``.
    4. Add ``BP_Category`` (``Normal``, ``Elevated``, ``Stage1``, ``Stage2``
       or ``Crisis``) derived from the parsed readings.
    5. Add the binary target ``HighRisk`` (1 for ``Stage1``/``Stage2``/
       ``Crisis``, 0 otherwise).
    6. Drop the original ``Blood Pressure`` column.

    Note: ``HighRisk`` is computed directly from ``BP_Category``, so
    ``BP_Category`` must not be used as a predictor of ``HighRisk``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the ``Sleep Disorder`` and ``Blood Pressure``
        columns; ``Blood Pressure`` must hold strings.

    Returns
    -------
    pandas.DataFrame
        New frame with ``BP_Category`` and ``HighRisk`` appended.

    Raises
    ------
    ValueError
        If any ``Blood Pressure`` value is missing or is not two integers
        separated by ``/``.
    """
    df_base = df.copy()

    # The identifier carries no predictive information; tolerate frames
    # from which it was already removed.
    df_base = df_base.drop(columns=["Person ID"], errors="ignore")

    # Sleep Disorder: NaN means no disorder was reported.
    df_base["Sleep Disorder"] = df_base["Sleep Disorder"].fillna("No Disorder")

    # Regex extraction always yields the two named columns (even for an
    # empty frame) and marks anything that is not 'int/int' as NaN.
    readings = df_base["Blood Pressure"].str.extract(_BP_PATTERN)
    malformed = readings.isna().any(axis=1)
    if malformed.any():
        examples = df_base.loc[malformed, "Blood Pressure"].unique()[:5].tolist()
        raise ValueError(
            "Blood Pressure values must look like 'systolic/diastolic'; "
            f"got e.g. {examples}"
        )
    readings = readings.astype(int)

    # Assign by position: the categories are row-for-row with df_base.
    df_base["BP_Category"] = _bp_category(
        readings["systolic"], readings["diastolic"]
    ).to_numpy()

    # Explicit binary target.
    df_base["HighRisk"] = df_base["BP_Category"].isin(_HIGH_RISK_CATEGORIES).astype(int)

    # The raw string is no longer needed once the category is derived.
    df_base = df_base.drop(columns=["Blood Pressure"])

    return df_base


# Two integer readings separated by '/', with optional surrounding whitespace.
_BP_PATTERN = r"^\s*(?P<systolic>\d+)\s*/\s*(?P<diastolic>\d+)\s*$"

# Categories that count as positive for the HighRisk target.
_HIGH_RISK_CATEGORIES = ["Stage1", "Stage2", "Crisis"]


def _bp_category(systolic, diastolic):
    """Return the blood-pressure category for each systolic/diastolic pair."""
    category = pd.Series("Normal", index=systolic.index, dtype=object)
    # Rules run from least to most severe so the most severe match wins.
    # 'Elevated' needs no diastolic test: any row with diastolic >= 80 is
    # overwritten by the 'Stage1' rule (or a more severe one) right after.
    rules = [
        ("Elevated", systolic >= 120),
        ("Stage1", (systolic >= 130) | (diastolic >= 80)),
        ("Stage2", (systolic >= 140) | (diastolic >= 90)),
        ("Crisis", (systolic >= 180) | (diastolic >= 120)),
    ]
    for label, applies in rules:
        category = category.mask(applies, label)
    return category


In [27]:
# Build the base dataset from the raw frame, then show how many records
# fall into each blood-pressure category.
df_clean = base_dataset(df)
df_clean["BP_Category"].value_counts()

Unnamed: 0_level_0,count
BP_Category,Unnamed: 1_level_1
Stage1,232
Stage2,100
Normal,41
Elevated,1


In [29]:
# Class balance check: proportion of records per HighRisk label.

df_clean["HighRisk"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
HighRisk,Unnamed: 1_level_1
1,0.887701
0,0.112299


## Dataset Features Adicionais

In [None]:
def features_dataset(df_clean):
    """Return a copy of ``df_clean`` with two interaction features appended.

    Added columns:

    * ``Sleep_Efficiency``: ``Sleep Duration`` multiplied by ``Quality of Sleep``.
    * ``Cardiac_Stress_Index``: ``Stress Level`` multiplied by ``Heart Rate``.

    The input frame is not modified.
    """
    sleep_efficiency = df_clean["Sleep Duration"] * df_clean["Quality of Sleep"]
    cardiac_stress = df_clean["Stress Level"] * df_clean["Heart Rate"]

    # assign() builds a new frame, so the caller's frame stays untouched.
    return df_clean.assign(
        Sleep_Efficiency=sleep_efficiency,
        Cardiac_Stress_Index=cardiac_stress,
    )

In [None]:
# Add the engineered features and summarise their distributions.
df_features = features_dataset(df_clean)
df_features[["Sleep_Efficiency", "Cardiac_Stress_Index"]].describe()

Unnamed: 0,Sleep_Efficiency,Cardiac_Stress_Index
count,374.0,374.0
mean,52.994652,382.748663
std,13.932198,141.466528
min,23.2,195.0
25%,37.95,260.0
50%,54.6,350.0
75%,61.6,504.0
max,76.5,680.0


## Salvar no Drive

In [None]:
import os
from google.colab import drive

# Mount Google Drive so the output folder below is reachable from Colab.
drive.mount('/content/drive')

nome_da_pasta = "Ps Ligia Time 16"
caminho_base = "/content/drive/MyDrive/"
caminho_completo = os.path.join(caminho_base, nome_da_pasta)

# The existence check only selects the status message; exist_ok=True keeps
# the creation safe if the folder appears between the check and the call.
pasta_ja_existia = os.path.isdir(caminho_completo)
os.makedirs(caminho_completo, exist_ok=True)
if not pasta_ja_existia:
    print(f"Pasta '{nome_da_pasta}' criada com sucesso!")
else:
    print(f"A pasta '{nome_da_pasta}' já existe.")

# Base CSV
caminho_arquivo_base = os.path.join(caminho_completo, "risco_cardiovascular_base.csv")
df_clean.to_csv(caminho_arquivo_base, index=False)
print(f"Dataset salvo em: {caminho_arquivo_base}")

# CSV with the additional features. It gets its own variable so the base
# path is not overwritten and each saved file is reported.
caminho_arquivo_features = os.path.join(caminho_completo, "risco_cardiovascular_features.csv")
df_features.to_csv(caminho_arquivo_features, index=False)
print(f"Dataset salvo em: {caminho_arquivo_features}")

Mounted at /content/drive
Pasta 'Ps Ligia Time 16' criada com sucesso!
Dataset salvo em: /content/drive/MyDrive/Ps Ligia Time 16/risco_cardiovascular_features.csv
