# **Preprocessing Steps for Sleep Health Dataset**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# 1. Load Dataset
# NOTE: `uploaded` is produced by the Colab upload cell (`files.upload()`), which
# appears further down in this notebook and must be run before this cell.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell and select a CSV file first.")
file_name = next(iter(uploaded))  # name of the first uploaded file
df = pd.read_csv(file_name)

# Quick look
print(df.head())
df.info()  # info() prints its report itself and returns None; print() would add a stray "None"
print(df.describe())

   Person ID  Gender  Age Occupation  Sleep Duration  Physical Activity Level  \
0          1    Male   25   Engineer             6.5                       30   
1          2  Female   30     Doctor             7.0                       45   
2          3    Male   35    Teacher             8.0                       60   
3          4  Female   40      Nurse             5.5                       20   
4          5    Male   45     Lawyer             6.0                       35   

   Stress Level BMI Category Blood Pressure Sleep Disorder  
0             5       Normal         120/80            NaN  
1             4   Overweight         130/85       Insomnia  
2             3        Obese         140/90    Sleep Apnea  
3             6       Normal         110/75            NaN  
4             5   Overweight         135/88       Insomnia  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column                   Non-Null Count

In [None]:
from google.colab import files

# Open Colab's upload dialog. The result maps each uploaded file name to the
# file's raw bytes.
uploaded = files.upload()

# Report the size of every uploaded file. len() is essential: interpolating
# uploaded[fn] directly would dump the entire file content into the output
# instead of its length.
for fn, content in uploaded.items():
    print(f'User uploaded file "{fn}" with length {len(content)} bytes')

Saving sleep_health_and_lifestyle_dataset.csv to sleep_health_and_lifestyle_dataset (1).csv
User uploaded file "sleep_health_and_lifestyle_dataset (1).csv" with length b'Person ID,Gender,Age,Occupation,Sleep Duration,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Sleep Disorder\n1,Male,25,Engineer,6.5,30,5,Normal,120/80,None\n2,Female,30,Doctor,7.0,45,4,Overweight,130/85,Insomnia\n3,Male,35,Teacher,8.0,60,3,Obese,140/90,Sleep Apnea\n4,Female,40,Nurse,5.5,20,6,Normal,110/75,None\n5,Male,45,Lawyer,6.0,35,5,Overweight,135/88,Insomnia\n6,Female,50,Scientist,7.5,50,2,Obese,145/95,Sleep Apnea\n7,Male,55,Artist,8.5,70,3,Normal,118/78,None\n8,Female,60,Manager,6.0,25,6,Overweight,125/82,Insomnia\n9,Male,28,Engineer,7.0,40,4,Obese,138/92,Sleep Apnea\n10,Female,33,Doctor,6.5,55,5,Normal,115/76,None\n11,Male,37,Teacher,8.0,65,3,Overweight,128/84,Insomnia\n12,Female,42,Nurse,5.0,30,6,Obese,142/96,Sleep Apnea\n13,Male,47,Lawyer,7.5,50,4,Normal,122/80,None\n14,Female,52,Scientist,6

# **Check Missing Values**

In [None]:
# Check null values
print(df.isnull().sum())

# 'Sleep Disorder' is the only column that reports nulls. In the CSV those rows
# hold the literal text "None" (meaning "no disorder"), which pandas parses as a
# missing value. Restore it as an explicit category so the target has a real
# "None" class instead of NaN.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('None')

# Option: fill missing values
df['Sleep Duration'] = df['Sleep Duration'].fillna(df['Sleep Duration'].mean())  # Example for numerical
df['Occupation'] = df['Occupation'].fillna(df['Occupation'].mode()[0])           # Example for categorical


Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Sleep Disorder             7
dtype: int64


# **Encode Categorical Features**

In [None]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_cols)

# Label encode binary categories
le = LabelEncoder()
# LabelEncoder assigns codes in sorted class order, so Female -> 0 and Male -> 1.
df['Gender'] = le.fit_transform(df['Gender'])

# 'Blood Pressure' holds "systolic/diastolic" strings such as "120/80". One-hot
# encoding it would create a separate column for almost every distinct reading,
# so parse it into two numeric features instead. Values that do not match the
# pattern become NaN rather than raising.
bp_parts = df['Blood Pressure'].astype(str).str.extract(r'^\s*(\d+)\s*/\s*(\d+)\s*$')
df['Systolic BP'] = pd.to_numeric(bp_parts[0], errors='coerce')
df['Diastolic BP'] = pd.to_numeric(bp_parts[1], errors='coerce')
df = df.drop(columns=['Blood Pressure'])

# One-hot encode the genuinely categorical multi-class features.
# drop_first=True drops one level per feature to avoid redundant columns.
df = pd.get_dummies(df, columns=['Occupation', 'BMI Category'], drop_first=True)


Categorical Columns: Index(['Gender', 'Occupation', 'BMI Category', 'Blood Pressure',
       'Sleep Disorder'],
      dtype='object')


# **Outlier Detection & Handling**

In [None]:
# Example: Using IQR for 'Sleep Duration'
# Rows whose value lies more than 1.5 * IQR outside the quartiles are treated as outliers.
Q1 = df['Sleep Duration'].quantile(0.25)
Q3 = df['Sleep Duration'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers (both bounds are inclusive; rows with a missing value are dropped too).
# .copy() yields an independent frame, so later column assignments on `df` do not
# raise pandas' SettingWithCopyWarning when rows were filtered out here.
in_range = df['Sleep Duration'].between(lower_bound, upper_bound)
df = df[in_range].copy()


# **Feature Scaling**

In [None]:
# Standardize the selected numeric columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on the whole frame before the train/test split
# below, so test-set statistics influence the scaling; consider fitting on the
# training portion only.
num_cols = ['Sleep Duration', 'Physical Activity Level', 'Stress Level']  # example columns
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])


# **Train-Test Split**

In [None]:
# Define X, y
y = df['Sleep Disorder']                  # Target variable
# Features: everything except the target. 'Person ID' is a row identifier, not a
# predictor, so it is excluded as well; errors='ignore' keeps this working if the
# column has already been removed.
X = df.drop(columns=['Sleep Disorder', 'Person ID'], errors='ignore')

# Encode target
# NOTE: this refits `le`, replacing the Gender mapping it learned earlier;
# afterwards `le.classes_` describes the Sleep Disorder classes.
y = le.fit_transform(y)

# Split: 80% train / 20% test, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (16, 34)
Test shape: (4, 34)


# **These are the preprocessing steps performed**


Handling missing values

Encoding categorical variables

Outlier removal

Scaling numerical features

Splitting into train/test







