## 1. Problem Statement

#### This project aims to create a model for predicting sleep disorders. This involves choosing the most suitable Machine Learning algorithm that ensures accurate and dependable results.


## 2. Import Libraries

In [2]:
import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')



## 3. Loading Dataset

In [3]:
df=pd.read_csv(r'C:\Users\12042\OneDrive\Documents\Dunni\Data Analysis\DA Projects\Python Projects\SQLifestyle.csv')

## 4. Overview of the Dataset

In [4]:
# A quick view of the first 5 rows 
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,No Sleep Disorder
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No Sleep Disorder
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No Sleep Disorder
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [5]:
# A quick overview of the last 5 rows 
df.tail()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
373,374,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [6]:
# Checking for the shape of the data
df.shape

(374, 13)

In [7]:
# Summary information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           374 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [8]:
# Summary information of the numeric columns
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


## 5. Data Preprocessing

### 5.1 Data Cleaning

In [9]:
#Counting the duplicate rows
df.duplicated().sum()

0

In [10]:
# Checking for missing values
df.isnull().sum()

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64

In [11]:
df['Sleep Disorder'].unique()

array(['No Sleep Disorder', 'Sleep Apnea', 'Insomnia'], dtype=object)

In [12]:
# Replacing 'Sleep Disorders' column with a string value of 'Yes' or 'No'

df['Sleep Disorder'].replace({'No Sleep Disorder': 'No', 'Sleep Apnea': 'Yes', 'Insomnia': 'Yes'}, inplace = True )

In [13]:
df['Sleep Disorder'].value_counts()

Sleep Disorder
No     219
Yes    155
Name: count, dtype: int64

In [14]:
# Counting the unique values in the "BMI Category" column

df['BMI Category'].unique()

array(['Overweight', 'Normal', 'Obese', 'Normal Weight'], dtype=object)

In [15]:
# Replacing 'Normal Weight' to 'Normal' and 'Obese' to 'Overweight'
df['BMI Category'].replace({'Normal Weight': 'Normal', 'Obese': 'Overweight'} , inplace = True)

In [16]:
df['BMI Category'].value_counts()

BMI Category
Normal        216
Overweight    158
Name: count, dtype: int64

In [17]:
# Renaming column names

df = df.rename (columns ={'BMI Category' : 'Body Mass Index', 'Physical Activity Level' : 'Excercise Duration'})

In [18]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,No
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No
3,4,Male,28,Sales Representative,5.9,4,30,8,Overweight,140/90,85,3000,Yes
4,5,Male,28,Sales Representative,5.9,4,30,8,Overweight,140/90,85,3000,Yes


### 5.2. Feature Selection

In [19]:
# Removing Features from the dataset
df = df.drop(['Person ID', 'Occupation','Daily Steps', ], axis=1)

In [20]:
df.head()

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Blood Pressure,Heart Rate,Sleep Disorder
0,Male,27,6.1,6,42,6,Overweight,126/83,77,No
1,Male,28,6.2,6,60,8,Normal,125/80,75,No
2,Male,28,6.2,6,60,8,Normal,125/80,75,No
3,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes
4,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes


### 5.3. Feature Engineering

In [21]:
# Split the 'Blood Pressure' column into two columns

df[['Systolic BP', 'Diastolic BP']] = df['Blood Pressure'].str.split('/', expand=True)
df

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Blood Pressure,Heart Rate,Sleep Disorder,Systolic BP,Diastolic BP
0,Male,27,6.1,6,42,6,Overweight,126/83,77,No,126,83
1,Male,28,6.2,6,60,8,Normal,125/80,75,No,125,80
2,Male,28,6.2,6,60,8,Normal,125/80,75,No,125,80
3,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes,140,90
4,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes,140,90
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95
370,Female,59,8.0,9,75,3,Overweight,140/95,68,Yes,140,95
371,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95
372,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95


In [22]:
# Convert the new columns to numeric type
df[['Systolic BP', 'Diastolic BP']] = df[['Systolic BP', 'Diastolic BP']].apply(pd.to_numeric)
df

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Blood Pressure,Heart Rate,Sleep Disorder,Systolic BP,Diastolic BP
0,Male,27,6.1,6,42,6,Overweight,126/83,77,No,126,83
1,Male,28,6.2,6,60,8,Normal,125/80,75,No,125,80
2,Male,28,6.2,6,60,8,Normal,125/80,75,No,125,80
3,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes,140,90
4,Male,28,5.9,4,30,8,Overweight,140/90,85,Yes,140,90
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95
370,Female,59,8.0,9,75,3,Overweight,140/95,68,Yes,140,95
371,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95
372,Female,59,8.1,9,75,3,Overweight,140/95,68,Yes,140,95


In [23]:
# Removing Features from the dataset
df = df.drop(['Blood Pressure'], axis=1)

In [24]:
df

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Heart Rate,Sleep Disorder,Systolic BP,Diastolic BP
0,Male,27,6.1,6,42,6,Overweight,77,No,126,83
1,Male,28,6.2,6,60,8,Normal,75,No,125,80
2,Male,28,6.2,6,60,8,Normal,75,No,125,80
3,Male,28,5.9,4,30,8,Overweight,85,Yes,140,90
4,Male,28,5.9,4,30,8,Overweight,85,Yes,140,90
...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,8.1,9,75,3,Overweight,68,Yes,140,95
370,Female,59,8.0,9,75,3,Overweight,68,Yes,140,95
371,Female,59,8.1,9,75,3,Overweight,68,Yes,140,95
372,Female,59,8.1,9,75,3,Overweight,68,Yes,140,95


### 5.3. Feature Encoding

In [25]:
# Encoding Gender column
df['Gender']= df['Gender'].map({'Female': 0, 'Male': 1})

In [26]:
# Encoding Body Mass Index column
df['Body Mass Index']= df['Body Mass Index'].map({'Normal': 0, 'Overweight': 1})

In [27]:
# Encoding Sleep Disorder column using scikit Learn label Encoder
from sklearn.preprocessing import LabelEncoder

In [28]:
# creating Label Encoder Instance
label_encoder = LabelEncoder()

In [29]:
# Fit and transform the 'Sleep Disorder' column
df['Sleep Disorder'] = label_encoder.fit_transform(df['Sleep Disorder'])
df

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Heart Rate,Sleep Disorder,Systolic BP,Diastolic BP
0,1,27,6.1,6,42,6,1,77,0,126,83
1,1,28,6.2,6,60,8,0,75,0,125,80
2,1,28,6.2,6,60,8,0,75,0,125,80
3,1,28,5.9,4,30,8,1,85,1,140,90
4,1,28,5.9,4,30,8,1,85,1,140,90
...,...,...,...,...,...,...,...,...,...,...,...
369,0,59,8.1,9,75,3,1,68,1,140,95
370,0,59,8.0,9,75,3,1,68,1,140,95
371,0,59,8.1,9,75,3,1,68,1,140,95
372,0,59,8.1,9,75,3,1,68,1,140,95


In [30]:
# Defining Features into groups (X and y)

X = df.drop(columns = ['Sleep Disorder'])     ## Predictor variables
y = df['Sleep Disorder']  ## Target variable

print(X.shape)
print(y.shape)

(374, 10)
(374,)


In [31]:
# Initialializing StandardScaler
from sklearn.preprocessing import StandardScaler

In [32]:
# Create the Standard Scalar instance
scaler = StandardScaler()

In [33]:
# Fit the scaler to the data and transform the data
scaled_x = scaler.fit_transform(X)

In [34]:
# Importing Model
from sklearn.model_selection import train_test_split

In [35]:
# Splitting Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.20, random_state = 42)

### 6. Building Models

### 6.1. Import Models and Analysis

#### 6.1.1 Logistic Regression

In [36]:
# Initialializing Logistic Regression Model
from sklearn.linear_model import LogisticRegression

In [37]:
# Create an instance
logr = LogisticRegression()

In [38]:
# Fit the model to the training data 
logr.fit(X_train, y_train)

In [39]:
# Making Prediction using the model
y_pred_logr = logr.predict(X_test)

 #### 6.1.2 Support Vector Machine

In [40]:
# Initialializing Support Vector Machine  Model
from sklearn.svm import SVC

In [41]:
# Create an instance
svm = SVC()

In [42]:
# Fit the model to training data
svm.fit(X_train, y_train)

In [43]:
# Making prediction
y_pred_svm = svm.predict(X_test)

#### 6.1.3 Random Forest Classification

In [44]:
# Initializing Random Forest Model
from sklearn.ensemble import RandomForestClassifier

In [45]:
# create an instance
rf = RandomForestClassifier()

In [46]:
# fit the model
rf.fit(X_train, y_train)

In [47]:
# Making  prediction
y_pred_rf = rf.predict(X_test)

#### 6.1.4 Gradient Boosting Classifier

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

In [49]:
gb = GradientBoostingClassifier()

In [50]:
gb.fit(X_train, y_train)

In [51]:
y_pred_gb =gb.predict(X_test)

## 7. Metric Evaluation

### 7.1 Accuracy Score

In [52]:
from sklearn.metrics import accuracy_score

In [53]:
# Acccuracy Score for Logistic Regression
accuracy_score(y_test, y_pred_logr)


0.9333333333333333

In [54]:
# Acccuracy Score for SVM
accuracy_score(y_test, y_pred_svm)

0.8266666666666667

In [55]:
# Acccuracy Score for Random Forest Classifier
accuracy_score(y_test, y_pred_rf)

0.96

In [56]:
# Acccuracy Score for Gradient Boosting Classifier
accuracy_score(y_test, y_pred_gb)

0.96

### 7.2 Classification Report

In [57]:
from sklearn.metrics import classification_report

In [58]:
print(classification_report(y_test, y_pred_logr))


              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.91      0.94      0.92        32

    accuracy                           0.93        75
   macro avg       0.93      0.93      0.93        75
weighted avg       0.93      0.93      0.93        75



In [59]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.77      1.00      0.87        43
           1       1.00      0.59      0.75        32

    accuracy                           0.83        75
   macro avg       0.88      0.80      0.81        75
weighted avg       0.87      0.83      0.82        75



In [60]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.97      0.94      0.95        32

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75



In [61]:
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.97      0.94      0.95        32

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75



### 7.3 Performance Metrics

In [62]:
from sklearn.metrics import precision_score, recall_score, f1_score

#### 7.3.1. Precision Score

In [63]:
# Precision Score for Logistic Regression Classifier
precision_score(y_test, y_pred_logr)


0.9090909090909091

In [64]:
# Precision Score for SVM Classifier
precision_score(y_test, y_pred_svm)


1.0

In [65]:
# Precision Score for Random Forest Classifier
precision_score(y_test, y_pred_rf)


0.967741935483871

In [66]:
# Precision Score for  Gradient Boosting Classifier
precision_score(y_test, y_pred_gb)

0.967741935483871

#### 7.3.2 Recall Score

In [67]:
# Recall Score for Logistic Regression Classifier
recall_score(y_test, y_pred_logr)

0.9375

In [68]:
# Recall Score for SVM Classifier
recall_score(y_test, y_pred_svm)

0.59375

In [69]:
# Recall Score for Random Forest Classifier
recall_score(y_test, y_pred_rf)

0.9375

In [70]:
# Recall Score for Gradient Boosting Classifier
recall_score(y_test, y_pred_gb)

0.9375

#### 7.3.3. F1 Score

In [71]:
# F1 Score for Logistic Regression Classifier
f1_score(y_test, y_pred_logr)

0.923076923076923

In [72]:
# F1 Score for SVM Classifier
f1_score(y_test, y_pred_svm)

0.7450980392156863

In [73]:
# F1 Score for Random Forest Classifier
f1_score(y_test, y_pred_rf)

0.9523809523809523

In [74]:
# F1 Score for Gradient Boosting Classifier
f1_score(y_test, y_pred_gb)

0.9523809523809523

In [75]:
X

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Heart Rate,Systolic BP,Diastolic BP
0,1,27,6.1,6,42,6,1,77,126,83
1,1,28,6.2,6,60,8,0,75,125,80
2,1,28,6.2,6,60,8,0,75,125,80
3,1,28,5.9,4,30,8,1,85,140,90
4,1,28,5.9,4,30,8,1,85,140,90
...,...,...,...,...,...,...,...,...,...,...
369,0,59,8.1,9,75,3,1,68,140,95
370,0,59,8.0,9,75,3,1,68,140,95
371,0,59,8.1,9,75,3,1,68,140,95
372,0,59,8.1,9,75,3,1,68,140,95


## 8. Making Prediction for the Entire Dataset

In [76]:
# Training the Entire dataset using Random Forest Classifier

X = df.drop(columns = ['Sleep Disorder'])     ## Predictor variables
y = df['Sleep Disorder']  ## Target variable


print(X.shape)
print(y.shape)

(374, 10)
(374,)


In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
rf = RandomForestClassifier()

In [79]:
rf.fit(X,y)

In [80]:
# Making prediction on a new data

new_data = pd.DataFrame({
    'Gender': 0,
    'Age': 47,
    'Sleep Duration': 5.7,
    'Quality of Sleep': 7,
    'Excercise Duration' : 30,
    'Stress Level' : 5,
    'Body Mass Index' : 0,
    'Heart Rate' : 77,
    'Systolic BP': 130,
    'Diastolic BP' : 95 }, 
    index = [0]
    
)

In [81]:
new_data

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Excercise Duration,Stress Level,Body Mass Index,Heart Rate,Systolic BP,Diastolic BP
0,0,47,5.7,7,30,5,0,77,130,95


In [82]:
sd = rf.predict(new_data)
if sd==[0]:
    print("You have no Sleep Disorders")
else: 
    print("You may have Sleep Disorders")

You may have Sleep Disorders


In [83]:
# Making prediction on a new data

new_data1 = pd.DataFrame({
    'Gender': 1,
    'Age': 28,
    'Sleep Duration': 6.2,
    'Quality of Sleep': 6,
    'Excercise Duration' : 60,
    'Stress Level' : 8,
    'Body Mass Index' : 0,
    'Heart Rate' : 75,
    'Systolic BP': 125,
    'Diastolic BP' : 80 }, 
    index = [0]
    
)

In [84]:
sd = rf.predict(new_data1)
if sd==[0]:
    print("You have no Sleep Disorders")
else: 
    print("You may have Sleep Disorders")

You have no Sleep Disorders


## 9. Saving Model

In [85]:
import joblib

In [86]:
# Save the encoded data to a joblib file
model_joblib_sd = 'final_model.sav'
joblib.dump(rf, 'model_joblib_sd')

['model_joblib_sd']