In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

### Reading the data

In [3]:
# Load the sleep health and lifestyle dataset into a DataFrame.
# NOTE(review): read_csv treats the literal string 'None' as a missing value by
# default; if the raw file encodes "no disorder" that way, those rows load as
# NaN -- confirm against the CSV.
csv_path = '../data/sleep_health_lifestyle.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the raw data (5 is also head()'s default).
df.head(n=5)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


### Data Preparation
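Let's prepare the data for analysis: normalize the column names, inspect the numeric and categorical columns, and convert `blood_pressure` into numeric features.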

##### Make all the column names lowercase and replace spaces with underscores

In [5]:
# Normalize the headers to snake_case: lowercase each name and swap spaces
# for underscores, then display the resulting column index.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.columns

Index(['person_id', 'gender', 'age', 'occupation', 'sleep_duration',
       'quality_of_sleep', 'physical_activity_level', 'stress_level',
       'bmi_category', 'blood_pressure', 'heart_rate', 'daily_steps',
       'sleep_disorder'],
      dtype='object')

Let's look at the data to find missing values and other summary statistics.

Numeric columns:

In [6]:
# Summary statistics; on a mixed-dtype frame describe() reports only the
# numeric columns by default.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,person_id,age,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,heart_rate,daily_steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


Some observations:
- **Age**: Individuals in the dataset are between 27 and 59 years old.
- **Sleep Duration**: Observed range is 5.8-8.5 hours.
- **Quality of Sleep**: A subjective rating of the quality of sleep on a scale of 1 to 10. Observed range: 4-9.
- **Physical Activity Level**: The number of minutes the person engages in physical activity daily. Observed range: 30-90 mins (everyone exercises!).
- **Stress Level**: A subjective rating of the stress level experienced by the person on a scale of 1 to 10. Observed range: 3-8.
- **Heart Rate**: The resting heart rate of the person in beats per minute. Observed range: 65-86 bpm.
- **Daily Steps**: The number of steps the person takes per day. Observed range: 3,000-10,000.

Let's separate the numeric columns from the categorical ones.

In [7]:
# Show the dtype pandas inferred for each column.
column_dtypes = df.dtypes
column_dtypes

person_id                    int64
gender                      object
age                          int64
occupation                  object
sleep_duration             float64
quality_of_sleep             int64
physical_activity_level      int64
stress_level                 int64
bmi_category                object
blood_pressure              object
heart_rate                   int64
daily_steps                  int64
sleep_disorder              object
dtype: object

In [8]:
# Collect every numeric column by dtype family instead of matching the exact
# strings 'int64'/'float64', so integer or float columns of another width are
# not silently left out.
# Note: the result includes the `person_id` identifier column.
numerical = df.select_dtypes(include='number').columns
numerical

Index(['person_id', 'age', 'sleep_duration', 'quality_of_sleep',
       'physical_activity_level', 'stress_level', 'heart_rate', 'daily_steps'],
      dtype='object')

In [9]:
# Columns stored with the generic `object` dtype (the text-valued columns).
is_object_column = df.dtypes == 'object'
categorical = df.columns[is_object_column]
categorical

Index(['gender', 'occupation', 'bmi_category', 'blood_pressure',
       'sleep_disorder'],
      dtype='object')

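Let's understand the distribution of labels in the **categorical columns**. Note that `value_counts()` omits missing values by default, so a column with missing entries (here `sleep_disorder`, which lists only 155 of the 374 rows) sums to fewer than the total row count.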

In [10]:
# Print the frequency table of every categorical column, each followed by a
# blank line. value_counts() leaves out missing values by default.
for column_name in categorical:
    label_counts = df[column_name].value_counts()
    print(label_counts, end='\n\n')

gender
Male      189
Female    185
Name: count, dtype: int64

occupation
Nurse                   73
Doctor                  71
Engineer                63
Lawyer                  47
Teacher                 40
Accountant              37
Salesperson             32
Scientist                4
Software Engineer        4
Sales Representative     2
Manager                  1
Name: count, dtype: int64

bmi_category
Normal           195
Overweight       148
Normal Weight     21
Obese             10
Name: count, dtype: int64

blood_pressure
130/85    99
140/95    65
125/80    65
120/80    45
115/75    32
135/90    27
140/90     4
125/82     4
128/85     3
132/87     3
117/76     2
130/86     2
126/83     2
119/77     2
135/88     2
128/84     2
131/86     2
139/91     2
115/78     2
129/84     2
142/92     2
118/75     2
118/76     1
122/80     1
121/79     1
Name: count, dtype: int64

sleep_disorder
Sleep Apnea    78
Insomnia       77
Name: count, dtype: int64



From the above categorical variables, `blood_pressure` would be more useful as a numeric variable. Let's split it into numeric `systolic` and `diastolic` columns, which can then be used for feature engineering to create a new variable called **`pulse_pressure`** (systolic minus diastolic).

In [13]:
# Break each 'systolic/diastolic' reading at the slash into two string
# columns (labelled 0 and 1).
bp_readings = df['blood_pressure']
bp_split = bp_readings.str.split('/', expand=True)

In [14]:
# Column 0 of the split holds the systolic value and column 1 the diastolic
# value; store each on the frame as an integer column.
for position, column_name in enumerate(['systolic', 'diastolic']):
    df[column_name] = bp_split[position].astype(int)

Let's now drop the `blood_pressure` column from the dataset and recompute our **numerical** and **categorical** column lists.

In [16]:
# The raw 'blood_pressure' text is now redundant with systolic/diastolic.
df = df.drop(columns='blood_pressure')

In [17]:
# Confirm the new systolic/diastolic columns replaced blood_pressure.
df.head(n=5)

Unnamed: 0,person_id,gender,age,occupation,sleep_duration,quality_of_sleep,physical_activity_level,stress_level,bmi_category,heart_rate,daily_steps,sleep_disorder,systolic,diastolic
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,3,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [19]:
# Recompute the column groups now that `blood_pressure` has been replaced by
# `systolic` and `diastolic`.
# Numeric columns are selected by dtype family: `astype(int)` produces the
# platform's default integer width, which is not guaranteed to be int64
# (e.g. 32-bit on Windows with NumPy 1.x), so an exact 'int64' comparison
# could silently drop the new columns.
numerical = df.select_dtypes(include='number').columns
# Text-valued columns are the ones stored with the `object` dtype.
categorical = df.columns[df.dtypes == 'object']
numerical, categorical

(Index(['person_id', 'age', 'sleep_duration', 'quality_of_sleep',
        'physical_activity_level', 'stress_level', 'heart_rate', 'daily_steps',
        'systolic', 'diastolic'],
       dtype='object'),
 Index(['gender', 'occupation', 'bmi_category', 'sleep_disorder'], dtype='object'))

### Exploratory Data Analysis