### Importing Required Packages

In [1]:
# Use Erdos_Spring_2025 conda env as created
# pip install kagglehub[pandas-datasets]
import warnings
from pathlib import Path

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kagglehub import KaggleDatasetAdapter
warnings.filterwarnings("ignore")

### Loading the Dataset from Kaggle

Here, we define the relative path of the dataset file and use `kagglehub.load_dataset()` to fetch the latest version of the **Dry Eye Disease Dataset** from Kaggle.

- Dataset: `"dakshnagra/dry-eye-disease"`
- File: `"Dry_Eye_Dataset.csv"`

In [2]:
# Name of the CSV file inside the Kaggle dataset
file_path = "Dry_Eye_Dataset.csv"

# Fetch the latest version of that file through the pandas adapter
df0 = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, "dakshnagra/dry-eye-disease", file_path)



### Save the dataset locally in 'Data' directory 

In [3]:

fname = '../Data/dry_eye_disease_rawfile.csv'
# to_csv does not create missing parent folders, so make sure the directory exists first
Path(fname).parent.mkdir(parents=True, exist_ok=True)
df0.to_csv(fname, index=False)

### Read data from Data directory


In [5]:
# Reload the raw data from the local copy written above
df0 = pd.read_csv(filepath_or_buffer=fname)

### Display the first five rows


In [6]:
# Preview the first five rows (head() defaults to n=5)
df0.head()

Unnamed: 0,Gender,Age,Sleep duration,Sleep quality,Stress level,Blood pressure,Heart rate,Daily steps,Physical activity,Height,...,Smoking,Medical issue,Ongoing medication,Smart device before bed,Average screen time,Blue-light filter,Discomfort Eye-strain,Redness in eye,Itchiness/Irritation in eye,Dry Eye Disease
0,F,24,9.5,2,1,137/89,67,3000,31,161,...,N,Y,Y,N,8.7,N,Y,Y,N,Y
1,M,39,9.6,2,3,108/64,60,12000,74,164,...,N,Y,Y,N,9.6,Y,N,N,Y,Y
2,F,45,5.4,1,5,134/81,95,12000,93,179,...,N,N,N,Y,4.0,N,Y,N,N,N
3,F,45,5.4,4,5,110/90,78,19000,32,160,...,N,Y,N,N,7.6,N,Y,N,Y,N
4,F,42,5.7,3,2,99/67,72,4000,173,179,...,N,Y,N,N,3.5,N,Y,Y,N,Y


In [45]:
# Work on a copy so the raw frame `df0` stays untouched
df = df0.copy()

# Summary statistics of the numerical columns
numeric_summary = df.describe()
print(numeric_summary)

                Age  Sleep duration  Sleep quality  Stress level  \
count  20000.000000    20000.000000   20000.000000  20000.000000   
mean      31.422800        6.998245       2.997250      2.993750   
std        8.103717        1.731723       1.412283      1.407235   
min       18.000000        4.000000       1.000000      1.000000   
25%       24.000000        5.500000       2.000000      2.000000   
50%       31.000000        7.000000       3.000000      3.000000   
75%       39.000000        8.500000       4.000000      4.000000   
max       45.000000       10.000000       5.000000      5.000000   

         Heart rate   Daily steps  Physical activity        Height  \
count  20000.000000  20000.000000       20000.000000  20000.000000   
mean      79.912200  10536.900000          90.069750    174.865900   
std       11.808279   5752.729186          52.317283     14.719903   
min       60.000000   1000.000000           0.000000    150.000000   
25%       70.000000   6000.000000    

### Convert all column names to lower case and replace spaces with '_'

In [None]:
# Normalise column labels: lower-case first, then swap spaces for underscores
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(' ', '_')

### Split the blood pressure into **systolic** and **diastolic** parts. Then categorize according to systolic and diastolic levels. 

In [47]:
# Break the 'systolic/diastolic' text into its two parts, then store them as integer columns
bp_parts = df['blood_pressure'].str.split('/', expand=True)
df[['systolic', 'diastolic']] = bp_parts.astype(int)

# Define the Blood Pressure Category
def categorize_bp(row):
    """Classify one blood-pressure reading into a named category.

    Parameters
    ----------
    row : mapping
        Must provide 'systolic' and 'diastolic' entries convertible to int
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        One of 'hypertensive_crisis', 'hypertension_stage_2',
        'hypertension_stage_1', 'elevated', 'hypotension' or 'normal'.

    Notes
    -----
    Categories are tested from most to least severe. When the systolic and
    diastolic values fall into different categories the higher one wins
    (e.g. 150/85 is stage 2, not stage 1).
    """
    systolic = int(row['systolic'])
    diastolic = int(row['diastolic'])

    # Most severe first, so a single high value is never masked by a milder branch
    if systolic >= 180 or diastolic >= 120:
        return 'hypertensive_crisis'
    if systolic >= 140 or diastolic >= 90:
        return 'hypertension_stage_2'
    if systolic >= 130 or diastolic >= 80:
        return 'hypertension_stage_1'
    # From here on diastolic < 80 and systolic < 130 are guaranteed
    if systolic >= 120:
        return 'elevated'
    if systolic < 90 and diastolic < 60:
        return 'hypotension'
    return 'normal'
    
# Label every row with its blood-pressure category
df['bp_category'] = df.apply(categorize_bp, axis=1)

# Report how many distinct categories actually occur in the data
n_bp_categories = df['bp_category'].nunique()
print('Types of variables in bp_category:', n_bp_categories)

# The raw text column is no longer needed once it has been split and categorised
df = df.drop(columns=['blood_pressure'])

Types of variables in bp_category: 4


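### The data has no well-defined **Insomnia** variable, so we construct a target 'insomnia' from five available features: reported sleep disorder, sleep duration below 6, sleep quality of 3 or lower, feeling sleepy during the day, and waking up during the night. If **3 or more** of these conditions are met, the function returns 'Y'; otherwise it returns 'N'.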

In [48]:
# Derive the insomnia label from five sleep-related indicators
def define_insomnia(row):
    """Return 'Y' when at least three of five insomnia indicators hold, else 'N'.

    The indicators read from ``row`` are:
    - 'sleep_disorder' equals 'Y'
    - 'sleep_duration' is below 6
    - 'sleep_quality' is 3 or lower
    - 'feel_sleepy_during_day' equals 'Y'
    - 'wake_up_during_night' equals 'Y'
    """
    indicators = (
        row['sleep_disorder'] == 'Y',
        row['sleep_duration'] < 6,
        row['sleep_quality'] <= 3,
        row['feel_sleepy_during_day'] == 'Y',
        row['wake_up_during_night'] == 'Y',
    )
    # Count how many indicators are satisfied
    criteria_met = sum(1 for flag in indicators if flag)
    return 'Y' if criteria_met >= 3 else 'N'

# Evaluate the insomnia rule row by row and store the result in a new 'insomnia' column
df['insomnia'] = df.apply(define_insomnia, axis='columns')

### Make another category **Combined Condition** based on 'Dry Eye Disease' and 'Insomnia'

In [49]:
# Define the combined insomnia / dry-eye-disease label
def combined_condition(row):
    """Merge the 'insomnia' and 'dry_eye_disease' flags of a row into one label.

    Returns 'insomnia' for ('Y', 'N'), 'dry_eye_disease' for ('N', 'Y'),
    'both' for ('Y', 'Y') and 'none' for every other combination.
    """
    labels = {
        ('Y', 'N'): 'insomnia',
        ('N', 'Y'): 'dry_eye_disease',
        ('Y', 'Y'): 'both',
    }
    flags = (row['insomnia'], row['dry_eye_disease'])
    # Anything that is not one of the three pairs above falls back to 'none'
    return labels.get(flags, 'none')

# Evaluate the combined label row by row and store it in a new 'combined_condition' column
df['combined_condition'] = df.apply(combined_condition, axis='columns')

### Compute the **Body Mass Index (BMI)** for each individual in the dataset using their weight and height.

The formula used is:

$\text{BMI} = \frac{\text{Weight (kg)}}{(\text{Height (m)})^2}$

Since height is stored in centimeters, we divide it by 100 to convert to meters before squaring. The result is stored in a new column called `'bmi'`.


In [50]:
# BMI = weight (kg) / height (m)^2; height is stored in cm, so convert it to metres first
height_m = df['height'] / 100.
df['bmi'] = df['weight'] / height_m ** 2.
df['bmi']

0        26.619343
1        32.346817
2        29.337411
3        30.078125
4        30.897912
           ...    
19995    18.000000
19996    25.246548
19997    25.147416
19998    22.837370
19999    17.541874
Name: bmi, Length: 20000, dtype: float64

### Save dataset with updated columns


In [None]:
fname_fin = '../Data/dry_eye_disease_parsed.csv'
# to_csv does not create missing parent folders, so make sure the directory exists first
Path(fname_fin).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(fname_fin, index=False)