# **ETL – Lifestyle & Wellbeing Dataset**

## Objectives

* Objectives
- Load the Lifestyle & Wellbeing dataset from the local `dataset/raw` folder.
- Clean and transform the data (types, missing values, basic feature renaming).
- Save a cleaned version into `dataset/clean` for later analysis.

## Inputs
- `../dataset/raw/Wellbeing_and_lifestyle_data_Kaggle.csv`

## Outputs
- `../dataset/clean/lifestyle_wellbeing_clean.csv`

## Additional Comments

* TBD (currently none)



---

TEST

In [1]:
import pandas as pd
import numpy as np

raw_path = "../dataset/raw/Wellbeing_and_lifestyle_data_Kaggle.csv"  # change if needed

df_raw = pd.read_csv(raw_path)

df_raw.head()

Unnamed: 0,Timestamp,FRUITS_VEGGIES,DAILY_STRESS,PLACES_VISITED,CORE_CIRCLE,SUPPORTING_OTHERS,SOCIAL_NETWORK,ACHIEVEMENT,DONATION,BMI_RANGE,...,SLEEP_HOURS,LOST_VACATION,DAILY_SHOUTING,SUFFICIENT_INCOME,PERSONAL_AWARDS,TIME_FOR_PASSION,WEEKLY_MEDITATION,AGE,GENDER,WORK_LIFE_BALANCE_SCORE
0,7/7/15,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,7/7/15,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,7/7/15,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,7/7/15,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,7/7/15,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [6]:
df_raw.info()
df_raw.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15972 entries, 0 to 15971
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Timestamp                15972 non-null  object 
 1   FRUITS_VEGGIES           15972 non-null  int64  
 2   DAILY_STRESS             15972 non-null  object 
 3   PLACES_VISITED           15972 non-null  int64  
 4   CORE_CIRCLE              15972 non-null  int64  
 5   SUPPORTING_OTHERS        15972 non-null  int64  
 6   SOCIAL_NETWORK           15972 non-null  int64  
 7   ACHIEVEMENT              15972 non-null  int64  
 8   DONATION                 15972 non-null  int64  
 9   BMI_RANGE                15972 non-null  int64  
 10  TODO_COMPLETED           15972 non-null  int64  
 11  FLOW                     15972 non-null  int64  
 12  DAILY_STEPS              15972 non-null  int64  
 13  LIVE_VISION              15972 non-null  int64  
 14  SLEEP_HOURS           

Index(['Timestamp', 'FRUITS_VEGGIES', 'DAILY_STRESS', 'PLACES_VISITED',
       'CORE_CIRCLE', 'SUPPORTING_OTHERS', 'SOCIAL_NETWORK', 'ACHIEVEMENT',
       'DONATION', 'BMI_RANGE', 'TODO_COMPLETED', 'FLOW', 'DAILY_STEPS',
       'LIVE_VISION', 'SLEEP_HOURS', 'LOST_VACATION', 'DAILY_SHOUTING',
       'SUFFICIENT_INCOME', 'PERSONAL_AWARDS', 'TIME_FOR_PASSION',
       'WEEKLY_MEDITATION', 'AGE', 'GENDER', 'WORK_LIFE_BALANCE_SCORE'],
      dtype='object')

In [11]:
df = df_raw.copy()

# Renaming to snake_case, easier to work with later
df = df.rename(columns={
    "Timestamp": "timestamp",
    "FRUITS_VEGGIES": "fruits_veggies",
    "DAILY_STRESS": "daily_stress",
    "PLACES_VISITED": "places_visited",
    "CORE_CIRCLE": "core_circle",
    "SUPPORTING_OTHERS": "supporting_others",
    "SOCIAL_NETWORK": "social_network",
    "ACHIEVEMENT": "achievement",
    "DONATION": "donation",
    "BMI_RANGE": "bmi_range",
    "TODO_COMPLETED": "todo_completed",
    "FLOW": "flow",
    "DAILY_STEPS": "daily_steps",
    "LIVE_VISION": "live_vision",
    "SLEEP_HOURS": "sleep_hours",
    "LOST_VACATION": "lost_vacation",
    "DAILY_SHOUTING": "daily_shouting",
    "SUFFICIENT_INCOME": "sufficient_income",
    "PERSONAL_AWARDS": "personal_awards",
    "TIME_FOR_PASSION": "time_for_passion",
    "WEEKLY_MEDITATION": "weekly_meditation",
    "AGE": "age",
    "GENDER": "gender",
    "WORK_LIFE_BALANCE_SCORE": "work_life_balance_score"
})

df.head()


Unnamed: 0,timestamp,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
0,7/7/15,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,7/7/15,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,7/7/15,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,7/7/15,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,7/7/15,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [16]:
# Converting timestamp
df["timestamp"] = pd.to_datetime(df["timestamp"])

numeric_cols = [
    "fruits_veggies",
    "daily_stress",
    "places_visited",
    "core_circle",
    "supporting_others",
    "social_network",
    "achievement",
    "donation",
    "bmi_range",
    "todo_completed",
    "flow",
    "daily_steps",
    "live_vision",
    "sleep_hours",
    "lost_vacation",
    "daily_shouting",
    "sufficient_income",
    "personal_awards",
    "time_for_passion",
    "weekly_meditation",
    "age",
    "work_life_balance_score"
]

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15972 entries, 0 to 15971
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                15972 non-null  datetime64[ns]
 1   fruits_veggies           15972 non-null  int64         
 2   daily_stress             15971 non-null  float64       
 3   places_visited           15972 non-null  int64         
 4   core_circle              15972 non-null  int64         
 5   supporting_others        15972 non-null  int64         
 6   social_network           15972 non-null  int64         
 7   achievement              15972 non-null  int64         
 8   donation                 15972 non-null  int64         
 9   bmi_range                15972 non-null  int64         
 10  todo_completed           15972 non-null  int64         
 11  flow                     15972 non-null  int64         
 12  daily_steps              15972 n

  df["timestamp"] = pd.to_datetime(df["timestamp"])


In [26]:
# Checking for duplicates
print("Duplicate rows:", df.duplicated().sum())
df = df.drop_duplicates()

print("Rows before:", len(df_raw))
print("Rows after dropping duplicates:", len(df))

Duplicate rows: 0
Rows before: 15972
Rows after dropping duplicates: 15490


I identified 482 fully duplicated rows in the raw dataset and removed them using drop_duplicates(), leaving 15,490 unique observations.

In [43]:
import os

os.chdir("/Users/giaaxa/data analytics project/jupyter_notebooks")

print("CWD now:", os.getcwd())


FileNotFoundError: [Errno 2] No such file or directory: '/Users/giaaxa/data analytics project/jupyter_notebooks'

In [None]:
clean_path = "dataset/clean/lifestyle_wellbeing_clean.csv"
df.to_csv(clean_path, index=False)


OSError: Cannot save file into a non-existent directory: 'dataset/clean'

In [None]:
clean_path = "../dataset/clean/lifestyle_wellbeing_clean.csv"
df.to_csv(clean_path, index=False)

clean_path

OSError: Cannot save file into a non-existent directory: '../dataset/clean'