# Data Cleaning
## Libraries

In [1]:
import pandas as pd

## Date Conversion

In [2]:
df_dict = pd.read_pickle('../data/processed/pickle/fitbit_data.pkl')

In [5]:
params_dict = {
    'rr': {'columns': ['timestamp'], 'time_format': [None]},
    'vo2_max': {'columns': ['dateTime'], 'time_format': ['%m/%d/%y %H:%M:%S']},
    'ox_var': {'columns': ['timestamp'], 'time_format': ['%m/%d/%y %H:%M:%S']},
    'spo2_daily': {'columns': ['timestamp'], 'time_format': [None]},
    'spo2_intraday': {'columns': ['timestamp'], 'time_format': [None]},
    'afib_ecg': {'columns': ['reading_time'], 'time_format': [None]},
    'hr': {'columns': ['dateTime'], 'time_format': ['%m/%d/%y %H:%M:%S']},
    'hrv_summary': {'columns': ['timestamp'], 'time_format': ['%Y-%m-%dT%H:%M:%S']},
    'hrv_histogram': {'columns': ['timestamp'], 'time_format': ['%Y-%m-%dT%H:%M:%S']},
    'hrv_details': {'columns': ['timestamp'], 'time_format': ['%Y-%m-%dT%H:%M:%S']},
    'time_in_hr_zones': {'columns': ['dateTime'], 'time_format': ['%m/%d/%y %H:%M:%S']},
    'sleep_profile': {'columns': ['creation_date'], 'time_format': ['%Y-%m-%d']},
    'sleep_score': {'columns': ['timestamp'], 'time_format': ['%Y-%m-%dT%H:%M:%SZ']},
    'sleep_json': {'columns': ['dateOfSleep','startTime','endTime'], 'time_format': ['%Y-%m-%d','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S.%f']},
    'stress': {'columns': ['DATE','UPDATED_AT'], 'time_format': ['%Y-%m-%dT%H:%M:%S','%Y-%m-%dT%H:%M:%S.%f']},
    'weight': {'columns':['date','time'] ,'time_format':['%m/%d/%y','%H:%M:%S']}
}

In [6]:
def datetime_conversion(df,columns,time_format):
    for col,fmt in zip(columns,time_format):
        df[col] = pd.to_datetime(df[col],format=fmt)
        df['date'] = pd.to_datetime(df[col].dt.date)
        df.set_index('date',inplace=True)
        return df

In [7]:
for name,params in params_dict.items():
    df_dict[name] = datetime_conversion(df_dict[name],**params)

## Intraday to daily

In [8]:
df_dict['hr'] = df_dict['hr'].resample('D').mean()

  df_dict['hr'] = df_dict['hr'].resample('D').mean()


In [9]:
df_dict['ox_var'] = df_dict['ox_var'].resample('D').mean()

  df_dict['ox_var'] = df_dict['ox_var'].resample('D').mean()


## Granularity
### Less than daily 
- Profile
- Sleep Profile
### Daily
- Respiration rate
- Vo2 Max
- Spo2 daily
- Afib ECG
- HRV summary
- HRV histogram
- Time in HR Zones
- Sleep score
- Sleep json
- Stress
- Weight
### Intraday
- Oxygen variation
- Spo2 intraday
- Heart rate
- HRV details

## Joining

In [10]:
daily_dfs = ['rr','hr','ox_var','vo2_max','spo2_daily','afib_ecg','hrv_summary','hrv_histogram','time_in_hr_zones','sleep_score','sleep_json','stress','weight']

In [11]:
de_duplicated_dfs = [] 
for name in daily_dfs:
    df = df_dict[name].copy()
    df = df[~df.index.duplicated(keep='first')]
    df = df.add_prefix((name+'.'))
    de_duplicated_dfs.append(df)

In [12]:
df_final = pd.concat(de_duplicated_dfs,axis=1, join='outer')

In [13]:
df_final

Unnamed: 0_level_0,rr.timestamp,rr.full_sleep_breathing_rate,rr.full_sleep_standard_deviation,rr.full_sleep_signal_to_noise,rr.deep_sleep_breathing_rate,rr.deep_sleep_standard_deviation,rr.deep_sleep_signal_to_noise,rr.light_sleep_breathing_rate,rr.light_sleep_standard_deviation,rr.light_sleep_signal_to_noise,...,stress.EXERTION_POINTS,stress.MAX_EXERTION_POINTS,stress.STATUS,stress.CALCULATION_FAILED,weight.logId,weight.weight,weight.bmi,weight.time,weight.source,weight.fat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-12,NaT,,,,,,,,,,...,,,,,1.642032e+12,197.7,23.35,23:59:59,API,
2022-01-13,2022-01-13 06:43:00,14.0,0.9,9.564,14.0,0.9,9.564,12.6,1.7,14.518,...,,,,,1.642118e+12,197.9,23.38,23:59:59,API,20.0
2022-01-14,2022-01-14 06:07:00,13.4,1.1,9.308,13.4,1.1,9.308,12.4,1.1,19.405,...,0.0,0.0,READY_NOT_PREMIUM,False,,,,,,
2022-01-15,2022-01-15 08:59:00,13.8,0.9,9.985,13.8,0.9,9.985,12.6,1.2,12.030,...,0.0,0.0,READY_NOT_PREMIUM,False,,,,,,
2022-01-16,2022-01-16 08:57:00,14.8,1.0,11.748,14.8,1.0,11.748,12.4,1.6,11.046,...,0.0,0.0,READY_NOT_PREMIUM,False,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-11,2024-03-11 06:24:30,13.6,1.3,10.727,13.6,1.3,10.727,13.0,1.2,12.247,...,26.0,40.0,READY,False,,,,,,
2024-03-12,2024-03-12 09:01:30,13.0,0.9,11.367,13.0,0.9,11.367,11.4,0.8,9.491,...,26.0,40.0,READY,False,,,,,,
2024-03-13,2024-03-13 08:24:30,12.8,0.8,6.631,12.8,0.8,6.631,11.4,1.1,7.617,...,21.0,40.0,READY,False,,,,,,
2024-03-14,2024-03-14 08:27:30,14.0,1.2,10.598,14.0,1.2,10.598,11.0,1.1,8.234,...,31.0,40.0,READY,False,,,,,,


In [14]:
df_final.to_pickle('../data/processed/pickle/fitbit_data_final.pkl')