##### This notebook collects all the data from MongoDB and creates the corresponding dataframe.

In [1]:
import os
import warnings
import datetime
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from functions import data_loading

warnings.filterwarnings("ignore")

Connect securely to the database

In [2]:
load_dotenv("config.env")
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")
client = MongoClient("mongodb://" + MONGO_USER + ":" + MONGO_PASSWORD + "@localhost:27017/")
db = client.rais

#### Fitbit data

Find all users provided their Fitbit data

In [3]:
users = db.fitbit.distinct('id')
print(len(users), "users provided their Fitbit data")

71 users provided their Fitbit data


Find all the data types

In [4]:
types = db.fitbit.distinct('type')
types

['Afib ECG Readings',
 'Computed Temperature',
 'Daily Heart Rate Variability Summary',
 'Daily SpO2',
 'Device Temperature',
 'Heart Rate Variability Details',
 'Heart Rate Variability Histogram',
 'Profile',
 'Respiratory Rate Summary',
 'Stress Score',
 'Wrist Temperature',
 'altitude',
 'badge',
 'calories',
 'demographic_vo2_max',
 'distance',
 'estimated_oxygen_variation',
 'exercise',
 'heart_rate',
 'journal_entries',
 'lightly_active_minutes',
 'mindfulness_eda_data_sessions',
 'mindfulness_goals',
 'mindfulness_sessions',
 'moderately_active_minutes',
 'resting_heart_rate',
 'sedentary_minutes',
 'sleep',
 'steps',
 'time_in_heart_rate_zones',
 'very_active_minutes',
 'water_logs']

##### Afib ECG Readings

In [5]:
ecg = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Afib ECG Readings"},
            {"id": user}]},
            {"id": 1, "data.reading_time": 1, "data.result_classification": 1, "data.heart_rate_alert": 1, "_id": 0})))
    ecg = pd.concat([ecg, user_data], axis=0)

# split data column (json format) into two columns (df format)
ecg["date"] = ecg["data"].apply(lambda d: d["reading_time"])
ecg["ecg"] = ecg["data"].apply(lambda d: d["result_classification"])
ecg["heart_rate_alert"] = ecg["data"].apply(lambda d: d["heart_rate_alert"])
ecg.drop(["data"], inplace=True, axis=1)

# process the datetime object, group and aggregate the data
ecg = data_loading.date_conversion(ecg)
ecg = data_loading.aggregate_column(ecg, list(ecg.columns))

# merge with the final dataframe
df = ecg
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE
...,...,...,...,...,...
65,621e351a67b776a240f6204b,2021-07-22,9,NSR,NONE
66,621e351a67b776a240f6204b,2021-08-04,1,NSR,NONE
67,621e351a67b776a240f6204b,2021-08-10,10,NSR,NONE
68,621e36dd67b776a240ce9a45,2021-05-24,13,NSR,NONE


##### Computed Temperature

In [8]:
nightly_temperature = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Computed Temperature"},
            {"id": user}]},
            {"id": 1, "data.sleep_start": 1, "data.type": 1, "data.nightly_temperature": 1, "_id": 0})))
    nightly_temperature = pd.concat([nightly_temperature, user_data], axis=0)

# split data column (json format) into two columns (df format)
nightly_temperature["date"] = nightly_temperature["data"].apply(lambda d: d["sleep_start"])
nightly_temperature["type"] = nightly_temperature["data"].apply(lambda d: d["type"])
nightly_temperature["nightly_temperature"] = nightly_temperature["data"].apply(lambda d: d["nightly_temperature"])
nightly_temperature.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
nightly_temperature = data_loading.date_conversion(nightly_temperature)
nightly_temperature['nightly_temperature'] = pd.to_numeric(nightly_temperature['nightly_temperature'])
nightly_temperature = data_loading.aggregate_column(nightly_temperature, list(nightly_temperature.columns))

# merge with the final dataframe
df = df.merge(nightly_temperature, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,
...,...,...,...,...,...,...,...
3487,621e375b67b776a240290cdc,2021-07-24,0,,,SKIN,33.687826
3488,621e375b67b776a240290cdc,2021-07-25,1,,,SKIN,34.112386
3489,621e375b67b776a240290cdc,2021-07-26,0,,,SKIN,33.895137
3490,621e375b67b776a240290cdc,2021-07-27,0,,,SKIN,33.758319


##### Daily Heart Rate Variability Summary

In [10]:
nremhr = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Daily Heart Rate Variability Summary"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.nremhr": 1, "_id": 0})))
    nremhr = pd.concat([nremhr, user_data], axis=0)

# split data column (json format) into two columns (df format)
nremhr["date"] = nremhr["data"].apply(lambda d: d["timestamp"])
nremhr["nremhr"] = nremhr["data"].apply(lambda d: d["nremhr"])
nremhr.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
nremhr = data_loading.date_conversion(nremhr)
nremhr['nremhr'] = pd.to_numeric(nremhr['nremhr'])
nremhr = data_loading.aggregate_column(nremhr, list(nremhr.columns))

# merge with the final dataframe
df = df.merge(nremhr, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,
...,...,...,...,...,...,...,...,...
5424,621e375b67b776a240290cdc,2021-07-10,0,,,,,66.633
5425,621e375b67b776a240290cdc,2021-07-13,0,,,,,70.155
5426,621e375b67b776a240290cdc,2021-07-19,0,,,,,57.362
5427,621e375b67b776a240290cdc,2021-07-20,0,,,,,61.824


##### Daily SpO2

In [12]:
spo2 = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Daily SpO2"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.average_value": 1, "_id": 0})))
    spo2 = pd.concat([spo2, user_data], axis=0)

# split data column (json format) into two columns (df format)
spo2["date"] = spo2["data"].apply(lambda d: d["timestamp"])
spo2["spo2"] = spo2["data"].apply(lambda d: d["average_value"])
spo2.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
spo2 = data_loading.date_conversion(spo2)
spo2['spo2'] = pd.to_numeric(spo2['spo2'])
spo2 = data_loading.aggregate_column(spo2, list(spo2.columns))

# merge with the final dataframe
df = df.merge(spo2, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,
...,...,...,...,...,...,...,...,...,...
5558,621e351a67b776a240f6204b,2021-09-14,0,,,,,,96.9
5559,621e362467b776a2404ad513,2021-07-21,0,,,,,,95.4
5560,621e362467b776a2404ad513,2021-07-22,0,,,,,,98.5
5561,621e362467b776a2404ad513,2021-07-25,0,,,,,,96.9


##### Heart Rate Variability Details

In [14]:
rmssd = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Heart Rate Variability Details"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.rmssd": 1, "_id": 0})))
    rmssd = pd.concat([rmssd , user_data], axis=0)

# split data column (json format) into two columns (df format)
rmssd["date"] = rmssd["data"].apply(lambda d: d["timestamp"])
rmssd["rmssd"] = rmssd["data"].apply(lambda d: d["rmssd"])
rmssd.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
rmssd = data_loading.date_conversion(rmssd)
rmssd['rmssd'] = pd.to_numeric(rmssd['rmssd'])
rmssd = data_loading.aggregate_column(rmssd, list(rmssd.columns))

# merge with the final dataframe
df = df.merge(rmssd, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,
...,...,...,...,...,...,...,...,...,...,...
23384,621e375b67b776a240290cdc,2021-07-29,4,,,,,,,20.740667
23385,621e375b67b776a240290cdc,2021-07-29,5,,,,,,,18.778083
23386,621e375b67b776a240290cdc,2021-07-29,6,,,,,,,19.412083
23387,621e375b67b776a240290cdc,2021-07-29,7,,,,,,,21.974833


##### Respiratory Rate Summary

In [16]:
respiratory_rate = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Respiratory Rate Summary"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.full_sleep_breathing_rate": 1, "data.deep_sleep_breathing_rate": 1, "data.light_sleep_breathing_rate": 1, "data.rem_sleep_breathing_rate": 1, "_id": 0})))
    respiratory_rate = pd.concat([respiratory_rate , user_data], axis=0)

# split data column (json format) into two columns (df format)
respiratory_rate["date"] = respiratory_rate["data"].apply(lambda d: d["timestamp"])
respiratory_rate["full_sleep_breathing_rate"] = respiratory_rate["data"].apply(lambda d: d["full_sleep_breathing_rate"])
respiratory_rate["deep_sleep_breathing_rate"] = respiratory_rate["data"].apply(lambda d: d["deep_sleep_breathing_rate"])
respiratory_rate["light_sleep_breathing_rate"] = respiratory_rate["data"].apply(lambda d: d["light_sleep_breathing_rate"])
respiratory_rate["rem_sleep_breathing_rate"] = respiratory_rate["data"].apply(lambda d: d["rem_sleep_breathing_rate"])
respiratory_rate.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
respiratory_rate = data_loading.date_conversion(respiratory_rate)
respiratory_rate['full_sleep_breathing_rate'] = pd.to_numeric(respiratory_rate['full_sleep_breathing_rate'])
respiratory_rate['deep_sleep_breathing_rate'] = pd.to_numeric(respiratory_rate['deep_sleep_breathing_rate'])
respiratory_rate['light_sleep_breathing_rate'] = pd.to_numeric(respiratory_rate['light_sleep_breathing_rate'])
respiratory_rate['rem_sleep_breathing_rate'] = pd.to_numeric(respiratory_rate['rem_sleep_breathing_rate'])
respiratory_rate = data_loading.aggregate_column(respiratory_rate, list(respiratory_rate.columns))

# merge with the final dataframe
df = df.merge(respiratory_rate, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,full_sleep_breathing_rate,deep_sleep_breathing_rate,light_sleep_breathing_rate,rem_sleep_breathing_rate
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23698,621e375b67b776a240290cdc,2021-06-15,9,,,,,,,,18.4,17.6,,-1.0
23699,621e375b67b776a240290cdc,2021-06-19,9,,,,,,,,16.4,17.0,,-1.0
23700,621e375b67b776a240290cdc,2021-06-25,8,,,,,,,,17.6,17.0,,8.8
23701,621e375b67b776a240290cdc,2021-07-20,7,,,,,,,,16.0,16.0,,-1.0


##### Stress Score

In [18]:
stress = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Stress Score"},
            {"id": user}]},
            {"id": 1, "data.DATE": 1, "data.STRESS_SCORE": 1, "data.SLEEP_POINTS": 1, "data.RESPONSIVENESS_POINTS": 1, "data.EXERTION_POINTS": 1, "_id": 0})))
    stress = pd.concat([stress , user_data], axis=0)

# split data column (json format) into two columns (df format)
stress["date"] = stress["data"].apply(lambda d: d["DATE"])
stress["stress_score"] = stress["data"].apply(lambda d: d["STRESS_SCORE"])
stress["sleep_points"] = stress["data"].apply(lambda d: d["SLEEP_POINTS"])
stress["responsiveness_points"] = stress["data"].apply(lambda d: d["RESPONSIVENESS_POINTS"])
stress["exertion_points"] = stress["data"].apply(lambda d: d["EXERTION_POINTS"])
stress.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
stress = data_loading.date_conversion(stress)
stress['stress_score'] = pd.to_numeric(stress['stress_score'])
stress['sleep_points'] = pd.to_numeric(stress['sleep_points'])
stress['responsiveness_points'] = pd.to_numeric(stress['responsiveness_points'])
stress['exertion_points'] = pd.to_numeric(stress['exertion_points'])
stress = data_loading.aggregate_column(stress, list(stress.columns))

# merge with the final dataframe
df = df.merge(stress, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,full_sleep_breathing_rate,deep_sleep_breathing_rate,light_sleep_breathing_rate,rem_sleep_breathing_rate,stress_score,sleep_points,responsiveness_points,exertion_points
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23898,621e375b67b776a240290cdc,2021-06-26,0,,,,,,,,,,,,80.0,22.0,19.0,39.0
23899,621e375b67b776a240290cdc,2021-07-11,0,,,,,,,,,,,,0.0,0.0,0.0,0.0
23900,621e375b67b776a240290cdc,2021-07-18,0,,,,,,,,,,,,0.0,0.0,0.0,0.0
23901,621e375b67b776a240290cdc,2021-07-22,0,,,,,,,,,,,,0.0,0.0,0.0,0.0


##### Wrist Temperature

In [20]:
wrist_temperature = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Wrist Temperature"},
            {"id": user}]},
            {"id": 1, "data.recorded_time": 1, "data.temperature": 1, "_id": 0})))
    wrist_temperature = pd.concat([wrist_temperature , user_data], axis=0)

# split data column (json format) into two columns (df format)
wrist_temperature["date"] = wrist_temperature["data"].apply(lambda d: d["recorded_time"])
wrist_temperature["wrist_temperature"] = wrist_temperature["data"].apply(lambda d: d["temperature"])
wrist_temperature.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
wrist_temperature = data_loading.date_conversion(wrist_temperature)
wrist_temperature['wrist_temperature'] = pd.to_numeric(wrist_temperature['wrist_temperature'])
wrist_temperature = data_loading.aggregate_column(wrist_temperature, list(wrist_temperature.columns))

# merge with the final dataframe
df = df.merge(wrist_temperature, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,full_sleep_breathing_rate,deep_sleep_breathing_rate,light_sleep_breathing_rate,rem_sleep_breathing_rate,stress_score,sleep_points,responsiveness_points,exertion_points,wrist_temperature
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,,,,,,,,,-4.726633
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,,,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,,,,,,,,,-3.497025
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,,,,,,,,,0.120849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76169,621e375b67b776a240290cdc,2021-07-29,19,,,,,,,,,,,,,,,,-1.698178
76170,621e375b67b776a240290cdc,2021-07-29,20,,,,,,,,,,,,,,,,-3.432095
76171,621e375b67b776a240290cdc,2021-07-29,21,,,,,,,,,,,,,,,,-2.635845
76172,621e375b67b776a240290cdc,2021-07-29,22,,,,,,,,,,,,,,,,-1.439095


##### altitude

In [22]:
altitude = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "altitude"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    altitude = pd.concat([altitude , user_data], axis=0)

# split data column (json format) into two columns (df format)
altitude["date"] = altitude["data"].apply(lambda d: d["dateTime"])
altitude["altitude"] = altitude["data"].apply(lambda d: d["value"])
altitude.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
altitude = data_loading.date_conversion(altitude)
altitude['altitude'] = pd.to_numeric(altitude['altitude'])
altitude = data_loading.aggregate_column(altitude, list(altitude.columns))

# merge with the final dataframe
df = df.merge(altitude, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,full_sleep_breathing_rate,deep_sleep_breathing_rate,light_sleep_breathing_rate,rem_sleep_breathing_rate,stress_score,sleep_points,responsiveness_points,exertion_points,wrist_temperature,altitude
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,,,,,,,,,-4.726633,10.0
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,,,,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,,,,,,,,,-3.497025,10.0
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,,,,,,,,,0.120849,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83631,621e375b67b776a240290cdc,2021-08-01,18,,,,,,,,,,,,,,,,,10.0
83632,621e375b67b776a240290cdc,2021-08-01,19,,,,,,,,,,,,,,,,,10.0
83633,621e375b67b776a240290cdc,2021-08-01,21,,,,,,,,,,,,,,,,,10.0
83634,621e375b67b776a240290cdc,2021-08-01,22,,,,,,,,,,,,,,,,,10.0


##### badge

In [24]:
badge = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "badge"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.badgeType": 1, "data.value": 1, "_id": 0})))
    badge = pd.concat([badge , user_data], axis=0)

# split data column (json format) into two columns (df format)
badge["date"] = badge["data"].apply(lambda d: d["dateTime"])
badge["badge_type"] = badge["data"].apply(lambda d: d["badgeType"])
badge["badge_value"] = badge["data"].apply(lambda d: d["value"])
badge.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
badge = data_loading.date_conversion(badge)
badge['badge_value'] = pd.to_numeric(badge['badge_value'])
badge = data_loading.aggregate_column(badge, list(badge.columns))

# merge with the final dataframe
df = df.merge(badge, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,light_sleep_breathing_rate,rem_sleep_breathing_rate,stress_score,sleep_points,responsiveness_points,exertion_points,wrist_temperature,altitude,badge_type,badge_value
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,-4.726633,10.0,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,,,-3.497025,10.0,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,,,,0.120849,10.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83761,621e375367b776a24021e950,2022-01-13,0,,,,,,,,...,,,,,,,,,DAILY_STEPS,5000.0
83762,621e375367b776a24021e950,2022-01-18,0,,,,,,,,...,,,,,,,,,LIFETIME_FLOORS,1000.0
83763,621e375367b776a24021e950,2022-01-19,0,,,,,,,,...,,,,,,,,,DAILY_FLOORS,25.0
83764,621e375367b776a24021e950,2022-01-20,0,,,,,,,,...,,,,,,,,,DAILY_FLOORS,10.0


##### calories

In [26]:
calories = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "calories"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    calories = pd.concat([calories , user_data], axis=0)

# split data column (json format) into two columns (df format)
calories["date"] = calories["data"].apply(lambda d: d["dateTime"])
calories["calories"] = calories["data"].apply(lambda d: d["value"])
calories.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
calories = data_loading.date_conversion(calories)
calories['calories'] = pd.to_numeric(calories['calories'])
calories = data_loading.aggregate_column(calories, list(calories.columns))

# merge with the final dataframe
df = df.merge(calories, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,rem_sleep_breathing_rate,stress_score,sleep_points,responsiveness_points,exertion_points,wrist_temperature,altitude,badge_type,badge_value,calories
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,-4.726633,10.0,,,1.641667
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,,1.616667
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,,,,,,1.290000
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,,-3.497025,10.0,,,4.222833
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,,,0.120849,10.0,,,0.977667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,,,,,,...,,,,,,,,,,0.900000
159015,621e375b67b776a240290cdc,2021-08-17,8,,,,,,,,...,,,,,,,,,,0.900000
159016,621e375b67b776a240290cdc,2021-08-17,9,,,,,,,,...,,,,,,,,,,0.900000
159017,621e375b67b776a240290cdc,2021-08-17,10,,,,,,,,...,,,,,,,,,,0.900000


In [27]:
df.to_pickle("data/temp_after_calories.pkl")

In [5]:
df = pd.read_pickle("data/temp_after_calories.pkl")

##### demographic_vo2_max

In [8]:
vo2max = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "demographic_vo2_max"},
            {"id": user}]},
            {"id": 1, "data.value.filteredDemographicVO2Max": 1, "data.dateTime": 1, "_id": 0})))
    vo2max = pd.concat([vo2max , user_data], axis=0)

# split data column (json format) into two columns (df format)
vo2max["date"] = vo2max["data"].apply(lambda d: d["dateTime"])
vo2max["vo2max"] = vo2max["data"].apply(lambda d: d["value"].get("filteredDemographicVO2Max"))
vo2max.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
vo2max = data_loading.date_conversion(vo2max)
vo2max['vo2max'] = pd.to_numeric(vo2max['vo2max'])
vo2max = data_loading.aggregate_column(vo2max, list(vo2max.columns))

# merge with the final dataframe
df = df.merge(vo2max, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,stress_score,sleep_points,responsiveness_points,exertion_points,wrist_temperature,altitude,badge_type,badge_value,calories,vo2max
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,-4.726633,10.0,,,1.641667,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,1.616667,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,,,,,1.290000,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,-3.497025,10.0,,,4.222833,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,,0.120849,10.0,,,0.977667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,,,,,,...,,,,,,,,,0.900000,
159015,621e375b67b776a240290cdc,2021-08-17,8,,,,,,,,...,,,,,,,,,0.900000,
159016,621e375b67b776a240290cdc,2021-08-17,9,,,,,,,,...,,,,,,,,,0.900000,
159017,621e375b67b776a240290cdc,2021-08-17,10,,,,,,,,...,,,,,,,,,0.900000,


##### distance

In [9]:
distance = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "distance"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    distance = pd.concat([distance , user_data], axis=0)

# split data column (json format) into two columns (df format)
distance["date"] = distance["data"].apply(lambda d: d["dateTime"])
distance["distance"] = distance["data"].apply(lambda d: d["value"])
distance.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
distance = data_loading.date_conversion(distance)
distance['distance'] = pd.to_numeric(distance['distance'])
distance = data_loading.aggregate_column(distance, list(distance.columns))

# merge with the final dataframe
df = df.merge(distance, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,sleep_points,responsiveness_points,exertion_points,wrist_temperature,altitude,badge_type,badge_value,calories,vo2max,distance
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,-4.726633,10.0,,,1.641667,,0.0
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,1.616667,,0.0
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,,,,1.290000,,0.0
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,-3.497025,10.0,,,4.222833,,0.0
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,0.120849,10.0,,,0.977667,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,,,,,,...,,,,,,,,0.900000,,
159015,621e375b67b776a240290cdc,2021-08-17,8,,,,,,,,...,,,,,,,,0.900000,,
159016,621e375b67b776a240290cdc,2021-08-17,9,,,,,,,,...,,,,,,,,0.900000,,
159017,621e375b67b776a240290cdc,2021-08-17,10,,,,,,,,...,,,,,,,,0.900000,,


##### estimated_oxygen_variation

In [11]:
oxygen_var = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "estimated_oxygen_variation"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.Infrared to Red Signal Ratio": 1, "_id": 0})))
    oxygen_var = pd.concat([oxygen_var , user_data], axis=0)

# split data column (json format) into two columns (df format)
oxygen_var["date"] = oxygen_var["data"].apply(lambda d: d["timestamp"])
oxygen_var["oxygen_variation"] = oxygen_var["data"].apply(lambda d: d["Infrared to Red Signal Ratio"])
oxygen_var.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
oxygen_var = data_loading.date_conversion(oxygen_var)
oxygen_var['oxygen_variation'] = pd.to_numeric(oxygen_var['oxygen_variation'])
oxygen_var = data_loading.aggregate_column(oxygen_var, list(oxygen_var.columns))

# merge with the final dataframe
df = df.merge(oxygen_var, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,exertion_points,wrist_temperature,altitude,badge_type,badge_value,calories,vo2max,distance,oxygen_variation_x,oxygen_variation_y
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,-4.726633,10.0,,,1.641667,,0.0,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,1.616667,,0.0,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,,1.290000,,0.0,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,-3.497025,10.0,,,4.222833,,0.0,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,0.120849,10.0,,,0.977667,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,,,,,,...,,,,,,0.900000,,,,
159015,621e375b67b776a240290cdc,2021-08-17,8,,,,,,,,...,,,,,,0.900000,,,,
159016,621e375b67b776a240290cdc,2021-08-17,9,,,,,,,,...,,,,,,0.900000,,,,
159017,621e375b67b776a240290cdc,2021-08-17,10,,,,,,,,...,,,,,,0.900000,,,,


##### heart_rate

In [6]:
# bpm = pd.DataFrame(columns=["id", "data"])
#
# # read and load from MongoDB
# for user in users:
#     user_data = pd.DataFrame(list(
#         db.fitbit.find({"$and": [
#             {"type": "heart_rate"},
#             {"id": user}]},
#             {"id": 1, "data.dateTime": 1, "data.value.bpm": 1, "_id": 0})))
#     bpm = pd.concat([bpm , user_data], axis=0)
#
# # split data column (json format) into two columns (df format)
# bpm["date"] = bpm["data"].apply(lambda d: d["dateTime"])
# bpm["bpm"] = bpm["data"].apply(lambda d: d["value"].get("bpm"))
# bpm.drop(["data"], inplace=True, axis=1)
#
# # process the datetime object, feature types and group and aggregate the data
# bpm = data_loading.date_conversion(bpm)
# bpm['bpm'] = pd.to_numeric(bpm['bpm'])
# bpm = data_loading.aggregate_column(bpm, list(bpm.columns), False)
#
# # merge with the final dataframe
# df = df.merge(bpm, how='outer', on=['id', 'date', 'hour'])
# df

TypeError: cannot concatenate object of type '<class 'pymongo.cursor.Cursor'>'; only Series and DataFrame objs are valid

In [None]:
# df.to_pickle("data/temp_after_heart_rate.pkl")

In [None]:
# df = pd.read_pickle("data/temp_after_heart_rate.pkl")

##### lightly_active_minutes

In [13]:
light_minutes = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "lightly_active_minutes"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    light_minutes = pd.concat([light_minutes , user_data], axis=0)

# split data column (json format) into two columns (df format)
light_minutes["date"] = light_minutes["data"].apply(lambda d: d["dateTime"])
light_minutes["lightly_active_minutes"] = light_minutes["data"].apply(lambda d: d["value"])
light_minutes.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
light_minutes = data_loading.date_conversion(light_minutes)
light_minutes['lightly_active_minutes'] = pd.to_numeric(light_minutes['lightly_active_minutes'])
light_minutes = data_loading.aggregate_column(light_minutes, list(light_minutes.columns))

# merge with the final dataframe
df = df.merge(light_minutes, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,wrist_temperature,altitude,badge_type,badge_value,calories,vo2max,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,-4.726633,10.0,,,1.641667,,0.0,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,1.616667,,0.0,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,1.290000,,0.0,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,-3.497025,10.0,,,4.222833,,0.0,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,0.120849,10.0,,,0.977667,,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159437,621e36f967b776a240e5e7c9,2021-08-03,0,,,,,,,,...,,,,,,,,,,0.0
159438,621e36f967b776a240e5e7c9,2021-08-04,0,,,,,,,,...,,,,,,,,,,0.0
159439,621e36f967b776a240e5e7c9,2021-08-05,0,,,,,,,,...,,,,,,,,,,0.0
159440,621e36f967b776a240e5e7c9,2021-08-06,0,,,,,,,,...,,,,,,,,,,0.0


##### mindfulness_eda_data_sessions

In [15]:
eda_sessions = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "mindfulness_eda_data_sessions"},
            {"id": user}]},
            {"id": 1, "data.timestamp": 1, "data.scl_avg": 1, "_id": 0})))
    eda_sessions = pd.concat([eda_sessions , user_data], axis=0)

# split data column (json format) into two columns (df format)
eda_sessions["date"] = eda_sessions["data"].apply(lambda d: d["timestamp"])
eda_sessions["scl_avg"] = eda_sessions["data"].apply(lambda d: d["scl_avg"])
eda_sessions.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
eda_sessions = data_loading.date_conversion(eda_sessions)
eda_sessions['scl_avg'] = pd.to_numeric(eda_sessions['scl_avg'])
eda_sessions = data_loading.aggregate_column(eda_sessions, list(eda_sessions.columns))

# merge with the final dataframe
df = df.merge(eda_sessions, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,altitude,badge_type,badge_value,calories,vo2max,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,10.0,,,1.641667,,0.0,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,1.616667,,0.0,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,1.290000,,0.0,,,,9.703215
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,10.0,,,4.222833,,0.0,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,10.0,,,0.977667,,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159437,621e36f967b776a240e5e7c9,2021-08-03,0,,,,,,,,...,,,,,,,,,0.0,
159438,621e36f967b776a240e5e7c9,2021-08-04,0,,,,,,,,...,,,,,,,,,0.0,
159439,621e36f967b776a240e5e7c9,2021-08-05,0,,,,,,,,...,,,,,,,,,0.0,
159440,621e36f967b776a240e5e7c9,2021-08-06,0,,,,,,,,...,,,,,,,,,0.0,


##### mindfulness_goals

In [16]:
mindfulness_goals = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "mindfulness_goals"},
            {"id": user}]},
            {"id": 1, "data.date": 1, "data.days": 1, "_id": 0})))
    mindfulness_goals = pd.concat([mindfulness_goals , user_data], axis=0)

# split data column (json format) into two columns (df format)
mindfulness_goals["date"] = mindfulness_goals["data"].apply(lambda d: d["date"])
mindfulness_goals["mindfulness_goal"] = mindfulness_goals["data"].apply(lambda d: d["days"])
mindfulness_goals.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
mindfulness_goals = data_loading.date_conversion(mindfulness_goals)
mindfulness_goals['mindfulness_goal'] = pd.to_numeric(mindfulness_goals['mindfulness_goal'])
mindfulness_goals = data_loading.aggregate_column(mindfulness_goals, list(mindfulness_goals.columns))

# merge with the final dataframe
df = df.merge(mindfulness_goals, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,badge_type,badge_value,calories,vo2max,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,1.641667,,0.0,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,1.616667,,0.0,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,1.290000,,0.0,,,,9.703215,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,4.222833,,0.0,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,0.977667,,0.0,0.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159437,621e36f967b776a240e5e7c9,2021-08-03,0,,,,,,,,...,,,,,,,,0.0,,
159438,621e36f967b776a240e5e7c9,2021-08-04,0,,,,,,,,...,,,,,,,,0.0,,
159439,621e36f967b776a240e5e7c9,2021-08-05,0,,,,,,,,...,,,,,,,,0.0,,
159440,621e36f967b776a240e5e7c9,2021-08-06,0,,,,,,,,...,,,,,,,,0.0,,


##### mindfulness_sessions

In [17]:
mindfulness_sessions = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "mindfulness_sessions"},
            {"id": user}]},
            {"id": 1, "data.start_date_time": 1, "data.start_heart_rate": 1, "data.end_heart_rate": 1, "_id": 0})))
    mindfulness_sessions = pd.concat([mindfulness_sessions , user_data], axis=0)

# split data column (json format) into two columns (df format)
mindfulness_sessions["date"] = mindfulness_sessions["data"].apply(lambda d: d["start_date_time"])
mindfulness_sessions["mindfulness_start_heart_rate"] = mindfulness_sessions["data"].apply(lambda d: d["start_heart_rate"])
mindfulness_sessions["mindfulness_end_heart_rate"] = mindfulness_sessions["data"].apply(lambda d: d["end_heart_rate"])
mindfulness_sessions.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
mindfulness_sessions = data_loading.date_conversion(mindfulness_sessions)
mindfulness_sessions['mindfulness_start_heart_rate'] = pd.to_numeric(mindfulness_sessions['mindfulness_start_heart_rate'])
mindfulness_sessions['mindfulness_end_heart_rate'] = pd.to_numeric(mindfulness_sessions['mindfulness_end_heart_rate'])
mindfulness_sessions = data_loading.aggregate_column(mindfulness_sessions, list(mindfulness_sessions.columns))

# merge with the final dataframe
df = df.merge(mindfulness_sessions, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,calories,vo2max,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,1.641667,,0.0,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,1.616667,,0.0,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,1.290000,,0.0,,,,9.703215,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,4.222833,,0.0,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,0.977667,,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159437,621e36f967b776a240e5e7c9,2021-08-03,0,,,,,,,,...,,,,,,0.0,,,,
159438,621e36f967b776a240e5e7c9,2021-08-04,0,,,,,,,,...,,,,,,0.0,,,,
159439,621e36f967b776a240e5e7c9,2021-08-05,0,,,,,,,,...,,,,,,0.0,,,,
159440,621e36f967b776a240e5e7c9,2021-08-06,0,,,,,,,,...,,,,,,0.0,,,,


##### moderately_active_minutes

In [19]:
moderately_minutes = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "moderately_active_minutes"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    moderately_minutes = pd.concat([moderately_minutes , user_data], axis=0)

# split data column (json format) into two columns (df format)
moderately_minutes["date"] = moderately_minutes["data"].apply(lambda d: d["dateTime"])
moderately_minutes["moderately_active_minutes"] = moderately_minutes["data"].apply(lambda d: d["value"])
moderately_minutes.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
moderately_minutes = data_loading.date_conversion(moderately_minutes)
moderately_minutes["moderately_active_minutes"] = pd.to_numeric(moderately_minutes["moderately_active_minutes"])
moderately_minutes = data_loading.aggregate_column(moderately_minutes, list(moderately_minutes.columns))

# merge with the final dataframe
df = df.merge(moderately_minutes, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,vo2max,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate,moderately_active_minutes
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,0.0,,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,0.0,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,0.0,,,,9.703215,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,0.0,,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,0.0,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159437,621e36f967b776a240e5e7c9,2021-08-03,0,,,,,,,,...,,,,,0.0,,,,,0.0
159438,621e36f967b776a240e5e7c9,2021-08-04,0,,,,,,,,...,,,,,0.0,,,,,0.0
159439,621e36f967b776a240e5e7c9,2021-08-05,0,,,,,,,,...,,,,,0.0,,,,,0.0
159440,621e36f967b776a240e5e7c9,2021-08-06,0,,,,,,,,...,,,,,0.0,,,,,0.0


##### resting_heart_rate

In [23]:
resting_heart_rate = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "resting_heart_rate"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    resting_heart_rate = pd.concat([resting_heart_rate , user_data], axis=0)

# split data column (json format) into two columns (df format)
resting_heart_rate["date"] = resting_heart_rate["data"].apply(lambda d: d["dateTime"])
resting_heart_rate["resting_heart_rate"] = resting_heart_rate["data"].apply(lambda d: d["value"].get('value'))
resting_heart_rate.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
resting_heart_rate = data_loading.date_conversion(resting_heart_rate)
resting_heart_rate["resting_heart_rate"] = pd.to_numeric(resting_heart_rate["resting_heart_rate"])
resting_heart_rate = data_loading.aggregate_column(resting_heart_rate, list(resting_heart_rate.columns))

# merge with the final dataframe
df = df.merge(resting_heart_rate, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,distance,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate,moderately_active_minutes,resting_heart_rate
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,0.0,,,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,0.0,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,0.0,,,,9.703215,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,0.0,,,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,0.0,0.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,,,,,,,,,0.0
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,,,,,,,,,0.0
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,,,,,,,,,0.0
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,,,,,,,,,0.0


##### sedentary_minutes

In [25]:
sedentary_minutes = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "sedentary_minutes"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    sedentary_minutes = pd.concat([sedentary_minutes , user_data], axis=0)

# split data column (json format) into two columns (df format)
sedentary_minutes["date"] = sedentary_minutes["data"].apply(lambda d: d["dateTime"])
sedentary_minutes["sedentary_minutes"] = sedentary_minutes["data"].apply(lambda d: d["value"])
sedentary_minutes.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
sedentary_minutes = data_loading.date_conversion(sedentary_minutes)
sedentary_minutes["sedentary_minutes"] = pd.to_numeric(sedentary_minutes["sedentary_minutes"])
sedentary_minutes = data_loading.aggregate_column(sedentary_minutes, list(sedentary_minutes.columns))

# merge with the final dataframe
df = df.merge(sedentary_minutes, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,oxygen_variation_x,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate,moderately_active_minutes,resting_heart_rate,sedentary_minutes
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,9.703215,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,0.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,,,,,,,,0.0,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,,,,,,,,0.0,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,,,,,,,,0.0,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,,,,,,,,0.0,


##### steps

In [26]:
steps = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "steps"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    steps = pd.concat([steps , user_data], axis=0)

# split data column (json format) into two columns (df format)
steps["date"] = steps["data"].apply(lambda d: d["dateTime"])
steps["steps"] = steps["data"].apply(lambda d: d["value"])
steps.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
steps = data_loading.date_conversion(steps)
steps["steps"] = pd.to_numeric(steps["steps"])
steps = data_loading.aggregate_column(steps, list(steps.columns))

# merge with the final dataframe
df = df.merge(steps, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,oxygen_variation_y,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate,moderately_active_minutes,resting_heart_rate,sedentary_minutes,steps
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,,,0.0
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,,0.0
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,9.703215,,,,,,,0.0
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,,,,,,0.0
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,0.0,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,,,,,,,0.0,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,,,,,,,0.0,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,,,,,,,0.0,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,,,,,,,0.0,,


##### very_active_minutes

In [27]:
very_active_minutes = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "very_active_minutes"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    very_active_minutes = pd.concat([very_active_minutes , user_data], axis=0)

# split data column (json format) into two columns (df format)
very_active_minutes["date"] = very_active_minutes["data"].apply(lambda d: d["dateTime"])
very_active_minutes["very_active_minutes"] = very_active_minutes["data"].apply(lambda d: d["value"])
very_active_minutes.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
very_active_minutes = data_loading.date_conversion(very_active_minutes)
very_active_minutes["very_active_minutes"] = pd.to_numeric(very_active_minutes["very_active_minutes"])
very_active_minutes = data_loading.aggregate_column(very_active_minutes, list(very_active_minutes.columns))

# merge with the final dataframe
df = df.merge(very_active_minutes, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,lightly_active_minutes,scl_avg,mindfulness_goal,mindfulness_start_heart_rate,mindfulness_end_heart_rate,moderately_active_minutes,resting_heart_rate,sedentary_minutes,steps,very_active_minutes
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,,0.0,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,,,,,0.0,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,9.703215,,,,,,,0.0,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,,,,,0.0,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,,,,,,0.0,,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,,,,,,0.0,,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,,,,,,0.0,,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,,,,,,0.0,,,


##### time_in_heart_rate_zones

In [28]:
heart_rate_zones = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "time_in_heart_rate_zones"},
            {"id": user}]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0})))
    heart_rate_zones = pd.concat([heart_rate_zones , user_data], axis=0)

# split data column (json format) into two columns (df format)
heart_rate_zones["date"] = heart_rate_zones["data"].apply(lambda d: d["dateTime"])
heart_rate_zones["minutes_below_zone_1"] = heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("BELOW_DEFAULT_ZONE_1"))
heart_rate_zones["minutes_in_zone_1"] = heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_1"))
heart_rate_zones["minutes_in_zone_2"] = heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_2"))
heart_rate_zones["minutes_in_zone_3"] = heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_3"))
heart_rate_zones.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
heart_rate_zones = data_loading.date_conversion(heart_rate_zones)
heart_rate_zones["minutes_below_zone_1"] = pd.to_numeric(heart_rate_zones["minutes_below_zone_1"])
heart_rate_zones["minutes_in_zone_1"] = pd.to_numeric(heart_rate_zones["minutes_in_zone_1"])
heart_rate_zones["minutes_in_zone_2"] = pd.to_numeric(heart_rate_zones["minutes_in_zone_2"])
heart_rate_zones["minutes_in_zone_3"] = pd.to_numeric(heart_rate_zones["minutes_in_zone_3"])
heart_rate_zones = data_loading.aggregate_column(heart_rate_zones, list(heart_rate_zones.columns))

# merge with the final dataframe
df = df.merge(heart_rate_zones, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,mindfulness_end_heart_rate,moderately_active_minutes,resting_heart_rate,sedentary_minutes,steps,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,0.0,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,,0.0,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,,0.0,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,,0.0,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,,0.0,,,,,,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,,0.0,,,,,,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,,0.0,,,,,,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,,0.0,,,,,,,


##### water_logs

In [29]:
water = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "water_logs"},
            {"id": user}]},
            {"id": 1, "data.date": 1, "data.waterAmount": 1, "_id": 0})))
    water = pd.concat([water , user_data], axis=0)

# split data column (json format) into two columns (df format)
water["date"] = water["data"].apply(lambda d: d["date"])
water["water_amount"] = water["data"].apply(lambda d: d["waterAmount"])
water.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
water = data_loading.date_conversion(water)
water["water_amount"] = pd.to_numeric(water["water_amount"])
water = data_loading.aggregate_column(water, list(water.columns))

# merge with the final dataframe
df = df.merge(water, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,moderately_active_minutes,resting_heart_rate,sedentary_minutes,steps,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,0.0,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,,0.0,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,,0.0,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,,0.0,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,,0.0,,,,,,,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,,0.0,,,,,,,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,,0.0,,,,,,,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,,0.0,,,,,,,,


##### journal_entries

In [34]:
journal = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "journal_entries"},
            {"id": user}]},
            {"id": 1, "data.log_time": 1, "data.value": 1, "_id": 0})))
    journal = pd.concat([journal , user_data], axis=0)

# split data column (json format) into two columns (df format)
journal["date"] = journal["data"].apply(lambda d: d["log_time"])
journal["mood_value"] = journal["data"].apply(lambda d: d["value"])
journal.drop(["data"], inplace=True, axis=1)

# process the datetime object, feature types and group and aggregate the data
journal = data_loading.date_conversion(journal)
journal["mood_value"] = pd.to_numeric(journal["mood_value"])
journal = data_loading.aggregate_column(journal, list(journal.columns))

# merge with the final dataframe
df = df.merge(journal, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,resting_heart_rate,sedentary_minutes,steps,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,0.0,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,0.0,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,0.0,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,0.0,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,0.0,,,,,,,,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,0.0,,,,,,,,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,0.0,,,,,,,,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,0.0,,,,,,,,,


In [49]:
df.to_pickle('data/temp_before_journal.pkl')

In [4]:
df = pd.read_pickle('data/temp_before_journal.pkl')
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,resting_heart_rate,sedentary_minutes,steps,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,0.0,,,,,,,
1,621e301367b776a24057738e,2021-06-08,21,NSR,NONE,,,,,,...,,,0.0,,,,,,,
2,621e312a67b776a240164d59,2021-10-07,17,NSR,NONE,,,,,,...,,,0.0,,,,,,,
3,621e312a67b776a240164d59,2021-10-10,20,NSR,NONE,,,,,,...,,,0.0,,,,,,,
4,621e326767b776a24012e179,2021-07-22,17,UNCLASSIFIABLE,NONE,,,,,,...,,,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e375b67b776a240290cdc,2022-01-18,0,,,,,,,,...,0.0,,,,,,,,,
164491,621e375b67b776a240290cdc,2022-01-19,0,,,,,,,,...,0.0,,,,,,,,,
164492,621e375b67b776a240290cdc,2022-01-20,0,,,,,,,,...,0.0,,,,,,,,,
164493,621e375b67b776a240290cdc,2022-01-21,0,,,,,,,,...,0.0,,,,,,,,,


##### Profile

In [5]:
profile = pd.DataFrame(columns=["id", "data"])

# read and load from MongoDB
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Profile"},
            {"id": user}]},
            {"id": 1, "data.gender": 1, "data.bmi": 1, "data.age": 1, "_id": 0})))
    profile = pd.concat([profile , user_data], axis=0)

# split data column (json format) into two columns (df format)
profile["gender"] = profile["data"].apply(lambda d: d["gender"])
profile["age"] = profile["data"].apply(lambda d: d["age"] if "age" in d else np.NaN)
profile["bmi"] = profile["data"].apply(lambda d: d["bmi"] if "bmi" in d else np.NaN)
profile.drop(["data"], inplace=True, axis=1)

# merge with the final dataframe
df = df.merge(profile, how='outer', on=['id'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value,gender,age,bmi
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,FEMALE,<30,>=25
1,621e2ff067b776a2403eb737,2021-11-18,0,,,SKIN,35.025730,,,,...,0.0,1303.0,119.0,0.0,0.0,,,FEMALE,<30,>=25
2,621e2ff067b776a2403eb737,2021-11-18,21,,,SKIN,34.866951,,,,...,,,,,,,,FEMALE,<30,>=25
3,621e2ff067b776a2403eb737,2021-11-20,0,,,SKIN,35.349583,,,,...,0.0,1400.0,36.0,0.0,0.0,,,FEMALE,<30,>=25
4,621e2ff067b776a2403eb737,2021-11-20,23,,,SKIN,34.495486,,,,...,,,,,,,,FEMALE,<30,>=25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e36bb67b776a240b40d64,2022-01-18,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164491,621e36bb67b776a240b40d64,2022-01-19,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164492,621e36bb67b776a240b40d64,2022-01-20,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164493,621e36bb67b776a240b40d64,2022-01-21,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
