##### This notebook collects all the sema survey data from MongoDB, merges it with the Fitbit dataframe, and saves the combined dataframe.

In [126]:
import os
import warnings
import datetime
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from functions import data_loading
from functions import surveys_scoring
from dataprep.eda import create_report

# Silence every warning for the rest of the session so the cell outputs stay clean.
# NOTE(review): this also hides pandas deprecation/FutureWarnings raised by the cells
# below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

Connect securely to the database

In [127]:
# Read the MongoDB credentials from the local env file so they never appear in the notebook.
load_dotenv("../config.env")
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")
if MONGO_USER is None or MONGO_PASSWORD is None:
    # Fail early with a readable message instead of an opaque TypeError further down.
    raise RuntimeError("MONGO_USER and MONGO_PASSWORD must be set (expected in ../config.env)")

# Hand the credentials to the client as keyword arguments instead of splicing them
# into the URI: inside a URI, characters such as '@', ':' or '/' must be
# percent-escaped, while keyword arguments take the raw (unescaped) values.
client = MongoClient("localhost", 27017, username=MONGO_USER, password=MONGO_PASSWORD)
db = client.rais

Load fitbit data

In [128]:
# Load the Fitbit dataframe from its pickle file.
# NOTE(review): unpickling executes arbitrary code -- only load files produced by this project.
fitbit_pickle_path = '../data/temp_fitbit.pkl'
fitbit_df = pd.read_pickle(fitbit_pickle_path)
fitbit_df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,very_active_minutes,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value,gender,age,bmi
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,FEMALE,<30,>=25
1,621e2ff067b776a2403eb737,2021-11-18,0,,,SKIN,35.025730,,,,...,0.0,1303.0,119.0,0.0,0.0,,,FEMALE,<30,>=25
2,621e2ff067b776a2403eb737,2021-11-18,21,,,SKIN,34.866951,,,,...,,,,,,,,FEMALE,<30,>=25
3,621e2ff067b776a2403eb737,2021-11-20,0,,,SKIN,35.349583,,,,...,0.0,1400.0,36.0,0.0,0.0,,,FEMALE,<30,>=25
4,621e2ff067b776a2403eb737,2021-11-20,23,,,SKIN,34.495486,,,,...,,,,,,,,FEMALE,<30,>=25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164490,621e36bb67b776a240b40d64,2022-01-18,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164491,621e36bb67b776a240b40d64,2022-01-19,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164492,621e36bb67b776a240b40d64,2022-01-20,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0
164493,621e36bb67b776a240b40d64,2022-01-21,0,,,,,,,,...,,,,,,,,FEMALE,<30,24.0


#### Sema data

Find all users who provided their survey data

In [129]:
# Distinct user ids present in the sema collection; the cells below query once per user.
users = db.sema.distinct("user_id")
respondent_count = len(users)
print(respondent_count, "users provided their survey data")

63 users provided their survey data


Information about the manually declared steps goal from sema

In [130]:
# Read the step-goal answers user by user and concatenate them ONCE at the end
# (concatenating inside the loop re-copies the accumulated frame on every iteration).
# The empty seed frame keeps the expected columns even when no answer is found.
goal_frames = [pd.DataFrame(columns=["user_id", "data"])]
for user in users:
    goal_cursor = db.sema.find(
        {"$and": [
            {"data.STEPS": {"$ne": "<no-response>"}},
            # "$ne" on a string also matches documents where the field is missing or
            # null; exclude those explicitly, as the mood/place query already does.
            {"data.STEPS": {"$ne": None}},
            {"user_id": user},
        ]},
        {"data.STEPS": 1, "id": 1, "user_id": 1, "data.STARTED_TS": 1, "_id": 0},
    )
    goal_frames.append(pd.DataFrame(list(goal_cursor)))
sema_goals = pd.concat(goal_frames, axis=0, ignore_index=True)

# split data column (json format) into two columns (df format)
sema_goals["date"] = sema_goals["data"].apply(lambda d: d["STARTED_TS"])
sema_goals["step_goal"] = sema_goals["data"].apply(lambda d: d["STEPS"])
sema_goals = sema_goals.drop(columns=["data"])

# process the datetime object and rename the dataframe columns
# (the result is merged and sorted on id/date/hour below, so date_conversion is
# expected to provide the 'date' and 'hour' columns)
sema_goals = data_loading.date_conversion(sema_goals)
sema_goals = sema_goals.rename(columns={"user_id": "id"})

# check for duplicates
# Findings: 6 duplicates found based on (id, date) which is wrong, since this answer had to be collected max once per day
# Approach: keep the latest record. Sort chronologically first so that keep="last"
# really selects the latest answer of the day rather than the last one returned by the DB.
sema_goals = sema_goals.sort_values(by=["id", "date", "hour"])
sema_goals = sema_goals.drop_duplicates(subset=["id", "date"], keep="last")

# merge with fitbit data (outer: survey answers without a matching fitbit row are kept as extra rows)
df = fitbit_df.merge(sema_goals, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,minutes_below_zone_1,minutes_in_zone_1,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value,gender,age,bmi,step_goal
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,FEMALE,<30,>=25,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,SKIN,35.025730,,,,...,1303.0,119.0,0.0,0.0,,,FEMALE,<30,>=25,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,SKIN,34.866951,,,,...,,,,,,,FEMALE,<30,>=25,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,SKIN,35.349583,,,,...,1400.0,36.0,0.0,0.0,,,FEMALE,<30,>=25,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,SKIN,34.495486,,,,...,,,,,,,FEMALE,<30,>=25,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164538,621e36f967b776a240e5e7c9,2021-05-03,11,,,,,,,,...,,,,,,,,,,7999
164539,621e36f967b776a240e5e7c9,2021-05-06,11,,,,,,,,...,,,,,,,,,,9999
164540,621e36f967b776a240e5e7c9,2021-05-15,13,,,,,,,,...,,,,,,,,,,14999
164541,621e36f967b776a240e5e7c9,2021-05-16,12,,,,,,,,...,,,,,,,,,,14999


Information about the manually reported mood and place from sema

In [131]:
# The mood/place question had to be answered at most three times per day.
MAX_DAILY_MOOD_RESPONSES = 3

# Read the mood/place answers user by user and concatenate them ONCE at the end
# (concatenating inside the loop re-copies the accumulated frame on every iteration).
# The empty seed frame keeps the expected columns even when no answer is found.
mood_frames = [pd.DataFrame(columns=["user_id", "data"])]
for user in users:
    mood_cursor = db.sema.find(
        {"$or": [
            {"$and": [{"data.MOOD": {"$ne": "<no-response>"}},
                      {"data.MOOD": {"$ne": None}},
                      {"user_id": user}]},
            {"$and": [{"data.PLACE": {"$ne": "<no-response>"}},
                      {"data.PLACE": {"$ne": None}},
                      {"user_id": user}]},
        ]},
        {"data.MOOD": 1, "data.PLACE": 1, "id": 1, "_id": 0, "user_id": 1, "data.STARTED_TS": 1},
    )
    mood_frames.append(pd.DataFrame(list(mood_cursor)))
sema_mood = pd.concat(mood_frames, axis=0, ignore_index=True)

# split data column (json format) into separate columns (df format)
# The query accepts a document when EITHER mood or place is answered, so the other
# field may be absent from the projected document: .get() yields None instead of
# raising KeyError in that case.
sema_mood["date"] = sema_mood["data"].apply(lambda d: d["STARTED_TS"])
sema_mood["mood"] = sema_mood["data"].apply(lambda d: d.get("MOOD"))
sema_mood["place"] = sema_mood["data"].apply(lambda d: d.get("PLACE"))
sema_mood = sema_mood.drop(columns=["data"])

# process the datetime object and rename the dataframe columns
sema_mood = data_loading.date_conversion(sema_mood)
sema_mood = sema_mood.rename(columns={"user_id": "id"})

# check for duplicates
# Findings: 2 duplicates found based on all the features of the dataframe (aka exact replicates)
# Approach: keep a single copy of each exact replicate
sema_mood = sema_mood.drop_duplicates(keep="last")

# check for over-reporting
# Findings: 14 (id, date) pairs with 4 answers, which is wrong, since this answer had to be collected max thrice per day
# Approach: keep the latest MAX_DAILY_MOOD_RESPONSES answers of each day. After the
# chronological sort, groupby().tail() keeps the last rows of every (id, date) group
# in one vectorised step; days with 4 answers lose their earliest one (as before),
# and days with even more answers are now trimmed as well.
sema_mood = sema_mood.sort_values(by=["id", "date", "hour"])
mood_place = sema_mood.groupby(["id", "date"]).tail(MAX_DAILY_MOOD_RESPONSES)

# merge with fitbit data
# NOTE: this rebinds `df`; re-running this cell without first re-running the
# step-goal cell merges the mood/place columns a second time.
df = df.merge(mood_place, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,type,nightly_temperature,nremhr,spo2,rmssd,...,minutes_in_zone_2,minutes_in_zone_3,water_amount,mood_value,gender,age,bmi,step_goal,mood,place
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,FEMALE,<30,>=25,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,SKIN,35.025730,,,,...,0.0,0.0,,,FEMALE,<30,>=25,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,SKIN,34.866951,,,,...,,,,,FEMALE,<30,>=25,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,SKIN,35.349583,,,,...,0.0,0.0,,,FEMALE,<30,>=25,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,SKIN,34.495486,,,,...,,,,,FEMALE,<30,>=25,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164615,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,TIRED,WORK/SCHOOL
164616,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,RESTED/RELAXED,HOME
164617,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,HAPPY,TRANSIT
164618,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,RESTED/RELAXED,HOME


In [132]:
# Persist the merged fitbit + sema dataframe.
merged_pickle_path = '../data/fitbit_semas.pkl'
df.to_pickle(merged_pickle_path)