##### This notebook collects all the sema data from MongoDB and creates the corresponding dataframe.

In [3]:
import os
import warnings
import datetime
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from functions import daily_data_loading
from functions import surveys_scoring
from dataprep.eda import create_report

# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

Connect securely to the database

In [2]:
# Read the MongoDB credentials from the local env file instead of hardcoding them.
load_dotenv("../config.env")
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")

# Fail early with an explicit message: os.getenv returns None for an unset
# variable, which would otherwise only surface as an opaque error further down.
if not MONGO_USER or not MONGO_PASSWORD:
    raise RuntimeError("MONGO_USER and MONGO_PASSWORD must be set (expected in ../config.env)")

# Credentials are passed as keyword arguments rather than spliced into the URI:
# inside a URI they would have to be percent-escaped, so a password containing
# '@', ':', '/' or '%' would produce an invalid or mis-parsed connection string.
client = MongoClient("mongodb://localhost:27017/", username=MONGO_USER, password=MONGO_PASSWORD)
db = client.rais

Load fitbit and survey data

In [4]:
# Daily fitbit + survey frame written by an earlier step of the pipeline.
# NOTE(review): unpickling executes arbitrary code, so only load files you produced yourself.
fitbit_surveys_df = pd.read_pickle("../data/loading_final/daily_fitbit_surveys.pkl")
fitbit_surveys_df

Unnamed: 0,id,date,ecg,heart_rate_alert,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,deep_sleep_breathing_rate,...,consciousness_raising,dramatic_relief_category,environmental_reevaluation_category,self_reevaluation_category,social_liberation_category,counterconditioning_category,helping_relationships_category,reinforcement_management_category,self_liberation_category,stimulus_control_category
0,621e2ff067b776a2403eb737,2021-12-22,NSR,NONE,33.737162,,,,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,,,34.946341,,,,,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-20,,,34.922535,,,,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-22,,,34.284711,,,,,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-23,,,34.819364,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,621e375367b776a24021e950,2022-02-01,,,,,,,,,...,,,,,,,,,,
12299,621e2f1b67b776a240b3d87c,2022-04-01,,,,,,,,,...,,,,,,,,,,
12300,621e312a67b776a240164d59,2022-03-01,,,,,,,,,...,,,,,,,,,,
12301,621e310d67b776a24003096d,2021-03-12,,,,,,,,,...,2.333333,Above average,Average,Above average,Average,Above average,Average,Above average,Above average,Above average


#### Sema data

Find all users who provided their sema data

In [5]:
# Every distinct user_id that appears in the sema collection.
users = db["sema"].distinct("user_id")
print(len(users), "users provided their survey data")

63 users provided their survey data


Information about the manually declared steps goal from sema

In [6]:
# Fields fetched for every matching sema document: the user id, the declared
# step goal and the timestamp at which the survey was started.
goal_projection = {"data.STEPS": 1, "id": 1, "user_id": 1, "data.STARTED_TS": 1, '_id': 0}

# read and load
# The documents of all users are gathered in one list and the frame is built
# once; growing a frame with pd.concat inside the loop re-copies it for every user.
goal_records = []
for user in users:
    goal_records.extend(
        db.sema.find(
            {"$and": [
                {"data.STEPS": {"$ne": "<no-response>"}},
                {"user_id": user},
            ]},
            goal_projection,
        )
    )
# Explicit columns keep the expected layout even when no document matches.
sema_goals = pd.DataFrame(goal_records, columns=["user_id", "data"])

# split data column (json format) into two columns (df format)
sema_goals = sema_goals.assign(
    date=sema_goals["data"].apply(lambda d: d["STARTED_TS"]),
    step_goal=sema_goals["data"].apply(lambda d: d["STEPS"]),
).drop(columns=["data"])

# process the datetime object and rename the dataframe columns
sema_goals = daily_data_loading.date_conversion(sema_goals).rename(columns={"user_id": "id"})

# check for duplicates
# Findings: 6 duplicates found based on (id, date), which is wrong since this
#           answer had to be collected at most once per day
# Approach: keep the last record of each (id, date) pair, i.e. the one that
#           was retrieved last for that user
sema_goals = (
    sema_goals
    .sort_values(by=["id", "date"])
    .drop_duplicates(subset=["id", "date"], keep="last")
)

# merge with fitbit data (outer: days that only have a step goal are kept too)
df = fitbit_surveys_df.merge(sema_goals, how="outer", on=["id", "date"])
df

Unnamed: 0,id,date,ecg,heart_rate_alert,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,deep_sleep_breathing_rate,...,dramatic_relief_category,environmental_reevaluation_category,self_reevaluation_category,social_liberation_category,counterconditioning_category,helping_relationships_category,reinforcement_management_category,self_liberation_category,stimulus_control_category,step_goal
0,621e2ff067b776a2403eb737,2021-12-22,NSR,NONE,33.737162,,,,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,,,34.946341,,,,,,...,,,,,,,,,,4999
2,621e2ff067b776a2403eb737,2021-11-20,,,34.922535,,,,,,...,,,,,,,,,,7999
3,621e2ff067b776a2403eb737,2021-11-22,,,34.284711,,,,,,...,,,,,,,,,,7999
4,621e2ff067b776a2403eb737,2021-11-23,,,34.819364,,,,,,...,,,,,,,,,,7999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12344,621e36f967b776a240e5e7c9,2021-05-18,,,,,,,,,...,,,,,,,,,,14999
12345,621e36f967b776a240e5e7c9,2021-05-19,,,,,,,,,...,,,,,,,,,,9999
12346,621e36f967b776a240e5e7c9,2021-05-21,,,,,,,,,...,,,,,,,,,,9999
12347,621e36f967b776a240e5e7c9,2021-05-22,,,,,,,,,...,,,,,,,,,,14999


Information about the manually reported mood and place from sema

In [7]:
# Mood/place was meant to be answered at most this many times per user per day.
MAX_DAILY_MOOD_REPORTS = 3

# Fields fetched for every matching sema document.
mood_projection = {"data.MOOD": 1, "data.PLACE": 1, "id": 1, "_id": 0, "user_id": 1, "data.STARTED_TS": 1}

# read and load
# A document is kept when it has a real MOOD answer OR a real PLACE answer.
# NOTE(review): because of the $or, a kept record can still carry
# "<no-response>" in the other field (visible in the merged output below).
# The documents of all users are gathered in one list and the frame is built
# once; growing a frame with pd.concat inside the loop re-copies it for every user.
mood_records = []
for user in users:
    mood_records.extend(
        db.sema.find(
            {"$or": [
                {"$and": [{"data.MOOD": {"$ne": "<no-response>"}},
                          {"data.MOOD": {"$ne": None}},
                          {"user_id": user}]},
                {"$and": [{"data.PLACE": {"$ne": "<no-response>"}},
                          {"data.PLACE": {"$ne": None}},
                          {"user_id": user}]},
            ]},
            mood_projection,
        )
    )
# Explicit columns keep the expected layout even when no document matches.
sema_mood = pd.DataFrame(mood_records, columns=["user_id", "data"])
print(len(sema_mood))

# split data column (json format) into separate columns (df format)
sema_mood = sema_mood.assign(
    date=sema_mood["data"].apply(lambda d: d["STARTED_TS"]),
    mood=sema_mood["data"].apply(lambda d: d["MOOD"]),
    place=sema_mood["data"].apply(lambda d: d["PLACE"]),
).drop(columns=["data"])

# process the datetime object and rename the dataframe columns
sema_mood = daily_data_loading.date_conversion(sema_mood).rename(columns={"user_id": "id"})

# check for duplicates
# Findings: 2 duplicates found based on all the features of the dataframe (aka exact replicates)
# Approach: keep the latest record
sema_mood = sema_mood.drop_duplicates(keep="last")

# check for duplicates
# Findings: 14 (id, date) groups hold four records, which is wrong since this
#           answer had to be collected at most three times per day
# Approach: keep the latest three records of each (id, date) group
# groupby().tail() returns the surviving rows in their sorted order, so no
# group-by-group concatenation is needed, and larger groups are capped as well.
sema_mood = sema_mood.sort_values(by=["id", "date"])
mood_place = sema_mood.groupby(["id", "date"]).tail(MAX_DAILY_MOOD_REPORTS)

# merge with fitbit data
# mood_place can hold several rows per (id, date), so the daily row is repeated
# once per mood/place report.
# NOTE: df is re-bound from itself, so re-running this cell alone would merge
# the mood/place columns a second time; re-run from the step-goal cell instead.
df = df.merge(mood_place, how="outer", on=["id", "date"])
df

5065


Unnamed: 0,id,date,ecg,heart_rate_alert,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,deep_sleep_breathing_rate,...,self_reevaluation_category,social_liberation_category,counterconditioning_category,helping_relationships_category,reinforcement_management_category,self_liberation_category,stimulus_control_category,step_goal,mood,place
0,621e2ff067b776a2403eb737,2021-12-22,NSR,NONE,33.737162,,,,,,...,,,,,,,,,TIRED,HOME
1,621e2ff067b776a2403eb737,2021-12-22,NSR,NONE,33.737162,,,,,,...,,,,,,,,,TIRED,TRANSIT
2,621e2ff067b776a2403eb737,2021-11-18,,,34.946341,,,,,,...,,,,,,,,4999,<no-response>,OTHER
3,621e2ff067b776a2403eb737,2021-11-18,,,34.946341,,,,,,...,,,,,,,,4999,TIRED,HOME
4,621e2ff067b776a2403eb737,2021-11-18,,,34.946341,,,,,,...,,,,,,,,4999,RESTED/RELAXED,ENTERTAINMENT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14430,621e362467b776a2404ad513,2021-05-18,,,,,,,,,...,,,,,,,,,HAPPY,HOME
14431,621e362467b776a2404ad513,2021-05-23,,,,,,,,,...,,,,,,,,,HAPPY,TRANSIT
14432,621e36f967b776a240e5e7c9,2021-05-20,,,,,,,,,...,,,,,,,,,RESTED/RELAXED,WORK/SCHOOL
14433,621e36f967b776a240e5e7c9,2021-05-20,,,,,,,,,...,,,,,,,,,TIRED,WORK/SCHOOL


In [8]:
# Persist the merged fitbit + surveys + sema frame for the next step.
df.to_pickle(path="../data/loading_final/daily_fitbit_surveys_semas.pkl")