In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime

import seaborn as sns
import os

### Variable Information

In [2]:
## Files of interest
##    - HealthKitSamples
##    - HealthKitActivitySummaries (not using for now)
##    - AppleLocationVisits (not using for now)

In [3]:
## Input Directory
directory = "../cumulative_data_811-910"
h_samples_path = "../daily_data/RK.8D1DBFAD.DJW Thesis_20220626-20220627/HealthKitSamples_20220626-20220627.csv"

In [4]:
## Output Directory
out_dir = "../indv_table_exports/"

In [5]:
## Good Subjects
good_subjects = ['01801252-3a7e-4f5f-8b6d-49e8da3902f3',
                 'd26d4b78-7fcf-488e-b687-2d1c93c47b74',
                 '531d7f6d-b880-4a0b-b467-80005a316f1c']

### Some handy functions

In [20]:
## Convert time from UTC to ET
def fix_date_to_ET(end_date):
    if pd.to_datetime(end_date, format= '%Y-%m-%d', utc=True).tz_convert('US/Eastern').hour < 5:
        return pd.to_datetime(end_date, format= '%Y-%m-%d', utc=True).tz_convert('US/Eastern').date() - datetime.timedelta(days=1)
    else:
        return pd.to_datetime(end_date, format= '%Y-%m-%d', utc=True).tz_convert('US/Eastern').date()

def fix_columns_by_category(dataframe, categories):
    df = dataframe
    for item in categories:
        if item not in df.columns.to_list():
            df[item] = 'NaN'
    return df

def pivot_df(dataframe, pos: list, col, val):
    df = dataframe
    df = df.pivot_table(index=pos,
                    columns=col, 
                    values=val).reset_index()
    return df
    
def get_sleep_df(dataframe):
    df_sleep = dataframe
    df_sleep = df_sleep.loc[df_sleep.Type.isin(['SleepAnalysisInterval'])].reset_index(drop=True)

    # Add date column
    df_sleep['SleepDay'] = df_sleep.apply(lambda x: fix_date_to_ET(x.Date), axis=1)
    
    # Calculate and append sleep duration to df
    # Will calculate the duration of each InBed and Asleep Value
    df_sleep['Duration'] = 0
    for i in range(len(df_sleep)):
        m = datetime.datetime.fromisoformat(df_sleep.loc[0, 'Date'])
        n = datetime.datetime.fromisoformat(df_sleep.loc[0, 'StartDate'])
        df_sleep.loc[i, 'Duration'] = (m - n)/60;
    
    # Get sum for each value (InBed, Asleep) for each participant for each SLEEP day
    df_sleep = pd.DataFrame(df_sleep.groupby(['SleepDay','ParticipantIdentifier', 'Value'])['Duration'].sum()).reset_index()
    
    # Make separate columns for InBed and Asleep values
    indices = ['SleepDay', 'ParticipantIdentifier']
    df_sleep = pivot_df(df_sleep, indices, "Value", "Duration")
    
    # Get rid of index name (set to "Value")
    df_sleep.columns.name = None

    # Rename columns for clarity
    df_sleep.rename(columns={'SleepDay': 'StudyDay','Asleep': 'Passive_Asleep','InBed' : 'Passive_InBed'}, inplace=True)
    df_sleep = fix_columns_by_category(df_sleep, ['Passive_Asleep', 'Passive_InBed'])
    
    return df_sleep

def get_heart_rate_df(dataframe, categories):
    df = dataframe
    df_heart = df.loc[df.Type.isin(categories)].reset_index(drop=True)

    # cast Value to numeric
    df_heart.Value = pd.to_numeric(df_heart.Value)

    # Get mean for each type for each participant for each day
    df_heart = pd.DataFrame(df_heart.groupby(['StudyDay','ParticipantIdentifier', 'Type'])['Value'].mean()).reset_index()

    # Make separate columns for type of HeartRate data
    indices = ['StudyDay', 'ParticipantIdentifier']
    df_heart = pivot_df(df_heart, indices, 'Type', 'Value')
    
    # Fix column name
    df_heart = fix_columns_by_category(df_heart, categories)

    # Get rid of index name (set to "Type")
    df_heart.columns.name = None

    # Rename columns
    df_heart.rename(columns={'HeartRateVariability': 'Passive_HeartRate_Variability',
                       'RestingHeartRate' : 'Passive_HeartRate_Resting',
                       'WalkingHeartRateAverage':'Passive_HeartRate_AverageWalking'
                      }, inplace=True)
    return df_heart
    
def get_activity_df(dataframe, mean_categories, sum_categories):
    df = dataframe
    
    # Get activities that need to be summed, and those that need to be averaged in separate DataFrames
    df_mean = df.loc[df.Type.isin(mean_categories)].reset_index(drop=True)
    df_sum = df.loc[df.Type.isin(sum_categories)].reset_index(drop=True)
    
    # cast Value to numeric
    df_mean.Value = pd.to_numeric(df_mean.Value)
    df_sum.Value = pd.to_numeric(df_sum.Value)
    
    # Calculate sums and means
    df_mean = pd.DataFrame(df_mean.groupby(['StudyDay','ParticipantIdentifier', 'Type'])['Value'].mean()).reset_index()
    df_sum = pd.DataFrame(df_sum.groupby(['StudyDay','ParticipantIdentifier', 'Type'])['Value'].sum()).reset_index()
    
    # Pivot take according to activity categories
    # Long to wide
    indices = ['StudyDay', 'ParticipantIdentifier']
    df_mean = pivot_df(df_mean, indices, 'Type', 'Value')
    df_sum = pivot_df(df_sum, indices, 'Type', 'Value')
    
    # Accountfor missing columns
    df_mean = fix_columns_by_category(df_mean, activity_mean_categories)
    df_sum = fix_columns_by_category(df_sum, activity_sum_categories)
    
    # Rename columns
    df_mean.rename(columns={'WalkingSpeed': 'Passive_Activity_AverageWalkingSpeed'
                           }, inplace=True)
    
    df_sum.rename(columns={'ActiveEnergyBurned': 'Passive_Activity_ActiveEnergyBurned',
                        'RestingEnergyBurned' : 'Passive_Activity_RestingEnergyBurned',
                        'DistanceWalkingRunning' :'Passive_Activity_DistanceWalkingRunning',
                        'DistanceCycling' : 'Passive_Activity_DistanceCycling',
                        'AppleStandTime': 'Passive_Activity_AppleStandTime'
                       }, inplace=True)
    
    df_activity = df_sum
    df_activity = df_activity.merge(df_mean, how='left', on=['ParticipantIdentifier', 'StudyDay'])
    return df_activity

def get_other_df(dataframe, categories):
    df = dataframe
    df_other = df.loc[df.Type.isin(categories)].reset_index(drop=True)

    # cast Value to float
    df_other.Value = pd.to_numeric(df_other.Value)

    # Get mean for each type for each participant for each day
    df_other = pd.DataFrame(df_other.groupby(['StudyDay','ParticipantIdentifier', 'Type'])['Value'].mean()).reset_index()

    # convert from long to wide
    df_other = pivot_df(df_other, ['StudyDay', 'ParticipantIdentifier'], 'Type', 'Value')
    
    df_other = fix_columns_by_category(df_other, categories)

    # Get rid of index name (set to "Type")
    df_other.columns.name = None

    # Rename columns
    df_other.rename(columns={'HeadphoneAudioExposure': 'Passive_Audio_HeadphoneExposure',
                       'EnvironmentalAudioExposure' : 'Passive_Audio_EnvironmentalExposure',
                      }, inplace=True)
    return df_other

###  Make HealthKitSamples Dataframe for all results till today

In [21]:
## Make HealthKitSamples Dataframe for all results till today

survey_file_name = ""
path = directory + "/"
for f_name in os.listdir(path):
    if f_name.startswith("HealthKitSamples"):
        survey_file_name = f_name
        break
path = path + '/' + survey_file_name

current_df = pd.read_csv(path) 
current_df["StudyDay"] = current_df.apply(lambda x: fix_date_to_ET(x.Date), axis=1)

In [16]:
## From here we need to get Sleep Data, HeartRate Data, Activity Data, Other Data
df_samples = current_df

### Get Sleep, Heart, Activity, and Other Dataframe

In [17]:
# Get sleep data
df_sleep = get_sleep_df(df_samples)

# Get heart data
heart_categories = ["RestingHeartRate", "WalkingHeartRateAverage", "HeartRateVariability"]
df_heart = get_heart_rate_df(df_samples, heart_categories)

# Get activity data
activity_mean_categories = ['WalkingSpeed']
activity_sum_categories = ['ActiveEnergyBurned', 'RestingEnergyBurned', 'DistanceWalkingRunning',
                  'DistanceCycling', 'AppleStandTime']
df_activity = get_activity_df(df_samples, activity_mean_categories, activity_sum_categories)

# Get other data
other_data_categories = ['HeadphoneAudioExposure', 'EnvironmentalAudioExposure']
df_other = get_other_df(df_samples, other_data_categories)

TypeError: fix_date_to_ET() takes 1 positional argument but 2 were given

### Participant List

In [18]:
df_participants = pd.read_csv(directory + '/StudyParticipants_20220910.csv')
df_participants["CustomFields"] = df_participants["CustomFields"].apply(json.loads)

participant_list = []
for index, row in df_participants.iterrows():
    if row["CustomFields"]["exp_version"] == "app_pilot_1":
        participant_list.append(row["ParticipantIdentifier"])

NameError: name 'json' is not defined

### Merge the 4 dataframes to get Passive Dataframe

In [9]:
# Merge the 4 Data Frames
df_passive = df_sleep
df_passive = df_passive.merge(df_heart, how='left', on=['ParticipantIdentifier', 'StudyDay'])
df_passive = df_passive.merge(df_activity, how='left', on=['ParticipantIdentifier', 'StudyDay'])
df_passive = df_passive.merge(df_other, how='left', on=['ParticipantIdentifier', 'StudyDay'])

### Keep data of good subjects and export passive Dataframe as CSV

In [10]:
# Only keep data for the good participants
df = df_passive
df_passive_good = df[df.ParticipantIdentifier.isin(good_subjects)].reset_index(drop=True)

In [11]:
## Export CSV File
df_passive_good.to_csv('passive.csv', index=False)

In [12]:
####################################Just Testing###########################################
# df_sleep_good
df = df_sleep
df_sleep_good = df[df.ParticipantIdentifier.isin(good_subjects)].reset_index(drop=True)

# df_heart_good
# No data for one of the good participants
df = df_heart
df_heart_good = df[df.ParticipantIdentifier.isin(good_subjects)].reset_index(drop=True)

# df_activity_good
df = df_activity
df_activity_good = df[df.ParticipantIdentifier.isin(good_subjects)].reset_index(drop=True)

# df_other_good
df = df_other
df_other_good = df[df.ParticipantIdentifier.isin(good_subjects)].reset_index(drop=True) 
####################################Just Testing###########################################