In [1]:
import ast
import json
import re

import numpy as np
import pandas as pd
# Pandas session options: no chained-assignment warnings, show every column.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)

# Input files.
third_party_csv_path = r'C:\Users\elien\OneDrive - TU Eindhoven\Internship\Data\third_party_data.csv'
baseline_csv_path = r'C:\Users\elien\baseline_data2.csv'

# Per-activity features that are passed to remove_feature_outliers.
features_remove_outliers = [
    'delta_min',
    'calories',
    'quantity',
    'distance',
    'step_min',
    'move_min',
    'MET',
    'min_high',
    'min_med',
]

# Date window applied to participant-days (start exclusive, end inclusive
# in aggregate_part_day).
start_date = '2014-01-01'
end_date = '2022-05-22'

# Participant-day filters: daily step-count bounds and activity-count threshold.
stepday_min = 200
stepday_max = 50000
act_min = 2

# Participant filters: bounds on the number of retained days.
min_days = 3
max_days = 600

In [2]:
def _constants_to_strings(value):
    """Recursively replace None/True/False with the strings 'None'/'True'/'False'."""
    if value is None or isinstance(value, bool):
        return str(value)
    if isinstance(value, dict):
        return {key: _constants_to_strings(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_constants_to_strings(item) for item in value]
    return value


def json_str_to_list(data_act, part_id, dt):
    """Parse one serialized activity payload into a list of record dicts.

    Parameters
    ----------
    data_act : str
        Text holding a list of activity records, written either as a Python
        literal (single quotes, None/True/False) or as JSON.
    part_id, dt :
        Values stored in the 'participant_id' and 'datetime' fields that are
        prepended to every record.

    Returns
    -------
    list of dict
        One dict per activity record, with 'participant_id' and 'datetime'
        as the first two keys.

    Raises
    ------
    json.JSONDecodeError
        If the payload can be parsed neither as a Python literal nor as JSON.
    """
    try:
        # Parse the text as a Python literal. Unlike blind quote substitution
        # this survives apostrophes inside values ("Men's run") and values
        # that merely contain the substrings None/True/False.
        parsed = _constants_to_strings(ast.literal_eval(data_act))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. genuine JSON with null/true/false):
        # fall back to the original substitution-based conversion.
        text = data_act.replace("'", '"')
        text = text.replace("None", '"None"')
        text = text.replace("True", '"True"')
        text = text.replace("False", '"False"')
        parsed = json.loads(text)

    activity = pd.DataFrame(parsed)
    activity.insert(0, 'participant_id', part_id)
    activity.insert(1, 'datetime', dt)

    # Round-trip through JSON so the result is made of plain Python types.
    return json.loads(activity.to_json(orient='records'))

def np_vec_json(df):
    """Apply json_str_to_list to every row of ``df``.

    Reads the 'data', 'participant_id' and 'datetime' columns of ``df`` and
    returns a numpy object array with one list of record dicts per row.
    """
    # otypes is given explicitly so that an empty frame yields an empty array
    # instead of raising, and so numpy does not evaluate the first row an
    # extra time just to infer the output type.
    vfunc = np.vectorize(json_str_to_list, otypes=[object])
    return vfunc(df['data'], df['participant_id'], df['datetime'])

def google_activities(third_party_csv_path):
    """Load the third-party CSV and return one row per Google Fit activity.

    Parameters
    ----------
    third_party_csv_path : str
        Path of the CSV file. Rows are kept when 'data_source' is
        'GoogleFitness' and 'name' is 'Activities'; their 'data' payloads are
        expanded into individual activity records.

    Returns
    -------
    pandas.DataFrame
        Activity records with 'start'/'end' converted to datetimes and the
        numeric duration columns 'delta_sec' and 'delta_min' added.
    """
    third_party_data = pd.read_csv(third_party_csv_path)
    GoogleFitness = third_party_data[third_party_data['data_source'] == 'GoogleFitness']
    GoogleFitness = GoogleFitness.drop(columns=['Unnamed: 0', 'id', 'research_stage'])

    # Activity data: expand every payload and flatten into one record list.
    act_dat = GoogleFitness[GoogleFitness['name'] == 'Activities']
    array = np_vec_json(act_dat)
    array = np.concatenate((array), axis=None)
    activities_participants = pd.json_normalize(array)

    # 'start' and 'end' are parsed as epoch milliseconds.
    activities_participants['end'] = pd.to_datetime(activities_participants['end'], unit='ms')
    activities_participants['start'] = pd.to_datetime(activities_participants['start'], unit='ms')

    # delta_time
    delta_t = activities_participants['end'] - activities_participants['start']
    activities_participants.insert(4, 'delta_time', delta_t)

    # delta_sec: duration as a float number of whole seconds.
    # astype('timedelta64[s]') only produced floats on pandas < 2; on newer
    # versions it keeps a timedelta dtype, which breaks the arithmetic below
    # and downstream. total_seconds() is numeric on every version; floor keeps
    # the whole-second values the old conversion produced.
    delta_sec = np.floor(activities_participants['delta_time'].dt.total_seconds())
    activities_participants.insert(5, 'delta_sec', delta_sec)

    # delta_min
    delta_min = activities_participants['delta_sec'] / 60
    activities_participants.insert(6, 'delta_min', delta_min)

    activities_participants = activities_participants.drop(columns=['source_id', 'device', 'tracked', 'source_name', 'delta_time'])

    return activities_participants

In [13]:
def Mifflin_bmr(weight, height, age, gender):
    """Return 9.99*weight + 6.25*height - 4.92*age + 166*gender - 161.

    Works on scalars or element-wise on arrays. ``gender`` acts as a numeric
    indicator scaled by 166.
    NOTE(review): presumably weight in kg, height in cm, age in years and
    gender coded 1 = male / 0 = female, as in the Mifflin-St Jeor equation -
    confirm against the baseline data.
    """
    weight_term = 9.99 * weight
    height_term = 6.25 * height
    age_term = 4.92 * age
    gender_term = 166 * gender
    return weight_term + height_term - age_term + gender_term - 161

def add_baseline_features(act_df, baseline_csv_path):
    """Join the activity rows with per-participant baseline measurements.

    Reads the baseline CSV, discards the measurement columns that are not
    used, adds a 'bmr_eq' column computed with Mifflin_bmr, and inner-merges
    the result onto ``act_df`` by 'participant_id' (the baseline column
    'RegistrationCode'). Rows whose 'bmr_eq' is missing are dropped.
    """
    unused_columns = [
        'Unnamed: 0',
        'Date',
        'standing_one_min_blood_pressure_systolic',
        'lying_blood_pressure_diastolic',
        'standing_three_min_blood_pressure_systolic',
        'lying_blood_pressure_systolic',
        'body_temperature',
        'sitting_blood_pressure_pulse_rate',
        'standing_one_min_blood_pressure_pulse_rate',
        'abdominal',
        'body_fat',
        'waist',
        'standing_three_min_blood_pressure_diastolic',
        'dominant_hand',
        'sitting_blood_pressure_diastolic',
        'standing_three_min_blood_pressure_pulse_rate',
        'lying_blood_pressure_pulse_rate',
        'standing_one_min_blood_pressure_diastolic',
        'sitting_blood_pressure_systolic',
        'hips',
        'hand_grip_right',
        'hand_grip_left',
        'bmr',
    ]
    base = pd.read_csv(baseline_csv_path).drop(columns=unused_columns)

    base['bmr_eq'] = Mifflin_bmr(
        base['weight'].values,
        base['height'].values,
        base['age'].values,
        base['gender'].values,
    )

    keyed_base = base.rename(columns={"RegistrationCode": "participant_id"})
    merged = act_df.merge(keyed_base, on="participant_id", how="inner")
    return merged.dropna(subset=["bmr_eq"])

def MET_score(calories, time, weight):
    """Return (200 * calories) / (time * weight) / 3.5.

    Works on scalars or element-wise on arrays/Series. The caller in this
    notebook passes ``time`` in minutes.
    """
    numerator = 200 * calories
    denominator = time * weight
    return (numerator / denominator) / 3.5

def add_activity_features(act_df, baseline_csv_path):
    """Add intensity features to every activity row.

    Parameters
    ----------
    act_df : pandas.DataFrame
        Activity rows; the columns 'quantity', 'delta_min' and 'calories'
        are read here.
    baseline_csv_path : str
        Path handed to add_baseline_features, which supplies 'weight' (and
        the other baseline columns).

    Returns
    -------
    pandas.DataFrame
        The merged frame with 'step_min', 'move_min', 'MET', 'HP_MET',
        'HP_step', 'HP', 'min_high' and 'min_med' added and 'delta_sec'
        removed.
    """
    act_base_df = add_baseline_features(act_df, baseline_csv_path)

    # Steps per minute, and "move minutes": the activity duration when the
    # pace is at least 30 steps per minute, otherwise 0 (also 0 when the pace
    # is NaN, because the comparison is then False).
    act_base_df['step_min'] = act_base_df['quantity'] / act_base_df['delta_min']
    act_base_df['move_min'] = np.where(act_base_df['step_min'] >= 30, act_base_df['delta_min'], 0)

    # MET score (time in min)
    act_base_df['MET'] = MET_score(act_base_df['calories'], act_base_df['delta_min'], act_base_df['weight'])

    # Heart-point (HP) factors:
    #   3 <= MET < 6            -> HP_MET  = 1
    #   MET >= 6                -> HP_MET  = 2
    #   100 <= steps/min < 130  -> HP_step = 1
    #   steps/min >= 130        -> HP_step = 2
    act_base_df['HP_MET'] = np.where((act_base_df['MET'] >= 3) & (act_base_df['MET'] < 6), 1,
                                     np.where(act_base_df['MET'] >= 6, 2, 0))
    act_base_df['HP_step'] = np.where((act_base_df['step_min'] >= 100) & (act_base_df['step_min'] < 130), 1,
                                      np.where(act_base_df['step_min'] >= 130, 2, 0))

    # HP factor = max of HP_MET and HP_step.
    # The previous sum-then-replace mapping (2 -> 1, 3 -> 2, 4 -> 2) turned
    # the combinations (2, 0) and (0, 2) into 1, so an activity that was
    # high-intensity on one measure only was counted as moderate.
    act_base_df['HP'] = np.maximum(act_base_df['HP_MET'], act_base_df['HP_step'])

    # Moderate and high intensity minutes:
    #   HP == 2 -> high intensity minutes
    #   HP == 1 -> moderate intensity minutes
    act_base_df['min_high'] = np.where(act_base_df['HP'] == 2, act_base_df['delta_min'], 0)
    act_base_df['min_med'] = np.where(act_base_df['HP'] == 1, act_base_df['delta_min'], 0)

    # Turn each factor (0, 1 or 2) into points: factor * minutes.
    act_base_df['HP_MET'] = act_base_df['HP_MET'] * act_base_df['delta_min']
    act_base_df['HP_step'] = act_base_df['HP_step'] * act_base_df['delta_min']
    act_base_df['HP'] = act_base_df['HP'] * act_base_df['delta_min']

    act_features_df = act_base_df.drop(columns=['delta_sec'])
    return act_features_df

In [4]:
# Outlier: value that is more than 3 standard deviations from the mean

def outlier_removal(df, variable, n_std=3):
    """Return the outlier limits for one column as ``(upper_limit, lower_limit)``.

    The limits are ``mean + n_std * std`` and ``mean - n_std * std`` of
    ``df[variable]``. No rows are removed here; callers apply the limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to evaluate.
    n_std : float, optional
        Half-width of the accepted band in standard deviations (default 3).

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)`` - note the upper limit comes first.
    """
    column = df[variable]
    center = column.mean()
    spread = n_std * column.std()
    return center + spread, center - spread

def remove_feature_outliers(act_features_df, features_remove_outliers):
    """Keep only rows that lie strictly inside the outlier limits of every listed feature.

    The limits of all features are computed first, on the unfiltered frame,
    and only then applied one feature after another. Rows where a listed
    feature is NaN are removed as well, because both comparisons are False.
    """
    # One (upper, lower) pair per feature, all based on the full input frame.
    limits = [outlier_removal(act_features_df, feature) for feature in features_remove_outliers]

    remaining = act_features_df
    for feature, (upper_limit, lower_limit) in zip(features_remove_outliers, limits):
        values = remaining[feature]
        remaining = remaining[(values > lower_limit) & (values < upper_limit)]

    return remaining


In [5]:
def aggregate_part_day(act_features_outliers_df, 
                       start_date, end_date, 
                       stepday_min, stepday_max, 
                       act_min):
    """Aggregate activity rows to one row per participant and day, then filter.

    Parameters
    ----------
    act_features_outliers_df : pandas.DataFrame
        Activity rows with the feature columns listed in ``values`` below.
    start_date, end_date :
        Days are kept when ``start_date < datetime <= end_date``.
    stepday_min, stepday_max :
        Days are kept when ``stepday_min < quantity_sum <= stepday_max``.
    act_min :
        Days are kept when they have strictly more than ``act_min`` activities.

    Returns
    -------
    pandas.DataFrame
        One row per ('participant_id', 'datetime') with flattened column
        names of the form '<feature>_<aggregation>' plus 'activity_score'.
    """
    # Aggregations are named by string alias. Passing numpy/builtin callables
    # (np.sum, np.std, min, ...) is deprecated in recent pandas, which
    # currently maps them to these same aliases; spelling the alias out keeps
    # results (including the sample standard deviation) and the column labels
    # stable across versions.
    table = pd.pivot_table(act_features_outliers_df,
                           values=['HP', 'HP_step', 'HP_MET', 'MET', 'quantity', 'bmr_eq', 'gender', 'age', 'bmi', 'weight', 'height', 'move_min', 'step_min', 'distance', 'calories', 'activity_name', 'min_high', 'min_med'],
                           index=['participant_id', 'datetime'],
                           aggfunc={'HP': 'sum',
                                    'HP_step': 'sum',
                                    'HP_MET': 'sum',
                                    'MET': ['mean', 'std', 'min', 'max'],
                                    'quantity': 'sum',
                                    'bmr_eq': 'mean',
                                    'gender': 'mean',
                                    'age': 'mean',
                                    'bmi': 'mean',
                                    'weight': 'mean',
                                    'height': 'mean',
                                    'move_min': 'sum',
                                    'step_min': 'mean',
                                    'distance': 'sum',
                                    'calories': 'sum',
                                    'activity_name': 'count',
                                    'min_high': 'sum',
                                    'min_med': 'sum'},
                           fill_value=0).sort_index()

    # Activity score: calories of the day divided by the participant's bmr_eq.
    table['activity_score'] = table[('calories', 'sum')] / table[('bmr_eq', 'mean')]

    # Flatten the two-level column index to '<feature>_<aggregation>'; columns
    # whose second level is empty end with '_' and are renamed back.
    part_day = table.reset_index()
    part_day.columns = ["_".join((i, j)) for i, j in part_day.columns]
    part_day = part_day.rename(columns={'participant_id_': 'participant_id',
                                        'datetime_': 'datetime',
                                        'activity_score_': 'activity_score'})

    # 1) Data within time range (start exclusive, end inclusive)
    part_day['datetime'] = pd.to_datetime(part_day['datetime'])
    df0 = part_day[(part_day['datetime'] > start_date) & (part_day['datetime'] <= end_date)]

    # 2) Remove activity_score outliers (limits computed on the date-filtered days)
    upper_limit, lower_limit = outlier_removal(df0, 'activity_score')
    df1 = df0[(df0['activity_score'] > lower_limit) & (df0['activity_score'] < upper_limit)]

    # 3) Daily step count within (stepday_min, stepday_max]
    df2 = df1[(df1['quantity_sum'] > stepday_min) & (df1['quantity_sum'] <= stepday_max)]

    # 4) More than act_min activities per day
    df3 = df2[df2['activity_name_count'] > act_min]

    act_cleaned_part_day = df3

    return act_cleaned_part_day


In [19]:
def aggregate_part(act_cleaned_part_day, min_days, max_days):
    """Aggregate participant-day rows to one row per participant.

    Parameters
    ----------
    act_cleaned_part_day : pandas.DataFrame
        One row per participant and day with the columns listed in ``values``
        below.
    min_days, max_days :
        Participants are kept when ``min_days < datetime_count <= max_days``,
        where 'datetime_count' is their number of day rows.

    Returns
    -------
    pandas.DataFrame
        One row per participant. Columns are named '<column>_<aggregation>'
        unless shortened by the rename at the end.
    """
    # String aliases are used for the aggregations: they give the same results
    # and column labels as the numpy callables while avoiding the pandas
    # deprecation of callables such as np.mean / np.std in aggfunc.
    mean_and_std = ['mean', 'std']
    table2 = pd.pivot_table(act_cleaned_part_day,
                            values=['datetime', 'HP_sum', 'HP_MET_sum', 'HP_step_sum', 'MET_max', 'MET_mean', 'MET_min', 'MET_std', 'activity_name_count', 'age_mean', 'bmi_mean', 'bmr_eq_mean', 'calories_sum', 'distance_sum', 'gender_mean', 'height_mean', 'min_high_sum', 'min_med_sum', 'move_min_sum', 'quantity_sum', 'step_min_mean', 'weight_mean', 'activity_score'],
                            index=['participant_id'],
                            aggfunc={'datetime': 'count',
                                     'HP_sum': list(mean_and_std),
                                     'HP_MET_sum': list(mean_and_std),
                                     'HP_step_sum': list(mean_and_std),
                                     'MET_max': 'mean',
                                     'MET_mean': 'mean',
                                     'MET_min': 'mean',
                                     'MET_std': 'mean',
                                     'activity_name_count': list(mean_and_std),
                                     'age_mean': 'mean',
                                     'bmi_mean': 'mean',
                                     'bmr_eq_mean': 'mean',
                                     'calories_sum': list(mean_and_std),
                                     'distance_sum': 'mean',
                                     'gender_mean': 'mean',
                                     'height_mean': 'mean',
                                     'min_high_sum': list(mean_and_std),
                                     'min_med_sum': list(mean_and_std),
                                     'move_min_sum': list(mean_and_std),
                                     'quantity_sum': list(mean_and_std),
                                     'step_min_mean': list(mean_and_std),
                                     'weight_mean': 'mean',
                                     'activity_score': list(mean_and_std)},
                            fill_value=0).sort_index()

    # Flatten the two-level column index to '<column>_<aggregation>'.
    part_tot = table2.reset_index()
    part_tot.columns = ["_".join((i, j)) for i, j in part_tot.columns]
    part_tot = part_tot.rename(columns={'participant_id_': 'participant_id'})

    # Keep participants with more than min_days and at most max_days day rows.
    act_cleaned_part = part_tot[(part_tot['datetime_count'] > min_days) & (part_tot['datetime_count'] <= max_days)]

    # Shorter final names. The std of daily calories is keyed on
    # 'calories_sum_std'; keying both calories entries on 'calories_sum_mean'
    # (as before) labelled the mean as 'calories_std' and left the real std
    # column unrenamed.
    act_cleaned_part = act_cleaned_part.rename(columns={
        'MET_mean_mean': "MET_mean",
        'MET_std_mean': "MET_std",
        'activity_name_count_mean': "activities_count",
        'age_mean_mean': 'age',
        'bmi_mean_mean': 'bmi',
        'bmr_eq_mean_mean': 'bmr_eq',
        'calories_sum_mean': 'calories',
        'calories_sum_std': 'calories_std',
        'gender_mean_mean': 'gender',
        'height_mean_mean': 'height',
        'move_min_sum_mean': 'move_min_mean',
        'move_min_sum_std': 'move_min_std',
        'weight_mean_mean': 'weight',
        'quantity_sum_mean': 'steps_day_mean',
        'quantity_sum_std': 'steps_day_std',
        'step_min_mean_mean': 'step_min_mean'})

    return act_cleaned_part

In [7]:
# Run the pipeline end to end: load activities -> add features -> drop
# per-activity outliers -> aggregate per participant-day -> aggregate per
# participant.
act_df = google_activities(third_party_csv_path)
act_features_df = add_activity_features(act_df, baseline_csv_path)
act_features_outliers_df = remove_feature_outliers(
    act_features_df,
    features_remove_outliers,
)
act_cleaned_part_day = aggregate_part_day(
    act_features_outliers_df,
    start_date=start_date,
    end_date=end_date,
    stepday_min=stepday_min,
    stepday_max=stepday_max,
    act_min=act_min,
)
act_cleaned_part = aggregate_part(
    act_cleaned_part_day,
    min_days=min_days,
    max_days=max_days,
)