In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this hides genuine chained-assignment mistakes; prefer .loc assignments.
pd.options.mode.chained_assignment = None
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Location of the third-party export read by act_features_apple_google.
# NOTE(review): absolute, user-specific Windows path; the notebook will not run elsewhere as-is.
third_party_csv_path = r'C:\Users\elien\OneDrive - TU Eindhoven\Internship\Data\third_party_data.csv'
# Inclusive lower/upper bounds on the rounded step value; rows outside are discarded
# by act_features_apple_google.
stepday_min=200
stepday_max=50000

In [5]:
def act_features_apple_google(third_party_csv_path, stepday_min, stepday_max,
                              apple_prefix_len=54, apple_suffix_len=47):
    """Load the third-party export and return the plausible step-count rows.

    Steps performed:
      1. Read the CSV and keep only rows whose ``name`` is ``'StepCount'``.
      2. Drop the bookkeeping columns ``'Unnamed: 0'``, ``'id'`` and
         ``'research_stage'`` (columns that are absent are skipped).
      3. For rows whose ``data_source`` is ``'AppleHealthkit'``, the ``data``
         cell is a longer text payload; the number is what remains after
         removing ``apple_prefix_len`` leading and ``apple_suffix_len``
         trailing characters.
         NOTE(review): the offsets assume a fixed-width wrapper around the
         value; the payload layout is not visible here - confirm against
         the raw export.
      4. Convert ``data`` to numeric and round to whole steps.
      5. Keep rows with ``stepday_min <= data <= stepday_max`` (inclusive).

    Parameters
    ----------
    third_party_csv_path : str or path-like
        CSV file to read with ``pd.read_csv``.
    stepday_min, stepday_max : number
        Inclusive bounds on the rounded step value.
    apple_prefix_len, apple_suffix_len : int, optional
        Number of characters stripped from the start / end of an
        AppleHealthkit payload. Defaults (54 / 47) reproduce the historic
        hard-coded slice.

    Returns
    -------
    pandas.DataFrame
        Filtered step rows; the original CSV row index is preserved.

    Raises
    ------
    ValueError
        From ``pd.to_numeric`` when a ``data`` value is not numeric after
        the payload has been stripped.
    """
    third_party_data = pd.read_csv(third_party_csv_path)

    is_step_count = third_party_data['name'] == 'StepCount'
    step_data = (
        third_party_data.loc[is_step_count]
        .drop(columns=['Unnamed: 0', 'id', 'research_stage'], errors='ignore')
        .copy()
    )

    # Assign through .loc instead of writing into ``.values``: the latter only
    # works when pandas hands out a writable view of the frame, which is not
    # the case under Copy-on-Write.
    is_apple = step_data['data_source'] == 'AppleHealthkit'
    if is_apple.any():
        step_data.loc[is_apple, 'data'] = step_data.loc[is_apple, 'data'].map(
            lambda raw: raw[apple_prefix_len:len(raw) - apple_suffix_len],
            na_action='ignore',  # missing payloads stay NaN and are filtered below
        )

    step_data['data'] = pd.to_numeric(step_data['data']).round(0)

    # NaN never satisfies the range test, so rows without a value are dropped.
    in_range = step_data['data'].between(stepday_min, stepday_max)
    return step_data.loc[in_range]

def time_features(participant, var, timecode):
    """Find the oldest and newest timestamp in one column and the span between them.

    The timestamps are parsed before they are compared, so "oldest" and
    "newest" are chronological even when the strings carry different UTC
    offsets (plain string comparison would order them by wall-clock text).

    Parameters
    ----------
    participant : pandas.DataFrame
        Rows to inspect (typically those of a single participant).
    var : str
        Name of the column holding timestamp strings.
    timecode : str
        ``datetime.strptime`` format that every string in the column follows.

    Returns
    -------
    tuple
        ``(oldest_t, newest_t, diff_t)``: the two extreme timestamps exactly
        as they appear in the column, and a ``datetime.timedelta`` between
        their calendar dates (each date taken in the timestamp's own offset).

    Raises
    ------
    ValueError
        If the column holds no non-missing timestamps, or a value does not
        match ``timecode``.
    """
    raw_stamps = participant[var].dropna()
    if raw_stamps.empty:
        raise ValueError('column {!r} contains no timestamps'.format(var))

    # Keep each parsed value next to its original text so the text can be returned.
    parsed = [(datetime.strptime(raw, timecode), raw) for raw in raw_stamps]
    start, oldest_t = min(parsed, key=lambda pair: pair[0])
    end, newest_t = max(parsed, key=lambda pair: pair[0])

    diff_t = end.date() - start.date()
    return oldest_t, newest_t, diff_t

def aggregate_apple_google(step_data_clean):
    """Summarise cleaned step rows into one row per participant.

    For every participant (in order of first appearance) the result holds:

    * ``participant_id``
    * ``data_source`` - the participant's source when all non-missing
      ``data_source`` values agree, otherwise the combined label
      ``'AppleHealthkit/GoogleFitness'``
    * for each of ``datetime``, ``source_created_at``, ``updated_at``,
      ``created_at`` and ``source_updated_at``: ``<col>_start``,
      ``<col>_end`` (oldest / newest timestamp text) and ``<col>_days``
      (whole days between them), as computed by ``time_features``
    * ``steps_mean_day`` - mean of the participant's ``data`` values

    Parameters
    ----------
    step_data_clean : pandas.DataFrame
        Must contain ``participant_id``, ``data_source``, ``data`` and the
        five timestamp columns listed above. Rows with a missing
        ``participant_id`` are not aggregated.

    Returns
    -------
    pandas.DataFrame
        One row per participant; an empty input yields an empty frame that
        still has all columns.
    """
    fractional_format = '%Y-%m-%d %H:%M:%S.%f%z'
    # (column, strptime format) in the order the output columns are laid out.
    # Only 'datetime' is stored without fractional seconds.
    timestamp_columns = (
        ('datetime', '%Y-%m-%d %H:%M:%S%z'),
        ('source_created_at', fractional_format),
        ('updated_at', fractional_format),
        ('created_at', fractional_format),
        ('source_updated_at', fractional_format),
    )

    column_order = ['participant_id', 'data_source']
    for column, _ in timestamp_columns:
        column_order.extend([column + '_start', column + '_end', column + '_days'])
    column_order.append('steps_mean_day')

    records = []
    # sort=False keeps participants in order of first appearance; a single
    # groupby pass avoids re-filtering the whole frame for every participant.
    for participant_id, participant in step_data_clean.groupby('participant_id', sort=False):
        sources = participant['data_source'].dropna().unique()
        record = {
            'participant_id': participant_id,
            'data_source': sources[0] if len(sources) == 1 else 'AppleHealthkit/GoogleFitness',
        }

        for column, timecode in timestamp_columns:
            oldest, newest, span = time_features(participant, var=column, timecode=timecode)
            record[column + '_start'] = oldest
            record[column + '_end'] = newest
            record[column + '_days'] = span.days

        record['steps_mean_day'] = participant['data'].mean()
        records.append(record)

    # Passing the column list fixes the layout even when there are no records.
    aggregated_act_df = pd.DataFrame(records, columns=column_order)
    return aggregated_act_df

In [6]:
# Read the export and keep the StepCount rows inside [stepday_min, stepday_max] ...
act_df_clean = act_features_apple_google(third_party_csv_path, stepday_min, stepday_max)
# ... then collapse them into one summary row per participant.
aggregated_act_df_part = aggregate_apple_google(act_df_clean)