In [1]:
import pandas as pd
import numpy as np
# Pandas behaviour for this notebook: no chained-assignment warnings,
# and no limit on the number of columns shown when a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)

# Input CSV files (absolute, machine-specific locations).
activity_logging_csv_path = r'C:\Users\elien\OneDrive - TU Eindhoven\Internship\Data\logging_data.csv'
baseline_csv_path = r'C:\Users\elien\baseline_data2.csv'

# Bounds on the number of logged days per participant, passed to aggregate_part.
min_days, max_days = 3, 600

In [2]:
def load_data_add_features(activity_logging_csv_path):
    """Load the activity log and add MET-based intensity features.

    Parameters
    ----------
    activity_logging_csv_path : str or path-like
        CSV file with at least the columns ``exercise_type`` and
        ``minutes_duration``.

    Returns
    -------
    pandas.DataFrame
        The CSV content (without the ``Unnamed: 0`` index column, if present)
        plus four derived columns:

        * ``MET_score`` - MET value looked up from ``exercise_type``;
          unknown or missing types get 0.0.
        * ``HP_MET``    - ``minutes_duration`` counted once when
          3 <= MET < 6, twice when MET >= 6, otherwise 0.
        * ``min_med``   - ``minutes_duration`` when 3 <= MET < 6, else 0.
        * ``min_high``  - ``minutes_duration`` when MET >= 6, else 0.
    """
    # MET value per exercise type. A single vectorised lookup replaces the
    # former per-row loop, which wrote through ``Series.values`` (read-only
    # under pandas Copy-on-Write) and scaled poorly with the number of rows.
    met_by_exercise = {
        'walking': 3.5,
        'yogaAndPilates': 3.0,
        'running': 7.0,
        'other': 3.5,
        'bicycle': 7.5,
        'swimming': 6.0,
        'aerobic': 8.0,
        'weightLifting': 6.0,
        'crossfit': 8.0,
        'meditation': 1.0,
        'folkDances': 7.8,
        'martialArts': 7.8,
        'walk': 3.5,
        'basketball': 6.5,
        'football': 8.0,
        'anaerobic': 8.0,
    }

    data = pd.read_csv(activity_logging_csv_path)
    # 'Unnamed: 0' is the index column written by a default ``to_csv``;
    # files saved without it are accepted as well.
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')

    # Types absent from the table map to NaN and fall back to 0.0.
    data['MET_score'] = (
        data['exercise_type'].map(met_by_exercise).fillna(0.0).astype(float)
    )

    minutes = data['minutes_duration']
    is_moderate = (data['MET_score'] >= 3) & (data['MET_score'] < 6)
    is_high = data['MET_score'] >= 6

    # Moderate minutes count once, high-intensity minutes count double.
    data['HP_MET'] = np.where(is_moderate, 1 * minutes, np.where(is_high, 2 * minutes, 0))

    # Moderate and high intensity minutes as separate columns.
    data['min_med'] = np.where(is_moderate, minutes, 0)
    data['min_high'] = np.where(is_high, minutes, 0)
    return data

def Mifflin_bmr(weight, height, age, gender):
    """Return the basal metabolic rate from the Mifflin equation.

    Computes ``9.99*weight + 6.25*height - 4.92*age + 166*gender - 161``.
    Works element-wise when the arguments are numpy arrays.

    NOTE(review): the published equation expects weight in kg, height in cm,
    age in years and gender coded 1 = male / 0 = female - confirm that the
    baseline data uses the same units and coding.
    """
    weight_term = 9.99 * weight
    height_term = 6.25 * height
    age_term = 4.92 * age
    gender_term = 166 * gender
    return weight_term + height_term - age_term + gender_term - 161

def add_baseline_features(act_df, baseline_csv_path):
    """Attach baseline measurements and an estimated BMR to the activity rows.

    Reads the baseline CSV, discards the columns that are not used further,
    computes ``bmr_eq`` with ``Mifflin_bmr`` and inner-joins the result onto
    ``act_df`` via ``participant_id`` (called ``RegistrationCode`` in the
    baseline file). Rows whose ``bmr_eq`` is missing are dropped.

    Parameters
    ----------
    act_df : pandas.DataFrame
        Activity rows containing a ``participant_id`` column.
    baseline_csv_path : str or path-like
        Location of the baseline CSV.

    Returns
    -------
    pandas.DataFrame
        Activity rows joined with the retained baseline columns and ``bmr_eq``.
    """
    # Baseline columns that are not carried into the merged table.
    unused_columns = [
        'Unnamed: 0',
        'Date',
        'standing_one_min_blood_pressure_systolic',
        'lying_blood_pressure_diastolic',
        'standing_three_min_blood_pressure_systolic',
        'lying_blood_pressure_systolic',
        'body_temperature',
        'sitting_blood_pressure_pulse_rate',
        'standing_one_min_blood_pressure_pulse_rate',
        'abdominal',
        'body_fat',
        'waist',
        'standing_three_min_blood_pressure_diastolic',
        'dominant_hand',
        'sitting_blood_pressure_diastolic',
        'standing_three_min_blood_pressure_pulse_rate',
        'lying_blood_pressure_pulse_rate',
        'standing_one_min_blood_pressure_diastolic',
        'sitting_blood_pressure_systolic',
        'hips',
        'hand_grip_right',
        'hand_grip_left',
        'bmr',
    ]

    baseline_raw = pd.read_csv(baseline_csv_path)
    base = baseline_raw.drop(columns=unused_columns)

    # Estimated BMR from the retained anthropometric columns.
    base['bmr_eq'] = Mifflin_bmr(
        base['weight'].values,
        base['height'].values,
        base['age'].values,
        base['gender'].values,
    )

    # Align the key name with the activity table before joining.
    base_keyed = base.rename(columns={"RegistrationCode": "participant_id"})
    merged = act_df.merge(base_keyed, on="participant_id", how="inner")

    # Keep only rows for which a BMR could be computed.
    return merged.dropna(subset=["bmr_eq"])

def aggregate_part_day(df):
    """Aggregate activity rows to one row per participant and day.

    Parameters
    ----------
    df : pandas.DataFrame
        Activity rows with the columns ``participant_id``, ``datetime``,
        ``HP_MET``, ``MET_score``, ``bmr_eq``, ``gender``, ``age``, ``bmi``,
        ``weight``, ``height``, ``exercise_type``, ``min_high`` and
        ``min_med``.

    Returns
    -------
    pandas.DataFrame
        One row per (``participant_id``, ``datetime``) with flat column names:
        ``HP_MET_sum``, ``MET_score_mean/std/min/max``,
        ``exercise_type_count``, ``min_high_sum``, ``min_med_sum`` and the
        per-group means of the baseline columns under their plain names
        (``gender``, ``bmi``, ``bmr_eq``, ``age``, ``height``, ``weight``).
        Missing aggregates (e.g. the std of a group with a single row) are
        filled with 0.
    """
    # String aliases always select the pandas implementations (sample std,
    # ddof=1). Passing np.sum/np.mean/np.std/min/max is deprecated because
    # pandas stops aliasing them, which would silently change the std.
    aggregations = {
        'HP_MET': 'sum',
        'MET_score': ['mean', 'std', 'min', 'max'],
        'bmr_eq': 'mean',
        'gender': 'mean',
        'age': 'mean',
        'bmi': 'mean',
        'weight': 'mean',
        'height': 'mean',
        'exercise_type': 'count',
        'min_high': 'sum',
        'min_med': 'sum',
    }

    table = pd.pivot_table(
        df,
        values=list(aggregations),
        index=['participant_id', 'datetime'],
        aggfunc=aggregations,
        fill_value=0,
    ).sort_index()

    part_day = table.reset_index()
    # Flatten the (column, statistic) pairs; the index columns carry an empty
    # statistic and keep their plain name.
    part_day.columns = [
        f"{column}_{stat}" if stat else column
        for column, stat in part_day.columns
    ]
    # Baseline values are constant per participant, so drop the '_mean' suffix.
    part_day = part_day.rename(columns={'gender_mean': 'gender',
                                        'bmi_mean': 'bmi',
                                        'bmr_eq_mean': 'bmr_eq',
                                        'age_mean': 'age',
                                        'height_mean': 'height',
                                        'weight_mean': 'weight'})
    return part_day

def aggregate_part(part_day, min_days, max_days):
    """Aggregate the participant-day table to one row per participant.

    Parameters
    ----------
    part_day : pandas.DataFrame
        Participant-day table with ``participant_id``, ``datetime`` and the
        columns listed in ``aggregations`` below.
    min_days : int
        Exclusive lower bound on the number of logged days.
    max_days : int
        Inclusive upper bound on the number of logged days.

    Returns
    -------
    pandas.DataFrame
        One row per participant with ``datetime_count`` (number of logged
        days), the mean (and, for ``HP_MET_sum``, ``exercise_type_count``,
        ``min_high_sum`` and ``min_med_sum``, also the std) of the daily
        values, restricted to participants with
        ``min_days < datetime_count <= max_days``. Missing aggregates (e.g.
        the std over a single day) are filled with 0.
    """
    # String aliases always select the pandas implementations (sample std,
    # ddof=1). Passing np.mean/np.std is deprecated because pandas stops
    # aliasing them, which would silently change the std.
    aggregations = {
        'datetime': 'count',
        'HP_MET_sum': ['mean', 'std'],
        'MET_score_max': 'mean',
        'MET_score_mean': 'mean',
        'MET_score_min': 'mean',
        'MET_score_std': 'mean',
        'exercise_type_count': ['mean', 'std'],
        'age': 'mean',
        'bmi': 'mean',
        'bmr_eq': 'mean',
        'gender': 'mean',
        'height': 'mean',
        'min_high_sum': ['mean', 'std'],
        'min_med_sum': ['mean', 'std'],
        'weight': 'mean',
    }

    table2 = pd.pivot_table(
        part_day,
        values=list(aggregations),
        index=['participant_id'],
        aggfunc=aggregations,
        fill_value=0,
    ).sort_index()

    part_tot = table2.reset_index()
    # Flatten the (column, statistic) pairs; the index column carries an empty
    # statistic and keeps its plain name.
    part_tot.columns = [
        f"{column}_{stat}" if stat else column
        for column, stat in part_tot.columns
    ]
    # NOTE(review): 'height_mean' is not renamed here, unlike the other
    # baseline columns; kept as is so the output column names do not change.
    part_tot = part_tot.rename(columns={'gender_mean': 'gender',
                                        'bmi_mean': 'bmi',
                                        'bmr_eq_mean': 'bmr_eq',
                                        'age_mean': 'age',
                                        'weight_mean': 'weight'})

    # NOTE(review): the lower bound is strict, so a participant with exactly
    # min_days logged days is excluded - confirm this is intended.
    n_days = part_tot['datetime_count']
    part_tot_clean = part_tot[(n_days > min_days) & (n_days <= max_days)]
    return part_tot_clean

In [3]:
# Run the pipeline: load activities, join baseline data, then aggregate
# per participant-day and per participant.
act_df = load_data_add_features(activity_logging_csv_path)
act_base_df = add_baseline_features(act_df, baseline_csv_path)

# The results get their own names. Binding them to `aggregate_part_day` /
# `aggregate_part` replaced those functions with DataFrames, so running this
# cell a second time raised "TypeError: 'DataFrame' object is not callable".
part_day_df = aggregate_part_day(act_base_df)
part_tot_df = aggregate_part(part_day_df, min_days, max_days)