In [1]:
import pandas as pd

In [None]:
# Logic : All files under app_running follows similar structure.
# Strategy : Reading a sample file and using it to generalize the rest 

Unnamed: 0,null,resp_time,location,number
0,"43.75908069,-72.32885314",2013-03-24 08:39:53,,
1,"43.75908069,-72.32885314",2013-03-24 08:40:55,,
2,4,2013-03-24 08:39:50,,
3,3,2013-03-25 02:16:15,,
4,"43.75885953,-72.32939114",2013-03-24 22:11:35,,


In [123]:
def get_ema_social_df(filepath):
    ema_social_df = pd.read_json(filepath)
    ema_social_df['date'] = ema_social_df.resp_time.dt.date
    ema_social_df = ema_social_df.drop(columns=['null','location','resp_time']).dropna()
    # We assume that multiple entries within a single day occur because the participant updates their 
    # social interaction status multiple times throughout the day. Therefore using the best entry.
    final_df = ema_social_df.groupby('date').agg(
        social_number = ('number','max')
    )
    return final_df

In [None]:
def get_ema_stress_df(filepath):
    ema_stress_df = pd.read_json(filepath)
    ema_stress_df['date'] = ema_stress_df.resp_time.dt.date
    ema_stress_df = ema_stress_df.drop(columns=['null','location','resp_time']).dropna()
    # [1]A little stressed, [2]Definitely stressed, [3]Stressed out, [4]Feeling good, [5]Feeling great,
    values = { 1:2, 2:3, 3:4, 4:1, 5:0 }
    
    ema_stress_df['level'] = ema_stress_df['level'].map(values)
    
    # We assume that multiple entries within a single day occur because the participant updates their 
    # social interaction status multiple times throughout the day. Therefore using the best entry.
    final_df = ema_stress_df.groupby('date').agg(
        stress_number = ('level','max')
    )
    return final_df

In [125]:
def get_ema_sleep_df(filepath):
    df_00 = pd.read_json(filepath)
    df_00['date'] = df_00.resp_time.dt.date
    df_00 = df_00.drop(columns=['null','location','resp_time']).dropna()
    final_df =  df_00.groupby('date').max()
    final_df.columns = 'sleep_'+final_df.columns
    return final_df

In [78]:
def get_ema_exercise_df(filepath):
    df_00 = pd.read_json(filepath)
    df_00['date'] = df_00.resp_time.dt.date
    df_00 = df_00.drop(columns=['location','resp_time','schedule']).dropna()
    final_df = df_00.groupby('date').max()
    final_df.columns = 'exercise_'+final_df.columns
    return final_df

In [None]:
# def get_user_ema_df(user_no):
user = '01'
def get_student_df(student_number:str):
    exercise = get_ema_exercise_df(f'dataset/EMA/response/Exercise/Exercise_u{user}.json')
    sleep = get_ema_sleep_df(f'dataset/EMA/response/Sleep/Sleep_u{user}.json')
    social = get_ema_social_df(f'dataset/EMA/response/Social/Social_u{user}.json')
    stress = get_ema_stress_df(f'dataset/EMA/response/Stress/Stress_u{user}.json')
    student_df = pd.concat([exercise,sleep,social,stress],axis=1)
    return student_df

Combining all functions:

In [6]:
import pandas as pd
import os
import numpy as np

class GetStudentEma:
    
    def __init__(self):
        pass
    
    def get_ema_social_df(self,filepath):
        ema_social_df = pd.read_json(filepath)
        if ema_social_df.shape[0] == 0:
            return pd.DataFrame()
        ema_social_df['date'] = ema_social_df.resp_time.dt.date
        for column in ['null','location','resp_time']:
            if column in ema_social_df.columns:
                ema_social_df = ema_social_df.drop(columns=column)
        ema_social_df = ema_social_df.dropna()
        # We assume that multiple entries within a single day occur because the participant updates their 
        # social interaction status multiple times throughout the day. Therefore using the best entry.
        if 'number' in ema_social_df.columns:
            final_df = ema_social_df.groupby('date').agg(
                social_people_you_contacted_yesterday = ('number','max')
            )
            return final_df
        return pd.DataFrame()
    

    def get_ema_stress_df(self,filepath):
        ema_stress_df = pd.read_json(filepath)
        if ema_stress_df.shape[0] == 0:
            return pd.DataFrame()
        ema_stress_df['date'] = ema_stress_df.resp_time.dt.date
        for column in ['null','location','resp_time']:
            if column in ema_stress_df.columns:
                ema_stress_df = ema_stress_df.drop(columns=column)
        ema_stress_df = ema_stress_df.dropna()
        # We assume that multiple entries within a single day occur because the participant updates their 
        # social interaction status multiple times throughout the day. Therefore using the best entry.
        
        if 'level' not in ema_stress_df.columns:
            return pd.DataFrame()
        
        final_df = ema_stress_df.groupby('date').agg(
            stress_how_stressed_are_you = ('level','max')
        )
        return final_df

    def get_ema_sleep_df(self,filepath):
        df_00 = pd.read_json(filepath)
        if df_00.shape[0] == 0:
            return pd.DataFrame()
        df_00['date'] = df_00.resp_time.dt.date
        for column in ['null','location','resp_time']:
            if column in df_00.columns:
                df_00 = df_00.drop(columns=column)
        df_00 = df_00.dropna()
        final_df =  df_00.groupby('date').max()
        final_df.columns = 'sleep_'+final_df.columns
        return final_df

    def get_ema_exercise_df(self,filepath):
        df_00 = pd.read_json(filepath)
        if df_00.shape[0] == 0:
            return pd.DataFrame()
        df_00['date'] = df_00.resp_time.dt.date
        for column in ['location','resp_time','schedule']:
            if column in df_00.columns:
                df_00 = df_00.drop(columns=column)
        df_00 = df_00.dropna()
        
        final_df = df_00.groupby('date').max()
        final_df.columns = 'exercise_'+final_df.columns
        # final_df = final_df.astype('float')
        return final_df

    # Main Function <-----------------------------
    def get_student_df(self,root_dir,student_number:str):
        exercise = self.get_ema_exercise_df(os.path.join(root_dir,f'Exercise/Exercise_{student_number}.json'))
        sleep = self.get_ema_sleep_df(os.path.join(root_dir,f'Sleep/Sleep_{student_number}.json'))
        social = self.get_ema_social_df(os.path.join(root_dir,f'Social/Social_{student_number}.json'))
        stress = self.get_ema_stress_df(os.path.join(root_dir,f'Stress/Stress_{student_number}.json'))
        student_df = pd.concat([exercise,sleep,social,stress],axis=1)
        student_df['uid'] = student_number
        
        return student_df

In [7]:
# Merging all student datas

import os

root_dir = '../dataset/EMA/response'
exercise_folder = 'Exercise'
# os.path.join('dataset','app_usage')
user_files = os.listdir(path=os.path.join(root_dir,exercise_folder))

uids = [x.split('_')[-1].replace('.json','') for x in user_files]

combined_dfs = []
for uid in uids:
    print('Ingesting User : ',uid)
    
    get_student_ema = GetStudentEma()
    df= get_student_ema.get_student_df(root_dir=root_dir,student_number=uid)
    combined_dfs.append(df)
ema_df = pd.concat(combined_dfs)

# Labeling Missing Values with 'Not exercised' label
ema_df.loc[:,'exercise_exercise'] = ema_df.exercise_exercise.fillna(1)
ema_df.loc[:,'exercise_have'] = ema_df.exercise_have.fillna(2)
ema_df.loc[:,'exercise_walk'] = ema_df.exercise_walk.fillna(1)

Ingesting User :  u27
Ingesting User :  u33
Ingesting User :  u03
Ingesting User :  u22
Ingesting User :  u13
Ingesting User :  u30
Ingesting User :  u20
Ingesting User :  u39
Ingesting User :  u17
Ingesting User :  u43
Ingesting User :  u52
Ingesting User :  u49
Ingesting User :  u04
Ingesting User :  u25
Ingesting User :  u09
Ingesting User :  u12
Ingesting User :  u31
Ingesting User :  u15
Ingesting User :  u36
Ingesting User :  u16
Ingesting User :  u19
Ingesting User :  u32
Ingesting User :  u42
Ingesting User :  u05
Ingesting User :  u18
Ingesting User :  u01
Ingesting User :  u14
Ingesting User :  u47
Ingesting User :  u45
Ingesting User :  u34
Ingesting User :  u08
Ingesting User :  u54
Ingesting User :  u56
Ingesting User :  u00
Ingesting User :  u50
Ingesting User :  u02
Ingesting User :  u41
Ingesting User :  u59
Ingesting User :  u44
Ingesting User :  u24
Ingesting User :  u10
Ingesting User :  u57
Ingesting User :  u46
Ingesting User :  u58
Ingesting User :  u07
Ingesting 

In [8]:
ema_df

Unnamed: 0_level_0,exercise_exercise,exercise_have,exercise_walk,sleep_hour,sleep_rate,sleep_social,social_people_you_contacted_yesterday,stress_how_stressed_are_you,uid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-04-02,1.0,2.0,3.0,,,,,,u27
2013-04-03,1.0,2.0,2.0,5.0,2.0,1.0,3.0,1.0,u27
2013-04-04,1.0,2.0,2.0,7.0,2.0,2.0,,1.0,u27
2013-04-15,1.0,2.0,2.0,,,,2.0,2.0,u27
2013-04-18,1.0,2.0,1.0,7.0,2.0,3.0,3.0,3.0,u27
...,...,...,...,...,...,...,...,...,...
2013-05-16,1.0,2,1.0,,,,,2.0,u51
2013-05-19,1.0,2,1.0,,,,,1.0,u51
2013-05-22,1.0,2,1.0,,,,,2.0,u51
2013-05-23,1.0,2,1.0,,,,,1.0,u51


In [9]:
ema_df.isna().sum()/ema_df.shape[0]*100

exercise_exercise                         0.000000
exercise_have                             0.000000
exercise_walk                             0.000000
sleep_hour                               34.063527
sleep_rate                               34.063527
sleep_social                             34.063527
social_people_you_contacted_yesterday    46.111720
stress_how_stressed_are_you              31.872946
uid                                       0.000000
dtype: float64

In [10]:
ex_miss = ema_df.groupby('uid').agg(
    sleep_missing = ('sleep_hour',lambda x: x.isna().sum()/x.shape[0]*100)
)
ex_miss.sort_values(by = 'sleep_missing',ascending = False )

Unnamed: 0_level_0,sleep_missing
uid,Unnamed: 1_level_1
u47,74.193548
u45,65.384615
u50,64.285714
u42,62.5
u31,57.894737
u22,56.097561
u51,50.0
u43,50.0
u54,50.0
u35,46.341463


In [11]:
# Labeling Missing Values with 'Not exercised' label
ema_df.loc[:,'exercise_exercise'] = ema_df.exercise_exercise.fillna(1)
ema_df.loc[:,'exercise_have'] = ema_df.exercise_have.fillna(2)
ema_df.loc[:,'exercise_walk'] = ema_df.exercise_walk.fillna(1)
