In [1]:
import pandas as pd
import numpy as np
import os
import re

## Ingesting App Running Data


In [2]:
def get_all_students_app_usage_data(root_dir='./'):
    def get_app_running_df(filepath):
        cols_to_drop = [
            'id',
            'RUNNING_TASKS_id',
            'RUNNING_TASKS_baseActivity_mClass',
            'RUNNING_TASKS_baseActivity_mPackage',
            'RUNNING_TASKS_numActivities',
            'RUNNING_TASKS_topActivity_mClass'
        ]
        FINAL_APPS = [
            'com.google.android.gm',
            'com.google.android.email',
            'com.android.chrome',
            'com.google.android.youtube',
            'com.spotify.mobile.android.ui',
            'com.google.android.music',
            'com.snapchat.android',
            'com.google.android.talk',
            'com.skype.raider',
            'com.google.android.calendar',
            'com.dropbox.android',
            'com.android.contacts',
            'com.google.android.gallery3d',
            'com.google.android.googlequicksearchbox',
            'com.politico.android',
            'com.guardian',
            'com.google.earth',
            'com.nprpodcastplayer.app',
            'com.piazza.android',
            'com.MoodPanda',
            'org.bewellapp'
        ]
        
        app_running_df = pd.read_csv(filepath)
        cleaned_df = app_running_df.drop(columns=cols_to_drop)
        
        cleaned_df = cleaned_df[cleaned_df['RUNNING_TASKS_topActivity_mPackage'].isin(FINAL_APPS)]
        
        cleaned_df.loc[:,'datetime'] = pd.to_datetime(cleaned_df['timestamp'],unit='s')
        cleaned_df.loc[:,'date'] = cleaned_df['datetime'].dt.date
        
        cleaned_df = cleaned_df.sort_values(by='timestamp')
        cleaned_df['app_switch'] = (
            cleaned_df['RUNNING_TASKS_topActivity_mPackage'] != cleaned_df['RUNNING_TASKS_topActivity_mPackage'].shift(1)
        ).astype(int)
        
        cleaned_df = cleaned_df[cleaned_df['app_switch']!=0]
        
        final_app_df = cleaned_df.groupby(by='date').agg(
        total_app_switch= ('app_switch','sum'),
        total_num_tasks = ('RUNNING_TASKS_numRunning','sum'),
        most_used_app = ('RUNNING_TASKS_topActivity_mPackage',
                        lambda x: x.value_counts().idxmax()
                        )
        )
        final_app_df['uid'] = filepath.split('_')[-1].replace('.csv','')
        
        return final_app_df

    # Merging all student datas
    import os
    data_dir = os.path.join(root_dir,'dataset','app_usage')
    
    # os.path.join('dataset','app_usage')
    user_files = os.listdir(path=data_dir)
    combined_dfs = []
    for file in user_files:
        print('Reading : ',file)
        file_path = os.path.join(data_dir,file)
        df= get_app_running_df(file_path)
        combined_dfs.append(df)
    app_running_df = pd.concat(combined_dfs)
    return app_running_df

## Ingesting Call Log data


In [3]:
def get_all_students_call_log_data(root_dir='./'):
    def get_call_log_df(filename):
        
        df_00 = pd.read_csv(filename)
        
        df_00.loc[:,'datetime'] = pd.to_datetime(df_00['timestamp'],unit='s')
        df_00.loc[:,'date'] = df_00['datetime'].dt.date
        
        call_count_df = df_00.groupby('date').agg(
            total_calls = ('timestamp','count'),
        )
        
        if 'CALLS_duration' in df_00.columns:
            
            cols_to_drop = [
                'id',
                'CALLS_name',
                'CALLS_number',
                'CALLS_numberlabel',
                'CALLS_numbertype'
            ]
            cleaned_df = df_00.drop(columns=cols_to_drop)
            
            cleaned_df = df_00[~df_00.CALLS__id.isna()]
            
            call_duration_df = cleaned_df.groupby('date').agg(
                total_duration_s = ('CALLS_duration','sum'),
                    longest_call_s = ('CALLS_duration','max'),
                    average_call_time_s = ('CALLS_duration','mean'),
                    frequent_call_type = ('CALLS_type',
                                        lambda x: x.value_counts().idxmax() # incoming / outgoing / missed
                                        )
            )
            final_df = pd.merge(call_count_df,call_duration_df,on='date',how='left')
            final_df['uid'] = filename.split('_')[-1].replace('.csv','')
            
            return final_df
        #else
        
        call_count_df['uid'] = filename.split('_')[-1].replace('.csv','')
        return call_count_df

    # Merging all student datas
    data_dir = os.path.join(root_dir,'dataset','call_log')

    # os.path.join('dataset','app_usage')
    user_files = os.listdir(path=data_dir)
    combined_dfs = []
    for file in user_files:
        print('Ingesting : ',file)
        file_path = os.path.join(data_dir,file)
        df= get_call_log_df(file_path)
        combined_dfs.append(df)
    call_log_df = pd.concat(combined_dfs)

    # Percentage of data missing
    call_log_missing = call_log_df.isna().sum()/call_log_df.shape[0]*100
    call_log_missing

    # removing columns where more than 50% of the data is missing
    cols_to_drop = call_log_missing[call_log_missing>50].index
    call_log_df = call_log_df.drop(columns=cols_to_drop)
    return call_log_df

In [4]:
## Ingesting Dining Data

In [5]:
def get_all_students_dinning_data(root_dir='./'):
    def get_dinning_df(filename):
        df_00 = pd.read_csv(
            filename,
            names=['datetime','restaurent','meal'],
            usecols=['datetime','meal'],
            parse_dates=['datetime']
        )
        
        df_00['date'] = df_00.datetime.dt.date
        
        final_df = df_00.groupby(by='date').agg(
            skipped_meal = ('meal', lambda x: not({'Breakfast', 'Lunch', 'Supper'}  <=  set(x.unique()))),
            total_meals = ('meal','count')
        )
        final_df['skipped_meal'] = final_df['skipped_meal'].astype('int')
        final_df['uid'] = filename.split('/')[-1].replace('.txt','')
        
        
        return final_df

    # Merging all student datas
    data_dir = os.path.join(root_dir,'dataset','dinning')
    # os.path.join('dataset','app_usage')
    user_files = os.listdir(path=data_dir)
    combined_dfs = []
    for file in user_files:
        print('Ingesting : ',file)
        file_path = os.path.join(data_dir,file)
        df= get_dinning_df(file_path)
        combined_dfs.append(df)
    dinning_df = pd.concat(combined_dfs)
    return dinning_df

## Ingesting EMA Data

In [None]:

def get_all_students_ema_data(root_dir='./'):
    class GetStudentEma:
        
        def __init__(self):
            pass
        
        def get_ema_social_df(self,filepath):
            ema_social_df = pd.read_json(filepath)
            if ema_social_df.shape[0] == 0:
                return pd.DataFrame()
            ema_social_df['date'] = ema_social_df.resp_time.dt.date
            for column in ['null','location','resp_time']:
                if column in ema_social_df.columns:
                    ema_social_df = ema_social_df.drop(columns=column)
            ema_social_df = ema_social_df.dropna()
            # We assume that multiple entries within a single day occur because the participant updates their 
            # social interaction status multiple times throughout the day. Therefore using the best entry.
            if 'number' in ema_social_df.columns:
                final_df = ema_social_df.groupby('date').agg(
                    social_people_you_contacted_yesterday = ('number','max')
                )
                return final_df
            return pd.DataFrame()
        

        def get_ema_stress_df(self,filepath):
            ema_stress_df = pd.read_json(filepath)
            if ema_stress_df.shape[0] == 0:
                return pd.DataFrame()
            ema_stress_df['date'] = ema_stress_df.resp_time.dt.date
            for column in ['null','location','resp_time']:
                if column in ema_stress_df.columns:
                    ema_stress_df = ema_stress_df.drop(columns=column)
            ema_stress_df = ema_stress_df.dropna()
            # We assume that multiple entries within a single day occur because the participant updates their 
            # social interaction status multiple times throughout the day. Therefore using the best entry.
            
            if 'level' not in ema_stress_df.columns:
                return pd.DataFrame()
            
            # [1]A little stressed, [2]Definitely stressed, [3]Stressed out, [4]Feeling good, [5]Feeling great,
            values = { 1:2, 2:3, 3:4, 4:1, 5:0 }
            
            ema_stress_df['level'] = ema_stress_df['level'].map(values)
            
            final_df = ema_stress_df.groupby('date').agg(
                stress_level = ('level','max')
            )
            return final_df

        def get_ema_sleep_df(self,filepath):
            df_00 = pd.read_json(filepath)
            if df_00.shape[0] == 0:
                return pd.DataFrame()
            df_00['date'] = df_00.resp_time.dt.date
            for column in ['null','location','resp_time']:
                if column in df_00.columns:
                    df_00 = df_00.drop(columns=column)
            df_00 = df_00.dropna()
            final_df =  df_00.groupby('date').max()
            final_df.columns = 'sleep_'+final_df.columns
            return final_df

        def get_ema_exercise_df(self,filepath):
            df_00 = pd.read_json(filepath)
            if df_00.shape[0] == 0:
                return pd.DataFrame()
            df_00['date'] = df_00.resp_time.dt.date
            for column in ['location','resp_time','schedule']:
                if column in df_00.columns:
                    df_00 = df_00.drop(columns=column)
            df_00 = df_00.dropna()
            
            final_df = df_00.groupby('date').max()
            final_df['exercise'] = final_df['exercise'] - 1
            final_df['have'] = np.absolute(final_df['have'] - 2)
            final_df['walk'] = final_df['walk'] - 1
            
            
            final_df.columns = 'exercise_'+final_df.columns
            # final_df = final_df.astype('float')
            return final_df

        # Main Function <-----------------------------
        def get_student_df(self,data_dir,student_number:str):
            exercise = self.get_ema_exercise_df(os.path.join(data_dir,f'Exercise/Exercise_{student_number}.json'))
            sleep = self.get_ema_sleep_df(os.path.join(data_dir,f'Sleep/Sleep_{student_number}.json'))
            social = self.get_ema_social_df(os.path.join(data_dir,f'Social/Social_{student_number}.json'))
            stress = self.get_ema_stress_df(os.path.join(data_dir,f'Stress/Stress_{student_number}.json'))
            student_df = pd.concat([exercise,sleep,social,stress],axis=1)
            student_df['uid'] = student_number
            
            return student_df
        
        
    # Merging all student datas

    data_dir = os.path.join(root_dir,'dataset','EMA','response')
    exercise_folder = 'Exercise'
    # os.path.join('dataset','app_usage')
    user_files = os.listdir(path=os.path.join(data_dir,exercise_folder))

    uids = [x.split('_')[-1].replace('.json','') for x in user_files]

    combined_dfs = []
    for uid in uids:
        print('Ingesting User : ',uid)
        
        get_student_ema = GetStudentEma()
        df= get_student_ema.get_student_df(data_dir=data_dir,student_number=uid)
        combined_dfs.append(df)
    ema_df = pd.concat(combined_dfs)

    # Labeling Missing Values with 'Not exercised' label
    ema_df.loc[:,'exercise_exercise'] = ema_df.exercise_exercise.fillna(1)
    ema_df.loc[:,'exercise_have'] = ema_df.exercise_have.fillna(2)
    ema_df.loc[:,'exercise_walk'] = ema_df.exercise_walk.fillna(1)
    return ema_df

## Ingesting Sensing Data

In [21]:
def get_all_students_sensing_data(root_dir='./'):
    def get_sensing_activity_data(filepath):
        df_00 = pd.read_csv(filepath)
        df_00['date-time'] = pd.to_datetime(df_00['timestamp'],unit='s')
        df_00['date'] = df_00['date-time'].dt.date
        df_00 = df_00.sort_values(by='date-time')
        filtered_df = df_00[df_00[' activity inference'] != df_00[' activity inference'].shift(1)].copy()
        filtered_df['duration_in_s'] = filtered_df['date-time'].diff().dt.total_seconds().shift(-1)
        filtered_df['duration_in_s'] = filtered_df['duration_in_s'].fillna(0)
        filtered_df['motion_duration_s'] = filtered_df['duration_in_s'] * (filtered_df[' activity inference'].isin([1,2]).astype('int'))

        final_df = filtered_df.groupby('date').agg(
            total_duration_of_motion_sensing = ('motion_duration_s','sum'),
        )
        return final_df

    def get_sensing_conversation_df(filepath):
        df_00 = pd.read_csv(filepath)
        df_00.columns = df_00.columns.str.strip()
        df_00['start_timestamp'] = pd.to_datetime(df_00['start_timestamp'],unit='s')
        df_00['end_timestamp'] = pd.to_datetime(df_00['end_timestamp'],unit='s')
        df_00['date'] = df_00['start_timestamp'].dt.date
        df_00['conversation_duration'] = (df_00['end_timestamp'] - df_00['start_timestamp']).dt.total_seconds()
        final_df = df_00.groupby('date').agg(
            total_daily_conversation = ('conversation_duration','sum')
        )
        return final_df

    def get_student_df(data_dir, student_number:str):
        # activity = get_sensing_activity_data(f'dataset/sensing/activity/activity_u{student_number}.csv')
        activity = get_sensing_activity_data(os.path.join(data_dir,f'activity/activity_{student_number}.csv'))
        conversation = get_sensing_conversation_df(os.path.join(data_dir,f'conversation/conversation_{student_number}.csv'))
        student_df = pd.concat([activity,conversation],axis=1)
        student_df['uid'] = student_number
        return student_df

    # getting all students data
    data_dir = os.path.join(root_dir,'dataset','sensing')
    exercise_folder = 'activity'
    # os.path.join('dataset','app_usage')
    user_files = os.listdir(path=os.path.join(data_dir,exercise_folder))

    uids = [x.split('_')[-1].replace('.csv','') for x in user_files]

    combined_dfs = []
    for uid in uids:
        print('Ingesting User : ',uid)
        
        df= get_student_df(data_dir=data_dir,student_number=uid)
        combined_dfs.append(df)
    ema_df = pd.concat(combined_dfs)
    return ema_df

## Ingesting Survey Data


In [22]:
def get_all_students_survey(root_dir='./'):
    def get_survey_big_five(data_dir,filename):
        df = pd.read_csv(os.path.join(data_dir,filename))
        
        values = {
            'Disagree Strongly':1,
            'Disagree a little':2,   
            'Neither agree nor disagree':3, 
            'Agree a little':4, 
            'Agree strongly':5, 
        }

        df.columns = df.columns.str.replace(r'.*?\d+\.\s*','',regex=True)
        df = df[df['type']=='post']
        relevant_columns = [
            'uid',
            # Usefull Surveys
            'Is talkative',
            'Tends to find fault with others',
            'Is depressed, blue',
            'Is helpful and unselfish with others',
            'Is relaxed, handles stress well.',
            'Worries a lot',
            'Tends to be quiet',
            'Gets nervous easily',
            'Is easily distracted'
        ]
        
        relevant_data = df[relevant_columns]
        relevant_data = relevant_data.drop_duplicates(subset=['uid'])
        for question in relevant_columns[1:]:
            relevant_data[question] = relevant_data[question].map(values)
        return relevant_data

    def get_survey_flourishing_scale(data_dir,filename):
        df = pd.read_csv(os.path.join(data_dir,filename))
        df = df[df['type']=='post']
        
        relevant_columns = [
            'uid',
            'I lead a purposeful and meaningful life',
            'I am a good person and live a good life',
            'I am optimistic about my future', 
            'People respect me'
        ]
        relevant_data = df[relevant_columns]
        relevant_data = relevant_data.drop_duplicates(subset=['uid'])
        return relevant_data
        
        
    def get_survey_loneliness_scale(data_dir,filename):
        
        df = pd.read_csv(os.path.join(data_dir,filename))

        df.columns = df.columns.str.replace(r'.*?\d+\.\s',"",regex=True)

        cols_to_drop = [
            'type',
            'I am an outgoing person',
            'I feel in tune with the people around me',
            'There are people I can talk to',
            'There are people I can turn to',
            'There are people who really understand me',
            'I can find companionship when I want it',
            'There are people I feel close to',
            'I have a lot in common with the people around me',
            'I feel part of a group of friends',
            'I do not feel alone'
        ]
        values = {
            'Never':0, 
            'Rarely':1, 
            'Sometimes':2, 
            'Often':3,
        }
        # only taking the post score as we are only intrested in the loneliness score related to the recorded behavior
        df = df[df['type']=='post']

        df = df.drop(columns = cols_to_drop)
        df = df.drop_duplicates(subset=['uid'])

        question_cols = df.columns[1:]  # skip uid
        df[question_cols] = df[question_cols].replace(values)

        loneliness_questions = df.columns[1:]
        loneliness_score = df[loneliness_questions].sum(axis=1)

        loneliness_df = df[['uid']].copy()
        loneliness_df['loneliness_score'] = loneliness_score
        
        return loneliness_df

    def get_survey_perceived_stress_scale(data_dir,filename):
        df = pd.read_csv(os.path.join(data_dir,filename))
        df = df[df['type']=='post']
        df.columns = df.columns.str.replace(r'.*?have you\s','',regex=True).str.replace('?','',)
        relevent_cols = [
            'uid',
            'felt nervous and "stressed"',
            'been able to control irritations in your life',
            'been angered because of things that were outside of your control',
            'felt difficulties were piling up so high that you could not overcome them'
        ]
        values = {
            'Never':0, 
            'Almost never':1, 
            'Sometime':2, 
            'Fairly often':3,
            'Very often':4
        }
        df = df[relevent_cols]
        df.loc[:,relevent_cols[1:]] = df[relevent_cols[1:]].replace(values)
        df = df.drop_duplicates(subset=['uid'])
        df['survey_stress_score'] = df[relevent_cols[1:]].sum(axis=1) 
        df = df.drop(columns = relevent_cols[1:])
        return df

    def get_survey_phq(data_dir,filename):
        df = pd.read_csv(os.path.join(data_dir,filename))
        df = df[df['type']=='post']
        df.columns = df.columns.str.replace('.','')
        relevent_cols = [
            'uid',
            'Feeling down, depressed, hopeless',
            'Feeling tired or having little energy',
        ]
        values = {
            'Not at all':0, 
            'More than half the days':1, 
            'Several days':2, 
            'Nearly every day':3,
        }
        df = df[relevent_cols]
        df.loc[:,relevent_cols[1:]] = df[relevent_cols[1:]].replace(values)
        df = df.drop_duplicates(subset=['uid'])
        return df


    def get_survey_psqi(data_dir,filename):
        df = pd.read_csv(os.path.join(data_dir,filename))
        df = df[df['type']=='post']
        cols_rename = {
            'uid':'uid',
            'During the past month, what time have you usually gone to bed at night? ': 'bed_time',
            'When have you usually gotten up in the morning?':'wake_up_time',
            'During the past month, how many hours of actual sleep did you get at night? (This may be different than the number of hours you spent in bed.)':'sleep_time',
            'a. Cannot get to sleep within 30 minutes':'sleep_within_30min',
            'During the past month, how often have you taken medicine (prescribed or over the counter) to help you sleep?':'taken_sleeping_medicine',
            'During the past month, how would you rate your sleep quality overall?':'sleep_quality',
        }
        relevant_columns = list(cols_rename.keys())
        df = df[relevant_columns]
        df.columns = df.columns.map(lambda x: cols_rename[x])

        # Bed Time Clean up
        import numpy as np
        import re

        def clean_bed_time(x):
            # Handle NaN
            if pd.isna(x):
                return np.nan
            
            # Normalize text
            s = str(x).lower().strip()
            
            # 1. Handle Special Manual Cases
            if 'usually dawn' in s:
                return 5.0
            if 'midnight' in s:
                return 0.0
            if '11:30' in s and 'pm' in s:
                return 23.5  # 11.5 + 12
                
            # 2. Handle Ranges (e.g., "12-1AM", "3-4am", "2-5 am")
            # We take the average of the two numbers
            range_match = re.search(r'(\d+)\s*-\s*(\d+)', s)
            if range_match:
                start = int(range_match.group(1))
                end = int(range_match.group(2))
                
                # Convert 12 to 0 for calculation (Midnight is 0)
                if start == 12: start = 0
                if end == 12: end = 0
                
                return (start + end) / 2.0

            # 3. Handle Standard Times (e.g., "5am", "2:30", "12:00")
            # Finds the first occurrence of "Number" or "Number:Number"
            match = re.search(r'(\d{1,2})(?::(\d{2}))?', s)
            if match:
                hour = int(match.group(1))
                # Get minutes if they exist, else 0
                minute = int(match.group(2)) if match.group(2) else 0
                
                # Convert 12am/12 to 0
                if hour == 12:
                    hour = 0
                    
                return hour + (minute / 60.0)
            
            return np.nan

        # Apply the function to your column
        df['bed_time'] = df['bed_time'].apply(clean_bed_time)
        df.loc[df['bed_time']==23.5,'bed_time'] = -1

        # wake up time cleanup
        s = (
            df['wake_up_time']
            .astype(str)
            .str.lower()
            .str.strip()
        )
        special_map = {
            "usually i don't get up, i'm awake from the night before. but often i go to sleep at 6 and wake up at 8": 8.0,

            '830': 8.5,
            '8:30': 8.5,
            '8:00': 8.0,
            '10': 10.0,

            '9"00 am': 9.0
        }

        wake_val = s.rename(special_map)

        def regex_wakeup_parser(x):
            if isinstance(x, (int, float)):
                return x

            # range not expected → skip

            m = re.search(r'(\d{1,2})(?::(\d{2}))?\s*(am|pm)?', x)
            if not m:
                return np.nan

            h, mnt, ap = m.groups()
            h = int(h)
            mnt = int(mnt or 0)

            if ap == 'pm' and h != 12:
                h += 12
            if ap == 'am' and h == 12:
                h = 0

            return h + mnt / 60

        df['wake_up_time'] = wake_val.map(regex_wakeup_parser)
        df.loc[df['wake_up_time']==83,'wake_up_time'] = 8.5

        # Sleep time
        def clean_sleep_time(val):
            if pd.isna(val):
                return np.nan
            
            val_str = str(val).strip()
            
            # Specific mappings provided
            if val_str == "probably 2hrs per night, 4 if I'm lucky":
                return 3.0
            if val_str == '7-Jun':
                return 7.5
            if val_str.lower() == '8-mar': 
                return 6.0
            if val_str.lower() == '4-mar':
                return 4.0
            
            # Handle time format like 6:30
            if ':' in val_str:
                parts = re.findall(r'(\d+):(\d+)', val_str)
                if parts:
                    h, m = parts[0]
                    return float(h) + float(m)/60
                    
            # Extract numbers for general cases
            nums = re.findall(r"(\d+(?:\.\d+)?)", val_str)
            nums = [float(n) for n in nums]
            
            if not nums:
                return np.nan
                
            # Handle single number
            if len(nums) == 1:
                n = nums[0]
                # Assume minutes if > 24 (like 420 -> 7 hours)
                if n > 24:
                    return n / 60
                return n
                
            # Handle ranges (take average, e.g., '6 to 7' -> 6.5)
            if len(nums) >= 2:
                return np.mean(nums)
                
            return np.nan

        df['sleep_time'] = df['sleep_time'].apply(clean_sleep_time)

        # clean sleep labels
        def clean_sleep_labels(val):
            values = {
                'nan':0,
                'Not during the past month':0, 
                'Less than once week':1, 
                'Once or a twice week':2, 
                'Three or a more times week':3,
            }
            return values[val]
        df['sleep_within_30min'] = df['sleep_within_30min'].astype('str').apply(clean_sleep_labels)
        df['taken_sleeping_medicine'] = df['taken_sleeping_medicine'].astype('str').apply(clean_sleep_labels)

        def clean_sleep_quality_labels(val):
            values = {
                'nan':0,
                'Very bad':0, 
                'Fairly bad':1, 
                'Fairly good':2, 
                'Very good':3,
            }
            return values[val]
        df['sleep_quality'] = df['sleep_quality'].astype('str').apply(clean_sleep_quality_labels)
        df[df.columns[1:]] = df[df.columns[1:]].astype('float')
        return df

    # combining everything
    data_dir = os.path.join(root_dir,'dataset','survey')
    big_five = get_survey_big_five(data_dir,'BigFive.csv' )
    flourishing = get_survey_flourishing_scale(data_dir, 'FlourishingScale.csv')
    loneliness = get_survey_loneliness_scale(data_dir, 'LonelinessScale.csv')
    stress = get_survey_perceived_stress_scale(data_dir, 'PerceivedStressScale.csv')
    phq = get_survey_phq(data_dir, 'PHQ-9.csv')
    psqi = get_survey_psqi(data_dir, 'psqi.csv')

    dfs = [big_five,flourishing,loneliness,stress,phq,psqi]
    dfs = [df.set_index('uid') for df in dfs]

    final_df = pd.concat(dfs,axis=1,join='outer')
    return final_df

## Combining all data sources together

In [23]:
def get_students_data_by_date(dataset_dir='./'):
    # Grouped by Data
    print('Ingesting App Usage Data ...')
    app_usage = get_all_students_app_usage_data(dataset_dir)
    print('Ingesting Call Log Data ...')
    call_log = get_all_students_call_log_data(dataset_dir)
    print('Ingesting Dinning Data ...')
    dinning = get_all_students_dinning_data(dataset_dir)
    print('Ingesting EMA Data ...')
    ema = get_all_students_ema_data(dataset_dir)
    print('Ingesting Sensing Data ...')
    sensing = get_all_students_sensing_data(dataset_dir)

    all_data = [app_usage, call_log, dinning, ema, sensing]
    preffix = ['app_usage_','call_log_','dinning_','ema_','sensing_']
    for i in range(len(all_data)):
        df = all_data[i]
        all_data[i] = df.reset_index().set_index(['date','uid']).add_prefix(preffix[i])
    combined_df = pd.concat(
        all_data,
        axis=1,
        join='outer'
    )
    return combined_df

In [24]:
combined_df = get_students_data_by_date(dataset_dir='../')

Ingesting App Usage Data ...
Reading :  running_app_u51.csv


Reading :  running_app_u30.csv
Reading :  running_app_u32.csv
Reading :  running_app_u09.csv
Reading :  running_app_u52.csv
Reading :  running_app_u04.csv
Reading :  running_app_u05.csv
Reading :  running_app_u13.csv
Reading :  running_app_u15.csv
Reading :  running_app_u42.csv
Reading :  running_app_u12.csv
Reading :  running_app_u36.csv
Reading :  running_app_u34.csv
Reading :  running_app_u00.csv
Reading :  running_app_u49.csv
Reading :  running_app_u14.csv
Reading :  running_app_u58.csv
Reading :  running_app_u27.csv
Reading :  running_app_u02.csv
Reading :  running_app_u31.csv
Reading :  running_app_u45.csv
Reading :  running_app_u35.csv
Reading :  running_app_u53.csv
Reading :  running_app_u50.csv
Reading :  running_app_u16.csv
Reading :  running_app_u59.csv
Reading :  running_app_u43.csv
Reading :  running_app_u18.csv
Reading :  running_app_u20.csv
Reading :  running_app_u57.csv
Reading :  running_app_u39.csv
Reading :  running_app_u10.csv
Reading :  running_app_u25.csv
Reading 

In [25]:
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,app_usage_total_app_switch,app_usage_total_num_tasks,app_usage_most_used_app,call_log_total_calls,dinning_skipped_meal,dinning_total_meals,ema_exercise_exercise,ema_exercise_have,ema_exercise_walk,ema_sleep_hour,ema_sleep_rate,ema_sleep_social,ema_social_people_you_contacted_yesterday,ema_stress_level,sensing_total_duration_of_motion_sensing,sensing_total_daily_conversation
date,uid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-03-24,u51,97.0,97.0,org.bewellapp,19.0,,,,,,,,,,,,
2013-03-25,u51,136.0,136.0,com.google.android.gm,24.0,,,,,,,,,,,,
2013-03-26,u51,115.0,115.0,com.google.android.gm,24.0,,,,,,,,,,,,
2013-03-27,u51,82.0,82.0,org.bewellapp,68.0,,,,,,,,,,,193.0,3782.0
2013-03-28,u51,112.0,65.0,com.android.chrome,22.0,,,1.0,2,1.0,7.0,2.0,1.0,,,7951.0,14356.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-21,u32,,,,,,,,,,,,,,,4886.0,29161.0
2013-05-23,u32,,,,,,,,,,,,,,,10869.0,36290.0
2013-05-26,u32,,,,,,,,,,,,,,,14729.0,43011.0
2013-05-31,u32,,,,,,,,,,,,,,,4122.0,


In [26]:
# Grouped by UID
survey = get_all_students_survey('../').add_prefix('survey_')
    
full_df = combined_df.reset_index()

uids = full_df.uid.unique()
for uid in uids:
    if uid in survey.index:
        full_df.loc[full_df['uid']== uid ,survey.columns] = survey.loc[uid].values

  df[question_cols] = df[question_cols].replace(values)
  df.loc[:,relevent_cols[1:]] = df[relevent_cols[1:]].replace(values)
  df.loc[:,relevent_cols[1:]] = df[relevent_cols[1:]].replace(values)


In [27]:
full_df

Unnamed: 0,date,uid,app_usage_total_app_switch,app_usage_total_num_tasks,app_usage_most_used_app,call_log_total_calls,dinning_skipped_meal,dinning_total_meals,ema_exercise_exercise,ema_exercise_have,...,survey_loneliness_score,survey_survey_stress_score,"survey_Feeling down, depressed, hopeless",survey_Feeling tired or having little energy,survey_bed_time,survey_wake_up_time,survey_sleep_time,survey_sleep_within_30min,survey_taken_sleeping_medicine,survey_sleep_quality
0,2013-03-24,u51,97.0,97.0,org.bewellapp,19.0,,,,,...,7.0,8.0,0.0,0.0,2.0,9.0,6.5,0.0,0.0,2.0
1,2013-03-25,u51,136.0,136.0,com.google.android.gm,24.0,,,,,...,7.0,8.0,0.0,0.0,2.0,9.0,6.5,0.0,0.0,2.0
2,2013-03-26,u51,115.0,115.0,com.google.android.gm,24.0,,,,,...,7.0,8.0,0.0,0.0,2.0,9.0,6.5,0.0,0.0,2.0
3,2013-03-27,u51,82.0,82.0,org.bewellapp,68.0,,,,,...,7.0,8.0,0.0,0.0,2.0,9.0,6.5,0.0,0.0,2.0
4,2013-03-28,u51,112.0,65.0,com.android.chrome,22.0,,,1.0,2,...,7.0,8.0,0.0,0.0,2.0,9.0,6.5,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4689,2013-05-21,u32,,,,,,,,,...,8.0,6.0,0.0,2.0,0.0,9.0,7.0,3.0,0.0,2.0
4690,2013-05-23,u32,,,,,,,,,...,8.0,6.0,0.0,2.0,0.0,9.0,7.0,3.0,0.0,2.0
4691,2013-05-26,u32,,,,,,,,,...,8.0,6.0,0.0,2.0,0.0,9.0,7.0,3.0,0.0,2.0
4692,2013-05-31,u32,,,,,,,,,...,8.0,6.0,0.0,2.0,0.0,9.0,7.0,3.0,0.0,2.0


In [28]:
(full_df.isna().sum()/full_df.shape[0]*100).sort_values(ascending=False)

ema_social_people_you_contacted_yesterday         79.037069
ema_sleep_hour                                    74.350234
ema_sleep_social                                  74.350234
ema_sleep_rate                                    74.350234
ema_stress_level                                  73.498083
ema_exercise_walk                                 61.099276
ema_exercise_have                                 61.099276
ema_exercise_exercise                             61.099276
app_usage_most_used_app                           48.743076
app_usage_total_app_switch                        48.743076
app_usage_total_num_tasks                         48.743076
sensing_total_daily_conversation                  41.798040
sensing_total_duration_of_motion_sensing          40.754154
dinning_total_meals                               35.236472
dinning_skipped_meal                              35.236472
call_log_total_calls                              34.512143
survey_loneliness_score                 

In [29]:
full_df.to_csv('students_data_by_date.csv',index=False)