In [1]:
import numpy as np
import pandas as pd
import json
import gzip
import os
import datetime
from dateutil import parser
from tqdm import tqdm

In [7]:
## Input Data Path
path = "/Users/farhan/Desktop/DNL/Budding_Scholar_22-23/Data/"

In [8]:
#Create a DataFrame object
df = pd.DataFrame(
    columns = [
        'ParticipantIdentifier', 
        'datetime',
        'trial_date',
        'time',
        'duration', 
        'total_unlocks', 
        'total_unlock_duration',
        # device usage
        'app_usage_books',
        'app_usage_business',
        'app_usage_catalogs',
        'app_usage_developer_tools',
        'app_usage_education',
        'app_usage_entertainment',
        'app_usage_finance',
        'app_usage_food_and_drink',
        'app_usage_games',
        'app_usage_graphics_and_design',
        'app_usage_health_and_fitness',
        'app_usage_kids',
        'app_usage_lifestyle',
        'app_usage_medical',
        'app_usage_miscellaneous',
        'app_usage_music',
        'app_usage_navigation',
        'app_usage_news',
        'app_usage_newsstand',
        'app_usage_photo_and_video',
        'app_usage_productivity',
        'app_usage_reference',
        'app_usage_shopping',
        'app_usage_social_networking',
        'app_usage_sports',
        'app_usage_stickers',
        'app_usage_travel',
        'app_usage_utilities',
        'app_usage_weather',
        'web_usage_books',
        'web_usage_business',
        'web_usage_catalogs',
        'web_usage_developer_tools',
        'web_usage_education',
        'web_usage_entertainment',
        'web_usage_finance',
        'web_usage_food_and_drink',
        'web_usage_games',
        'web_usage_graphics_and_design',
        'web_usage_health_and_fitness',
        'web_usage_kids',
        'web_usage_lifestyle',
        'web_usage_medical',
        'web_usage_miscellaneous',
        'web_usage_music',
        'web_usage_navigation',
        'web_usage_news',
        'web_usage_newsstand',
        'web_usage_photo_and_video',
        'web_usage_productivity',
        'web_usage_reference',
        'web_usage_shopping',
        'web_usage_social_networking',
        'web_usage_sports',
        'web_usage_stickers',
        'web_usage_travel',
        'web_usage_utilities',
        'web_usage_weather'
    ]
)

In [10]:
days = [i for i in os.listdir(path) if not i.startswith('.')]
for day in tqdm(days):
    files = os.listdir(path + day)
    sensors = [i for i in files if i.startswith('sensor')]
    for sensor in sensors:
        # select for sensor
        if sensor == 'sensorkit-device-usage':
            participants = [i for i in os.listdir(path+day+ '/' + sensor + '/iPhone') if not i.startswith('.')]
            for participant in participants: 
                ParticipantIdentifier = participant
                devices = [i for i in os.listdir(path+day+ '/' + sensor+ '/iPhone/' + participant) if not i.startswith('.')]
                for device in devices:
                    data_files = os.listdir(path+day + '/' + sensor+ '/iPhone/' + participant + '/' + device)
                    for subfile in data_files:
                        filepath = path+day+ '/' + sensor + '/iPhone/' + participant + '/' + device + '/' + subfile
                        if filepath.endswith('.gz'):
                            try:
                                with gzip.open(filepath, 'r') as fin:
                                    data = json.loads(fin.read().decode('utf-8'))
                                    
                                    rows = df.shape[0]
                                    for i in range(len(data['samples'])):
                                        # dynamically creating df: https://stackoverflow.com/questions/48030350/dynamically-add-rows-to-dataframe

                                        df.loc[rows + i, 'ParticipantIdentifier'] = ParticipantIdentifier 
                                        dt = parser.parse(data['samples'][i]['timestamp']) # str -> datetime of sample
                                        df.loc[rows + i, 'datetime'] = dt
                                        df.loc[rows + i, 'trial_date'] = (dt + datetime.timedelta(hours = -4)).date() # trial day associated with sample (4am is when the day flips)
                                        df.loc[rows + i, 'time'] = dt.time()
                                        df.loc[rows + i, 'duration'] = data['samples'][i]['sample']['duration']
                                        df.loc[rows + i, 'total_unlocks'] = data['samples'][i]['sample']['totalUnlocks']
                                        df.loc[rows + i, 'total_unlock_duration'] = data['samples'][i]['sample']['totalUnlockDuration']

                                        # web usage
                                        for j in range(len(data['samples'][i]['sample']['webUsageByCategory'])//2):
                                            web_use = data['samples'][i]['sample']['webUsageByCategory']
                                            # go through categories...
                                            if web_use[j] == 'SRDeviceUsageCategoryBooks':
                                                df.loc[rows + i, 'web_usage_books'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryBusiness':
                                                df.loc[rows + i, 'web_usage_business'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryCatalogs':
                                                df.loc[rows + i, 'web_usage_catalogs'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryDeveloperTools':
                                                df.loc[rows + i, 'web_usage_developer_tools'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryEducation':
                                                df.loc[rows + i, 'web_usage_education'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryEntertainment':
                                                df.loc[rows + i, 'web_usage_entertainment'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryFinance':
                                                df.loc[rows + i, 'web_usage_finance'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryFoodAndDrink':
                                                df.loc[rows + i, 'web_usage_food_and_drink'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryGames':
                                                df.loc[rows + i, 'web_usage_games'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryGraphicsAndDesign':
                                                df.loc[rows + i, 'web_usage_graphics_and_design'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryHealthAndFitness':
                                                df.loc[rows + i, 'web_usage_health_and_fitness'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryKids':
                                                df.loc[rows + i, 'web_usage_kids'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryLifestyle':
                                                df.loc[rows + i, 'web_usage_lifestyle'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryMedical':
                                                df.loc[rows + i, 'web_usage_medical'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryMiscellaneous':
                                                df.loc[rows + i, 'web_usage_utilities_miscellaneous'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryMusic':
                                                df.loc[rows + i, 'web_usage_music'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryNavigation':
                                                df.loc[rows + i, 'web_usage_navigation'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryNews':
                                                df.loc[rows + i, 'web_usage_news'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryNewsstand':
                                                df.loc[rows + i, 'web_usage_newsstand'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryPhotoAndVideo':
                                                df.loc[rows + i, 'web_usage_photo_and_video'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryProductivity':
                                                df.loc[rows + i, 'web_usage_productivity'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1])
                                            if web_use[j] == 'SRDeviceUsageCategoryReference':
                                                df.loc[rows + i, 'web_usage_reference'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategoryShopping':
                                                df.loc[rows + i, 'web_usage_shopping'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategorySocialNetworking':
                                                df.loc[rows + i, 'web_usage_social_networking'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategorySports':
                                                df.loc[rows + i, 'web_usage_sports'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategoryStickers':
                                                df.loc[rows + i, 'web_usage_stickers'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategoryTravel':
                                                df.loc[rows + i, 'web_usage_travel'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategoryUtilities':
                                                df.loc[rows + i, 'web_usage_utilities'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key
                                            if web_use[j] == 'SRDeviceUsageCategoryWeather':
                                                df.loc[rows + i, 'web_usage_weather'] = sum(d.get('totalUsageTime', 0) for d in web_use[j+1]) # sums up values in a list of dicts for given key

                                        # app usage
                                        for j in range(len(data['samples'][i]['sample']['applicationUsageByCategory'])//2):
                                            app_use = data['samples'][i]['sample']['applicationUsageByCategory']
                                            # go through categories
                                            if app_use[j] == 'SRDeviceUsageCategoryBooks':
                                                df.loc[rows + i, 'app_usage_books'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryBusiness':
                                                df.loc[rows + i, 'app_usage_business'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryCatalogs':
                                                df.loc[rows + i, 'app_usage_catalogs'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryDeveloperTools':
                                                df.loc[rows + i, 'app_usage_developer_tools'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryEducation':
                                                df.loc[rows + i, 'app_usage_education'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryEntertainment':
                                                df.loc[rows + i, 'app_usage_entertainment'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryFinance':
                                                df.loc[rows + i, 'app_usage_finance'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryFoodAndDrink':
                                                df.loc[rows + i, 'app_usage_food_and_drink'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryGames':
                                                df.loc[rows + i, 'app_usage_games'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryGraphicsAndDesign':
                                                df.loc[rows + i, 'app_usage_graphics_and_design'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryHealthAndFitness':
                                                df.loc[rows + i, 'app_usage_health_and_fitness'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryKids':
                                                df.loc[rows + i, 'app_usage_kids'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryLifestyle':
                                                df.loc[rows + i, 'app_usage_lifestyle'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryMedical':
                                                df.loc[rows + i, 'app_usage_medical'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryMiscellaneous':
                                                df.loc[rows + i, 'app_usage_miscellaneous'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryMusic':
                                                df.loc[rows + i, 'app_usage_music'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryNavigation':
                                                df.loc[rows + i, 'app_usage_navigation'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryNews':
                                                df.loc[rows + i, 'app_usage_news'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryNewsstand':
                                                df.loc[rows + i, 'app_usage_newsstand'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryPhotoAndVideo':
                                                df.loc[rows + i, 'app_usage_photo_and_video'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryProductivity':
                                                df.loc[rows + i, 'app_usage_productivity'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryReference':
                                                df.loc[rows + i, 'app_usage_reference'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryShopping':
                                                df.loc[rows + i, 'app_usage_shopping'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategorySocialNetworking':
                                                df.loc[rows + i, 'app_usage_social_network'] = sum(d.get('usageTime', 0) for d in app_use[j+1])            
                                            if app_use[j] == 'SRDeviceUsageCategorySports':
                                                df.loc[rows + i, 'app_usage_sports'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryStickers':
                                                df.loc[rows + i, 'app_usage_stickers'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryTravel':
                                                df.loc[rows + i, 'app_usage_travel'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryUtilities':
                                                df.loc[rows + i, 'app_usage_utilities'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                            if app_use[j] == 'SRDeviceUsageCategoryWeather':
                                                df.loc[rows + i, 'app_usage_weather'] = sum(d.get('usageTime', 0) for d in app_use[j+1])
                                    
                            except (OSError, ValueError) as e:  
                                pass

 24%|██▍       | 5/21 [24:29<1:32:44, 347.78s/it]