In [57]:
import pandas as pd
import numpy as np
import pickle
import datetime
import json
import re
import os

### Variable Information

In [58]:
## This notebook works with the SurveyQuestionResults export (survey_question_results)

In [59]:
## Input Directory
## Folder the notebook reads from: the SurveyQuestionResults* file and the
## StudyParticipants_20220910.csv file are both loaded from here.
directory = "../cumulative_data_811-910"

In [60]:
## Output Directory
## NOTE(review): no cell visible in this notebook reads out_dir; the final export
## writes 'self_report.csv' relative to the working directory. Confirm which
## location is intended.
out_dir = "../indv_table_exports/"

In [61]:
## Good Participants
## NOTE(review): good_subjects is reassigned to participant_list in the
## "Construct the gap Dataframe" cell before it is iterated, so these three
## hard-coded IDs are not what the gap computation actually uses.
good_subjects = ['01801252-3a7e-4f5f-8b6d-49e8da3902f3',
                 'd26d4b78-7fcf-488e-b687-2d1c93c47b74',
                 '531d7f6d-b880-4a0b-b467-80005a316f1c']

### Some handy functions

In [62]:
## Function to fix date from UTC to ET
def fix_date_to_ET(end_date):
    """Map a UTC timestamp to the US/Eastern study day it belongs to.

    The value is interpreted as UTC, converted to US/Eastern, and reduced to
    its Eastern calendar date. Timestamps that fall before 05:00 Eastern are
    attributed to the previous calendar day.

    Parameters
    ----------
    end_date : str or datetime-like
        A single timestamp that ``pd.to_datetime`` can parse. Naive values
        are treated as UTC; timezone-aware values are converted to UTC.

    Returns
    -------
    datetime.date
        The study day in US/Eastern.
    """
    # Parse and convert once instead of three times. No explicit ``format`` is
    # passed: the hour is needed below, and a date-only '%Y-%m-%d' format does
    # not describe a value that carries a time component (parsers that match
    # the format strictly reject such input).
    eastern = pd.to_datetime(end_date, utc=True).tz_convert('US/Eastern')

    study_day = eastern.date()
    if eastern.hour < 5:
        # Early-morning entries count towards the day before.
        study_day = study_day - datetime.timedelta(days=1)
    return study_day
            
def fix_columns_by_category(dataframe, categories):
    """Guarantee that every name in ``categories`` exists as a column.

    Columns that are absent are appended, in the order given, and filled with
    the string 'NaN'. Existing columns are left untouched. The frame passed in
    is modified in place and is also the return value.
    """
    for wanted in categories:
        already_present = wanted in dataframe.columns.to_list()
        if already_present:
            continue
        dataframe[wanted] = 'NaN'
    return dataframe

def pivot_df(dataframe, pos: list, col, val):
    """Reshape a long table into wide form.

    The ``pos`` columns identify a row, every distinct value found in ``col``
    becomes its own column, and cells are filled from ``val`` (repeated keys
    are combined with pivot_table's default aggregation). The row keys are
    returned as ordinary columns rather than as the index.
    """
    wide = pd.pivot_table(dataframe, index=pos, columns=col, values=val)
    return wide.reset_index()

def get_time_diff(d2, d1):
    """Return the whole-day component of ``d2 - d1``."""
    elapsed = d2 - d1
    return elapsed.days

### Dataframe Construction Functions

In [63]:
## Function to extract Affect scores
def get_affect_df(dataframe):
    """Build a wide table of affect self-report scores.

    Rows whose ``ResultIdentifier`` starts with 'affect_' are selected, their
    ``Answers`` are cast to numbers, and the result is pivoted so that each
    affect item becomes a column keyed by StudyDay and ParticipantIdentifier
    (repeated answers for the same key are combined by ``pivot_df``).

    Affect items that never occur in the data are still added as columns,
    filled by ``fix_columns_by_category``. Every column except the two key
    columns is prefixed with 'SR_' (self report).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Survey results with columns ParticipantIdentifier, ResultIdentifier,
        Answers and StudyDay. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (StudyDay, ParticipantIdentifier).
    """
    key_cols = ['StudyDay', 'ParticipantIdentifier']

    # na=False: a row with a missing ResultIdentifier is simply not an affect
    # row. Without it the mask contains NaN and boolean indexing raises.
    is_affect = dataframe.ResultIdentifier.str.startswith('affect_', na=False)

    # .copy() makes the selection independent of the caller's frame so the
    # numeric cast below is not an assignment into a slice.
    df_affect = dataframe.loc[
        is_affect, ['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'StudyDay']
    ].copy()

    # Cast answers to numeric so they can be aggregated by the pivot
    df_affect['Answers'] = pd.to_numeric(df_affect['Answers'])

    # Convert from long to wide: one column per affect item
    df_affect = pivot_df(df_affect, key_cols, 'ResultIdentifier', 'Answers')

    # Add columns that may not have been present in the given dataset
    affect_items = ['affect_neg_angry', 'affect_neg_ashamed', 'affect_neg_bored', 'affect_neg_depressed',
                    'affect_neg_embarrassed', 'affect_neg_frustrated', 'affect_neg_guilty', 'affect_neg_lazy',
                    'affect_neg_lonelyIsolated', 'affect_neg_nervousAnxious', 'affect_neg_sad', 'affect_neg_stressed',
                    'affect_pos_amused', 'affect_pos_appreciated', 'affect_pos_excited', 'affect_pos_focused',
                    'affect_pos_happy', 'affect_pos_hopeful', 'affect_pos_motivated', 'affect_pos_relaxedCalm']
    df_affect = fix_columns_by_category(df_affect, affect_items)

    # Rename columns (add prefix SR for self report); key columns keep their names
    df_affect.columns = ['SR_' + str(col) if col not in key_cols else col for col in df_affect.columns]

    return df_affect

## Function to get goal, past24, next24 dataframe
def get_goals_df(survey_dataframe):
    """Build the wide table of DAILY_ answers and its past24 / next24 subsets.

    Rows whose ``ResultIdentifier`` starts with 'DAILY_' are selected,
    de-duplicated per (ParticipantIdentifier, ResultIdentifier, StudyDay), and
    pivoted so that each DAILY_ measure becomes a column keyed by StudyDay and
    ParticipantIdentifier.

    Parameters
    ----------
    survey_dataframe : pandas.DataFrame
        Survey results with columns ParticipantIdentifier, ResultIdentifier,
        Answers and StudyDay. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_goals, df_past24, df_next24)``: the full wide table, then the key
        columns plus every column starting with 'DAILY_past24', then the key
        columns plus every column starting with 'DAILY_next24'.
    """
    key_cols = ['StudyDay', 'ParticipantIdentifier']

    # na=False: rows with a missing ResultIdentifier are not DAILY_ rows;
    # without it the mask contains NaN and boolean indexing raises.
    is_daily = survey_dataframe.ResultIdentifier.str.startswith('DAILY_', na=False)
    daily = survey_dataframe.loc[
        is_daily, ['ParticipantIdentifier', 'ResultIdentifier', 'Answers', 'StudyDay']
    ]

    # Keep only the first answer per participant / measure / day.
    # NOTE(review): the original author flagged this de-duplication as a
    # workaround for a date-conversion problem; the root cause is not visible
    # here and should still be confirmed.
    daily = daily.drop_duplicates(subset=['ParticipantIdentifier', 'ResultIdentifier', 'StudyDay'],
                                  keep="first")

    # One column per measure. After the de-duplication each cell has exactly
    # one source row, so 'first' selects that answer as-is; unlike joining the
    # values as strings it does not fail on a missing or numeric answer.
    df_goals = daily.pivot_table(index=key_cols,
                                 columns='ResultIdentifier',
                                 values='Answers',
                                 aggfunc='first').reset_index()

    # Column subsets for what was reported (past24) and what was planned (next24)
    past_24_col = [col for col in df_goals if (col.startswith('DAILY_past24') or col in key_cols)]
    next_24_col = [col for col in df_goals if (col.startswith('DAILY_next24') or col in key_cols)]

    df_past24 = df_goals[past_24_col]
    df_next24 = df_goals[next_24_col]

    return df_goals, df_past24, df_next24

## Function to calculate gap for one participant
def get_participant_gap_df(df_next24i, df_past24i):
    """Compute per-activity gaps for a single participant.

    Both frames are read positionally with labels 0..n-1, so they must share
    the same row order and a fresh integer index. For every row ``i`` of
    ``df_next24i`` and every activity, the gap is
    ``float(past24 value at row i-1) - float(next24 value at row i)``.

    The string 'NaN' is stored instead of a number when
      * the row is the first one (there is no previous row),
      * the activity's ``DAILY_next24_<activity>_goal`` value equals the string
        'False', or
      * the StudyDay of row ``i`` is not exactly one day after the StudyDay of
        row ``i-1``.

    NOTE(review): the value planned on row i is compared with the value
    reported on row i-1 -- confirm this is the intended pairing.

    Returns a DataFrame with StudyDay, ParticipantIdentifier and one
    ``<activity>_gap`` column per activity, one row per input row.
    """
    ## Activity names and the matching planned / reported column names
    gap_cols = ['drinks', 'exercise', 'leisureNonSolo', 'leisureSolo', 'nonoccupation',
                'occupation', 'sleep', 'socialMedia']
    next_cols = ['DAILY_next24_drinks', 'DAILY_next24_exercise', 'DAILY_next24_leisureNonSolo',
                 'DAILY_next24_leisureSolo', 'DAILY_next24_nonoccupation', 'DAILY_next24_occupation',
                 'DAILY_next24_sleep', 'DAILY_next24_socialMedia']
    past_cols = ['DAILY_past24_drinks', 'DAILY_past24_exercise', 'DAILY_past24_leisureNonSolo',
                 'DAILY_past24_leisureSolo', 'DAILY_past24_nonoccupation', 'DAILY_past24_occupation',
                 'DAILY_past24_sleep', 'DAILY_past24_socialMedia']

    records = []
    for row in range(len(df_next24i)):

        ## Every record starts with its identifying columns
        record = {
            'StudyDay': df_next24i.at[row, 'StudyDay'],
            'ParticipantIdentifier': df_next24i.at[row, 'ParticipantIdentifier'],
        }

        for activity, planned_col, reported_col in zip(gap_cols, next_cols, past_cols):
            gap_key = activity + '_gap'

            ## First row: nothing earlier to compare against
            if row == 0:
                record[gap_key] = 'NaN'
                continue

            ## Activity was not set as a goal
            if df_next24i.at[row, planned_col + '_goal'] == 'False':
                record[gap_key] = 'NaN'
                continue

            ## The previous row must be exactly one day earlier
            previous_day = df_past24i.at[row - 1, 'StudyDay']
            this_day = df_next24i.at[row, 'StudyDay']
            if get_time_diff(this_day, previous_day) != 1:
                record[gap_key] = 'NaN'
                continue

            ## Both values exist: gap is reported minus planned
            planned = float(df_next24i.at[row, planned_col])
            reported = float(df_past24i.at[row - 1, reported_col])
            record[gap_key] = reported - planned

        records.append(record)

    return pd.DataFrame(records)

### Load all SurveyQuestionResults to date into a DataFrame

In [64]:
## Load all SurveyQuestionResults to date into a DataFrame

## Locate the SurveyQuestionResults export inside the input directory.
## The listing is sorted so that the same file is chosen on every run when
## several exports are present (os.listdir returns entries in arbitrary order).
survey_candidates = sorted(
    f_name for f_name in os.listdir(directory)
    if f_name.startswith("SurveyQuestionResults")
)
if not survey_candidates:
    ## Stop here with a clear message; otherwise read_csv would be handed the
    ## directory itself and fail with an unrelated error.
    raise FileNotFoundError(f"No SurveyQuestionResults file found in {directory!r}")

survey_file_name = survey_candidates[0]
path = os.path.join(directory, survey_file_name)

current_df = pd.read_csv(path)

In [None]:
## Derive each response's US/Eastern study day from its EndDate.
## Mapping over the single column avoids building a Series for every row, and
## still produces a (zero-length) column when the frame has no rows.
current_df["StudyDay"] = current_df["EndDate"].map(fix_date_to_ET)

In [46]:
## df_survey is a second name for current_df (the same object, not a copy)
df_survey = current_df

### Construct the Affect and Goals dataframes

In [47]:
## Wide table of affect scores (SR_-prefixed columns), keyed by StudyDay and ParticipantIdentifier
df_affect = get_affect_df(df_survey)

## Wide table of all DAILY_ answers, plus its DAILY_past24* and DAILY_next24* column subsets
df_goals, df_past24, df_next24 = get_goals_df(df_survey)

### Participant List

In [51]:
## Load the participant roster and parse each CustomFields JSON string
df_participants = pd.read_csv(directory + '/StudyParticipants_20220910.csv')
df_participants["CustomFields"] = df_participants["CustomFields"].apply(json.loads)

## Keep the participants whose exp_version custom field is "app_pilot_1".
## .get() is used so that a participant without an exp_version entry is
## skipped instead of aborting the cell with a KeyError.
in_pilot = df_participants["CustomFields"].apply(
    lambda fields: isinstance(fields, dict) and fields.get("exp_version") == "app_pilot_1"
).astype(bool)
participant_list = df_participants.loc[in_pilot, "ParticipantIdentifier"].tolist()

### Construct the gap Dataframe

In [52]:
## Build one gap table per participant, then stack them into a single frame.
## Each participant's rows are re-indexed from 0 because the gap function
## addresses rows by position.
good_subjects = participant_list
df_gap_list = [
    get_participant_gap_df(
        df_next24[df_next24.ParticipantIdentifier.isin([subject])].reset_index(drop=True),
        df_past24[df_past24.ParticipantIdentifier.isin([subject])].reset_index(drop=True),
    )
    for subject in good_subjects
]

df_gap = pd.concat(df_gap_list)

### Combine the three dataframes to make SR Dataframe and Export CSV

In [53]:
## Merge df_goals, df_affect, and df_gap Dataframes
## Start from the goals table and left-join the affect and gap tables onto it.
## (Starting from df_affect and merging df_affect again left the goals out of
## the result and duplicated every SR_ column with _x / _y suffixes.)
df_self_report = df_goals
df_self_report = df_self_report.merge(df_affect, how='left', on=['ParticipantIdentifier', 'StudyDay'])
df_self_report = df_self_report.merge(df_gap, how='left', on=['ParticipantIdentifier', 'StudyDay'])

In [54]:
## Export the self-report Dataframe as a CSV (row index is not written)
## NOTE(review): the file is written relative to the working directory; the
## out_dir defined near the top of the notebook is not used here -- confirm
## which location is intended.
df_self_report.to_csv('self_report.csv', index=False)