Combine study data into single "tidy" data frame for subsequent processing and analysis

In [None]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby
import datetime as dt

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

#widen the pandas display limits so large frames are shown in full
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

#silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')
#print numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
#the three cleaned input csvs all live in the same final_data directory
final_data_dir = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data'
path_PF_clean = final_data_dir + '/pf_final.csv'
path_MGT_clean = final_data_dir + '/mgt_final.csv'
path_part_info = final_data_dir + '/participant_info.csv'

### prepare participant info df

In [None]:
#read in csv containing participant info (pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_part_info = pd.read_csv(path_part_info)

print('Original data_part_info shape:\n', data_part_info.shape, '\n')
#ensure no replicate ID (211 participants in study)
print('Original data_part_info unique IDs:\n', data_part_info['ParticipantID'].unique().shape, '\n')
#how much missing data is there?
print('Original data_part_info missing value counts:\n', data_part_info.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called on its own
#line instead of being passed to print (which would only add a stray "None")
print('Original data_part_info data types:')
data_part_info.info()
print()

data_part_info.head()

In [None]:
#there should be exactly one row per participant, but the frame has one row too many
#(original note: 212 participants, 213 rows - TODO confirm against the 211 quoted in other cells)
#count how often each MitreID occurs to locate the repeated entry
mitre_id_counts = data_part_info['MitreID'].value_counts()
print(mitre_id_counts)
#examine the rows of the two IDs under suspicion
for suspect_id in ('SD1042', 'SD1093'):
    suspect_rows = data_part_info[data_part_info['MitreID'] == suspect_id]
    print(suspect_rows)

In [None]:
#duplicate entry confirmed: keep only the first row of any ParticipantID that occurs more than once
data_part_info = data_part_info.drop_duplicates(subset='ParticipantID', keep='first')
#leave SD1042 out until its correct ParticipantID etc. has been determined
not_sd1042 = data_part_info['MitreID'] != 'SD1042'
data_part_info = data_part_info[not_sd1042]

### final clean for MGT (job, health, and personality surveys) df

In [None]:
#read in csv from preprocessed MGT EMAs (pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_MGT = pd.read_csv(path_MGT_clean)

print('Original data_MGT shape:\n', data_MGT.shape, '\n')
#ensure no replicate ID (211 participants in study)
print('Original data_MGT unique IDs:\n', data_MGT['Name'].unique().shape, '\n')

#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called on its own
#line instead of being passed to print (which would only add a stray "None")
print('Original data_MGT data types:')
data_MGT.info()
print()

In [None]:
#change dates from objects to datetimes
for date_col in ('Date', 'Timestamp', 'StartDate2', 'EndDate2'):
    data_MGT[date_col] = data_MGT[date_col].astype('datetime64[ns]')

#what is the data type of each column now?
#DataFrame.info() writes its report itself and returns None, so it is called on its own
#line instead of being passed to print (which would only add a stray "None")
print('Original data_MGT data types:')
data_MGT.info()
print()

In [None]:
#compare the participant IDs found in part_info (MitreID) with those found in MGT (Name)
part_info_ids = set(data_part_info['MitreID'].unique())
mgt_ids = set(data_MGT['Name'].unique())
in_part_not_MGT = part_info_ids - mgt_ids
print('in_part_not_MGT', in_part_not_MGT)
in_MGT_not_part = mgt_ids - part_info_ids
print('in_MGT_not_part', in_MGT_not_part)

print('MGT part length', len(data_MGT['Name'].unique()))
print('Info part length', len(data_part_info['MitreID'].unique()))

#remove the MGT rows of these four IDs, then repeat the MGT-only check
excluded_names = ['SD1042', 'SG1015', 'SY1001', 'SG1043']
data_MGT = data_MGT[~data_MGT['Name'].isin(excluded_names)]
in_MGT_not_part = set(data_MGT['Name'].unique()) - set(data_part_info['MitreID'].unique())
print('in_MGT_not_part', in_MGT_not_part)

In [None]:
#rows with nan for 'Name' cannot be tied to a participant (original note: 10 such rows), drop these
#shapes are printed before and after so the number of removed rows can be checked
print(data_MGT.shape)
name_missing = data_MGT['Name'].isnull()
print(data_MGT[name_missing].shape)
data_MGT = data_MGT[~name_missing]
print(data_MGT.shape)

In [None]:
#add the information contained in data_part_info to data_MGT
#for every participant, repeat their data_part_info row once per MGT row and place it
#side by side with those MGT rows (matching 'MitreID' in part_info to 'Name' in MGT)

data_MGT = data_MGT.sort_values(by=['Name'], ascending=True)

participants = data_MGT['Name'].unique()

#per-participant frames are collected in a list and concatenated once after the loop:
#growing a frame with DataFrame.append recopies all rows on every pass, and that method
#was removed in pandas 2.0
part_frames = []

for part in participants:
    #rows of this participant in each table (filtered once, reused below)
    mgt_rows = data_MGT[data_MGT['Name'] == part]
    info_rows = data_part_info[data_part_info['MitreID'] == part]
    #repeat the participant info so it has as many rows as the participant has MGT rows
    df_part_long = pd.concat([info_rows]*len(mgt_rows), ignore_index=True)
    df_part_long.reset_index(inplace=True)
    #both sides are re-indexed from 0 so the column-wise concat lines the rows up by position
    data_MGT_part_int = pd.concat([df_part_long, mgt_rows.reset_index()], axis = 1)
    part_frames.append(data_MGT_part_int)

data_MGT_part = pd.concat(part_frames) if part_frames else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_MGT and data_MGT_part are the same length:', data_MGT.shape[0] == data_MGT_part.shape[0])
print(data_MGT.shape[0])
print(data_MGT_part.shape[0])
#every combined row should carry the same ID on the MGT side ('Name') and the part_info side ('MitreID')
print('does the math make sense?', data_MGT_part.shape[0] == (data_MGT_part['Name'].values == data_MGT_part['MitreID'].values).sum())
data_MGT_part.head()

### final clean for psychological flexibility df

In [None]:
#read in csv from preprocessed psychological flexibility EMAs
#(pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_PF = pd.read_csv(path_PF_clean)

print('Original data_PF_S3 shape:\n', data_PF.shape, '\n')
#ensure no replicate ID (211 participants in study)
print('Original data_PF_S3 unique IDs:\n', data_PF['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called on its own
#line instead of being passed to print (which would only add a stray "None")
print('Original data_PF_S3 data types:')
data_PF.info()
print()
#what is the participant response rate across the entire study?
print('Non-response rate for PF survey:\n', data_PF['completed_ts_utc'].isnull().sum() / data_PF.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0)
#NOTE(review): the flag is derived from 'results_updated' while the rate above uses
#'completed_ts_utc' - confirm the two columns are missing for the same rows
data_PF['completed'] = np.where(data_PF['results_updated'].isnull(), 0, 1)

In [None]:
#change dates from objects to datetimes
#every column uses the explicit 'datetime64[ns]' dtype; 'survey_dt' previously used the
#unit-less 'datetime64' spelling, which pandas 2.x refuses to cast to
pf_date_cols = ['survey_dt', 'delivered_ts_utc', 'started_ts_utc', 'completed_ts_utc', 'ingested_ts_utc']
for date_col in pf_date_cols:
    data_PF[date_col] = data_PF[date_col].astype('datetime64[ns]')

#confirm change (info() prints its own report and returns None, so it is not wrapped in print)
data_PF.info()

In [None]:
#compute time between when survey is sent (delivered) and when participant starts the survey
#NOTE(review): astype('timedelta64[s]') yields a numeric seconds column on older pandas but a
#timedelta column on pandas 2.x - confirm which form downstream analysis expects
data_PF['start_delay'] = (data_PF['started_ts_utc'] - data_PF['delivered_ts_utc']).astype('timedelta64[s]')
#compute time between when participant starts the survey and when they complete it
data_PF['time_to_complete'] = (data_PF['completed_ts_utc'] - data_PF['started_ts_utc']).astype('timedelta64[s]')

In [None]:
#make new column of only numeric corresponding to activity questions (code -1 for write in responses)
#only the 'activity' column is needed, so iterate over it directly instead of over whole rows
activity_num = []
for answer in data_PF['activity']:
    try:
        activity_num.append(float(answer))
    except (ValueError, TypeError):
        #float() rejected the value (e.g. free text), so it is treated as a write-in response;
        #only conversion failures are caught so unrelated errors are not hidden
        activity_num.append(-1)

data_PF['activity_num'] = activity_num

In [None]:
#compare the participant IDs found in part_info (ParticipantID) with those found in PF (participant_id)
part_info_ids = set(data_part_info['ParticipantID'].unique())
pf_ids = set(data_PF['participant_id'].unique())
in_part_not_PF = part_info_ids - pf_ids
print('in_part_not_PF', in_part_not_PF)
in_PF_not_part = pf_ids - part_info_ids
print('in_PF_not_part', in_PF_not_part)

print('PF part length', len(data_PF['participant_id'].unique()))
print('Info part length', len(data_part_info['ParticipantID'].unique()))

#remove the PF rows of these two participant ids, then repeat the PF-only check
excluded_participants = ['4e471779-ecbc-4b5a-b6b0-fcbfc9479faa', '24166136-6ee3-4521-abaf-972fbe83d15d']
data_PF = data_PF[~data_PF['participant_id'].isin(excluded_participants)]

in_PF_not_part = set(data_PF['participant_id'].unique()) - set(data_part_info['ParticipantID'].unique())
print('in_PF_not_part', in_PF_not_part)

In [None]:
#add the information contained in data_part_info to data_PF
#for every participant, repeat their data_part_info row once per PF row and place it
#side by side with those PF rows (matching 'ParticipantID' in part_info to 'participant_id' in PF)

data_PF = data_PF.sort_values(by=['participant_id'], ascending=True)

participants = data_PF['participant_id'].unique()

#per-participant frames are collected in a list and concatenated once after the loop:
#growing a frame with DataFrame.append recopies all rows on every pass, and that method
#was removed in pandas 2.0
part_frames = []

for part in participants:
    #rows of this participant in each table (filtered once, reused below)
    pf_rows = data_PF[data_PF['participant_id'] == part]
    info_rows = data_part_info[data_part_info['ParticipantID'] == part]
    #repeat the participant info so it has as many rows as the participant has PF rows
    df_part_long = pd.concat([info_rows]*len(pf_rows), ignore_index=True)
    df_part_long.reset_index(inplace=True)
    #both sides are re-indexed from 0 so the column-wise concat lines the rows up by position
    df_part_long_int = pd.concat([df_part_long, pf_rows.reset_index()], axis = 1)
    part_frames.append(df_part_long_int)

data_PF_part = pd.concat(part_frames) if part_frames else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_PF and data_part_info_long are the same length:', data_PF.shape[0] == data_PF_part.shape[0])
print(data_PF.shape[0])
print(data_PF_part.shape[0])
#every combined row should carry the same ID on the PF side and the part_info side
print('does the math make sense?', data_PF_part.shape[0] == (data_PF_part['participant_id'].values == data_PF_part['ParticipantID'].values).sum())
data_PF_part.head()

### reconcile columns to keep and combine dfs

In [None]:
#give the MGT columns the names used by the PF survey
mgt_renames = {
    'Name': 'name',
    'StartDate2': 'date_time',
    'ResponseID': 'survey_id',
    'surveytype': 'survey_type',
    'Q_TotalDuration': 'time_to_complete',
}
data_MGT_part = data_MGT_part.rename(columns=mgt_renames)
#split the survey timestamp into separate time-of-day and calendar-date columns
mgt_timestamp = data_MGT_part['date_time']
data_MGT_part['time'] = mgt_timestamp.dt.time
data_MGT_part['date'] = mgt_timestamp.dt.date

In [None]:
#give the PF columns the shared names
pf_renames = {'survey_dt': 'date', 'delivered_ts_utc': 'date_time'}
data_PF_part = data_PF_part.rename(columns=pf_renames)
pf_timestamp = data_PF_part['date_time']
#time the survey was sent
data_PF_part['Timesent'] = pf_timestamp.dt.time
#time-of-day and calendar-date columns ('date' is overwritten with the date part of date_time)
data_PF_part['time'] = pf_timestamp.dt.time
data_PF_part['date'] = pf_timestamp.dt.date

In [None]:
#select final columns and organize for MGT

def _numbered(stem, last, first=1):
    """Return the column names stem+first ... stem+last, e.g. ('pand', 3) -> pand1, pand2, pand3."""
    return [stem + str(number) for number in range(first, last + 1)]

meta_data = ['MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex',
             'Shift', 'Wave', 'survey_id', 'survey_type', 'date_time', 'date',
             'time', 'Timesent', 'time_to_complete']

#context items, then the pand / pand_pand series (10 items each), then anxiety and stress
shared_questions = (
    ['context1', 'context2', 'context2_10_TEXT', 'context2_TEXT', 'context3',
     'context3_7_TEXT', 'context3_TEXT', 'context4', 'context4_3_TEXT', 'context4_TEXT']
    + _numbered('pand', 10)
    + _numbered('pand_pand', 10)
    + ['anxiety', 'stress']
)

#bfid / bfid_bfid series (10 items each)
personality_questions = _numbered('bfid', 10) + _numbered('bfid_bfid', 10)

#work, irbd (7), itpd (3) and dalal (16) series, each followed by its prefixed twin
job_questions = (
    ['work']
    + _numbered('irbd', 7) + _numbered('irbd_irbd', 7)
    + _numbered('itpd', 3) + _numbered('itpd_itpd', 3)
    + _numbered('dalal', 16) + _numbered('dalal_dalal', 16)
)

#alc*, tob*, ex* and sleep columns
health_questions = (
    ['alc1'] + _numbered('alc2_', 3)
    + ['tob1'] + _numbered('tob2_', 7)
    + ['ex1_1', 'ex2_1', 'sleep_1']
)

mgt_columns = meta_data + shared_questions + job_questions + health_questions + personality_questions
data_MGT_final = data_MGT_part[mgt_columns]
data_MGT_final.head()

In [None]:
#deal with duplicate column
#selecting 'time_to_complete' returns two columns of that name; keep the first and drop the second
#the repeat is located from the frame itself instead of re-typing all column names by position,
#so the column labels cannot drift out of sync with the selection lists
is_repeat = data_MGT_final.columns.duplicated(keep='first')
repeated_names = data_MGT_final.columns[is_repeat].tolist()
if repeated_names != ['time_to_complete']:
    #same strictness as assigning a fixed-length column list: any other layout is an error
    raise ValueError('expected exactly one repeated column named time_to_complete, found: ' + repr(repeated_names))
data_MGT_final = data_MGT_final.loc[:, ~is_repeat]
data_MGT_final.head()

In [None]:
#order the MGT rows by wave, then participant, then survey date
mgt_sort_keys = ['Wave', 'MitreID', 'date']
data_MGT_final = data_MGT_final.sort_values(by=mgt_sort_keys)

In [None]:
#select final columns and organize for PF

meta_data = ['MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex',
             'Shift', 'Wave', 'survey_id', 'survey_type', 'date_time', 'date',
             'time', 'Timesent', 'completed', 'start_delay', 'time_to_complete']

#activity columns, pf_03 ... pf_15 (zero-padded), pf_mgt, exp_0 ... exp_13, then the three exp summary columns
questions = (
    ['activity', 'activity_num']
    + ['pf_%02d' % item for item in range(3, 16)]
    + ['pf_mgt']
    + ['exp_%d' % item for item in range(14)]
    + ['exp_neg', 'exp_pos', 'exp_neut']
)

pf_columns = meta_data + questions
data_PF_final = data_PF_part[pf_columns]
data_PF_final.head()

In [None]:
#order the PF rows by wave, then participant, then survey date
pf_sort_keys = ['Wave', 'MitreID', 'date']
data_PF_final = data_PF_final.sort_values(by=pf_sort_keys)

In [None]:
#create final df with all 4 surveys
#the empty frame fixes the column order of the combined table: meta data, PF questions,
#shared questions, job questions, health questions, personality questions

def _numbered(stem, last, first=1):
    """Return the column names stem+first ... stem+last, e.g. ('pand', 3) -> pand1, pand2, pand3."""
    return [stem + str(number) for number in range(first, last + 1)]

final_columns = (
    #meta data
    ['MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex',
     'Shift', 'Wave', 'survey_id', 'survey_type', 'date_time', 'date',
     'time', 'Timesent', 'completed', 'start_delay', 'time_to_complete']
    #PF questions
    + ['activity', 'activity_num']
    + ['pf_%02d' % item for item in range(3, 16)]
    + ['pf_mgt']
    + _numbered('exp_', 13, first=0)
    + ['exp_neg', 'exp_pos', 'exp_neut']
    #shared questions
    + ['context1', 'context2', 'context2_10_TEXT', 'context2_TEXT', 'context3',
       'context3_7_TEXT', 'context3_TEXT', 'context4', 'context4_3_TEXT', 'context4_TEXT']
    + _numbered('pand', 10) + _numbered('pand_pand', 10)
    + ['anxiety', 'stress']
    #job questions
    + ['work']
    + _numbered('irbd', 7) + _numbered('irbd_irbd', 7)
    + _numbered('itpd', 3) + _numbered('itpd_itpd', 3)
    + _numbered('dalal', 16) + _numbered('dalal_dalal', 16)
    #health questions
    + ['alc1'] + _numbered('alc2_', 3)
    + ['tob1'] + _numbered('tob2_', 7)
    + ['ex1_1', 'ex2_1', 'sleep_1']
    #personality questions
    + _numbered('bfid', 10) + _numbered('bfid_bfid', 10)
)

data_final = pd.DataFrame(columns=final_columns)

In [None]:
#stack the MGT and PF rows under the empty template frame (template first, so its column order leads)
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0; arguments are unchanged
data_final = pd.concat([data_final, data_MGT_final, data_PF_final], ignore_index=True, sort=False)
data_final.tail()

### create final dfs with 70 rows per participant per survey type (70 corresponds to days in study)

In [None]:
#process wave 1 data
#wave 1 started on 3/5/2018; anything dated on or before the cutoff is pilot data and is removed
wave_1_cutoff = dt.date(2018, 3, 4)
wave_1 = data_final[data_final['Wave'] == 1].sort_values(by=['date'])
wave_1 = wave_1[wave_1['date'] > wave_1_cutoff]
print(wave_1.shape)

In [None]:
#expand wave 1 so that every participant has at least one row for every calendar day of the wave
#the date range is built once and shared by the template frame and the loop
#NOTE(review): 71 periods gives study days 0-70 while the notes in this notebook speak of 70 days - confirm
wave_1_dates = pd.date_range('2018-03-05', periods=71, freq='D')

#template: one empty row per study day, carrying the calendar date and the 0-based study day
wave_1_rows = pd.DataFrame(columns=wave_1.columns)
wave_1_rows['wave_study_date'] = wave_1_dates
wave_1_rows['wave_study_day'] = wave_1_rows.index

#per-participant frames, concatenated once after the loop (DataFrame.append was removed in pandas 2.0)
wave_1_frames = []

for participant in wave_1['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_1[wave_1['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    #one frame per study day: that day's surveys, or a placeholder row when no survey was sent
    day_frames = []

    for i, study_date in enumerate(wave_1_dates):
        #get participant data that matches that date (compared as a datetime.date, like the 'date' filter above)
        data_part_date = data_part.loc[data_part['date'] == study_date.date()]

        if data_part_date.shape[0] > 0:
            #some dates had more than one survey sent; every survey row of the day gets the same study date/day
            #work on a copy so the new columns are not written onto a slice of wave_1
            data_int = data_part_date.copy()
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = i
        else:
            #fill in row corresponding to date when no survey was sent
            #the template row is taken by position; row i is study day i because both use wave_1_dates
            data_int = wave_1_rows.iloc[[i]].copy()
            #fill in meta data for participant (assumes a single value per participant; more than one raises)
            data_int['MitreID'] = data_part['MitreID'].unique()
            data_int['ParticipantID'] = participant
            data_int['PrimaryUnit'] = data_part['PrimaryUnit'].unique()
            data_int['SmartPhone'] = data_part['SmartPhone'].unique()
            data_int['Sex'] = data_part['Sex'].unique()
            data_int['Shift'] = data_part['Shift'].unique()
            data_int['Wave'] = data_part['Wave'].unique()

        day_frames.append(data_int)

    data_part_long = pd.concat(day_frames)
    #keep the previous row labels in an 'index' column
    data_part_long.reset_index(inplace=True)
    wave_1_frames.append(data_part_long)

final_wave1 = pd.concat(wave_1_frames, axis = 0, ignore_index=True) if wave_1_frames else pd.DataFrame()

In [None]:
#process wave 2 data
#wave 2 started on 4/9/2018; anything dated on or before the cutoff is pilot data and is removed
wave_2_cutoff = dt.date(2018, 4, 8)
wave_2 = data_final[data_final['Wave'] == 2].sort_values(by=['date'])
wave_2 = wave_2[wave_2['date'] > wave_2_cutoff]
print(wave_2.shape)

In [None]:
#expand wave 2 so that every participant has at least one row for every calendar day of the wave
#the date range is built once and shared by the template frame and the loop
#NOTE(review): 71 periods gives study days 0-70 while the notes in this notebook speak of 70 days - confirm
wave_2_dates = pd.date_range('2018-04-09', periods=71, freq='D')

#template: one empty row per study day, carrying the calendar date and the 0-based study day
wave_2_rows = pd.DataFrame(columns=wave_2.columns)
wave_2_rows['wave_study_date'] = wave_2_dates
wave_2_rows['wave_study_day'] = wave_2_rows.index

#per-participant frames, concatenated once after the loop (DataFrame.append was removed in pandas 2.0)
wave_2_frames = []

for participant in wave_2['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_2[wave_2['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    #one frame per study day: that day's surveys, or a placeholder row when no survey was sent
    day_frames = []

    for i, study_date in enumerate(wave_2_dates):
        #get participant data that matches that date (compared as a datetime.date, like the 'date' filter above)
        data_part_date = data_part.loc[data_part['date'] == study_date.date()]

        if data_part_date.shape[0] > 0:
            #some dates had more than one survey sent; every survey row of the day gets the same study date/day
            #work on a copy so the new columns are not written onto a slice of wave_2
            data_int = data_part_date.copy()
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = i
        else:
            #fill in row corresponding to date when no survey was sent
            #the template row is taken by position; row i is study day i because both use wave_2_dates
            data_int = wave_2_rows.iloc[[i]].copy()
            #fill in meta data for participant (assumes a single value per participant; more than one raises)
            data_int['MitreID'] = data_part['MitreID'].unique()
            data_int['ParticipantID'] = participant
            data_int['PrimaryUnit'] = data_part['PrimaryUnit'].unique()
            data_int['SmartPhone'] = data_part['SmartPhone'].unique()
            data_int['Sex'] = data_part['Sex'].unique()
            data_int['Shift'] = data_part['Shift'].unique()
            data_int['Wave'] = data_part['Wave'].unique()

        day_frames.append(data_int)

    data_part_long = pd.concat(day_frames)
    #keep the previous row labels in an 'index' column
    data_part_long.reset_index(inplace=True)
    wave_2_frames.append(data_part_long)

final_wave2 = pd.concat(wave_2_frames, axis = 0, ignore_index=True) if wave_2_frames else pd.DataFrame()

In [None]:
#process wave 3 data
#wave 3 started on 5/4/2018; anything dated on or before the cutoff is pilot data and is removed
wave_3_cutoff = dt.date(2018, 5, 3)
wave_3 = data_final[data_final['Wave'] == 3].sort_values(by=['date'])
wave_3 = wave_3[wave_3['date'] > wave_3_cutoff]
print(wave_3.shape)

In [None]:
#expand wave 3 so that every participant has at least one row for every calendar day of the wave
#the date range is built once and shared by the template frame and the loop; previously the loop
#walked the dates starting 2018-04-09 (the wave 2 start) while the template started 2018-05-04,
#so the days looked up in the loop did not match the wave 3 study days
#NOTE(review): 71 periods gives study days 0-70 while the notes in this notebook speak of 70 days - confirm
wave_3_dates = pd.date_range('2018-05-04', periods=71, freq='D')

#template: one empty row per study day, carrying the calendar date and the 0-based study day
wave_3_rows = pd.DataFrame(columns=wave_3.columns)
wave_3_rows['wave_study_date'] = wave_3_dates
wave_3_rows['wave_study_day'] = wave_3_rows.index

#per-participant frames, concatenated once after the loop (DataFrame.append was removed in pandas 2.0)
wave_3_frames = []

for participant in wave_3['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_3[wave_3['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    #one frame per study day: that day's surveys, or a placeholder row when no survey was sent
    day_frames = []

    for i, study_date in enumerate(wave_3_dates):
        #get participant data that matches that date (compared as a datetime.date, like the 'date' filter above)
        data_part_date = data_part.loc[data_part['date'] == study_date.date()]

        if data_part_date.shape[0] > 0:
            #some dates had more than one survey sent; every survey row of the day gets the same study date/day
            #work on a copy so the new columns are not written onto a slice of wave_3
            data_int = data_part_date.copy()
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = i
        else:
            #fill in row corresponding to date when no survey was sent
            #the template row is taken by position; row i is study day i because both use wave_3_dates
            data_int = wave_3_rows.iloc[[i]].copy()
            #fill in meta data for participant (assumes a single value per participant; more than one raises)
            data_int['MitreID'] = data_part['MitreID'].unique()
            data_int['ParticipantID'] = participant
            data_int['PrimaryUnit'] = data_part['PrimaryUnit'].unique()
            data_int['SmartPhone'] = data_part['SmartPhone'].unique()
            data_int['Sex'] = data_part['Sex'].unique()
            data_int['Shift'] = data_part['Shift'].unique()
            data_int['Wave'] = data_part['Wave'].unique()

        day_frames.append(data_int)

    data_part_long = pd.concat(day_frames)
    #keep the previous row labels in an 'index' column
    data_part_long.reset_index(inplace=True)
    wave_3_frames.append(data_part_long)

final_wave3 = pd.concat(wave_3_frames, axis = 0, ignore_index=True) if wave_3_frames else pd.DataFrame()

In [None]:
#stack the three expanded waves into one frame with a fresh 0..n-1 row index
wave_frames = [final_wave1, final_wave2, final_wave3]
data_final_combined = pd.concat(wave_frames, axis=0, ignore_index=True)
print(data_final_combined.shape)
data_final_combined.head()

In [None]:
#save to csv in the current working directory (to_csv writes the row index as the first column by default)
output_csv = 'data_final_combined.csv'
data_final_combined.to_csv(output_csv)