Combine study data into single "tidy" data frame for subsequent processing and analysis

In [2]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
from itertools import groupby
import datetime as dt

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

# Raise pandas' display limits so the wide survey tables print without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Silence all warnings for the rest of the notebook.
import warnings
warnings.simplefilter('ignore')
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [3]:
# Directory that holds the cleaned study CSVs. Set the TILES_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# workstation location is used, so the resulting paths are unchanged by default.
DATA_DIR = os.environ.get(
    'TILES_DATA_DIR',
    'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data',
)

path_PF_clean = DATA_DIR + '/pf_final.csv'
path_engage_clean = DATA_DIR + '/engage_final.csv'
path_MGT_clean = DATA_DIR + '/mgt_final.csv'
path_part_info = DATA_DIR + '/S3participant_info_updated.csv'

### prepare participant info df

In [4]:
#read in csv containing participant info (pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_part_info = pd.read_csv(path_part_info)

print('Original data_part_info shape:\n', data_part_info.shape, '\n')
#ensure no replicate ID: the number of unique ParticipantIDs should equal the number of rows
print('Original data_part_info unique Participant IDs:\n', data_part_info['ParticipantID'].unique().shape, '\n')
#ensure no replicate ID: the number of unique MitreIDs should equal the number of rows
print('Original data_part_info unique Mitre IDs:\n', data_part_info['MitreID'].unique().shape, '\n')
#how much missing data is there?
print('Original data_part_info missing value counts:\n', data_part_info.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called as its own
#statement after the label instead of being passed to print() (which would print "None")
print('Original data_part_info data types:')
data_part_info.info()
print()

data_part_info.head()

Original data_part_info shape:
 (212, 7) 

Original data_part_info unique Participant IDs:
 (212,) 

Original data_part_info unique Mitre IDs:
 (212,) 

Original data_part_info missing value counts:
 ParticipantID    0
MitreID          0
PrimaryUnit      1
SmartPhone       0
Sex              0
Shift            0
Wave             0
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 7 columns):
ParticipantID    212 non-null object
MitreID          212 non-null object
PrimaryUnit      211 non-null object
SmartPhone       212 non-null object
Sex              212 non-null object
Shift            212 non-null object
Wave             212 non-null int64
dtypes: int64(1), object(6)
memory usage: 11.7+ KB
Original data_part_info data types:
 None 



Unnamed: 0,ParticipantID,MitreID,PrimaryUnit,SmartPhone,Sex,Shift,Wave
0,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1
1,0ec84778-1a98-4cd7-aa11-05997ddadd52,SD1002,6 South,iPhone,Female,Day shift,1
2,02b7a595-6508-46bd-8239-6deb433d6290,SD1003,7 West ICU,Android,Female,Day shift,1
3,f596b3ca-7b25-4632-b986-7b44448d3f2f,SD1004,5 East,iPhone,Female,Day shift,1
4,235be35e-4e50-4996-80d7-ce701d2dca4b,SD1005,9 East;9 West,iPhone,Female,Day shift,1


In [5]:
#there should only be a single entry for each of the 212 participants, ensure only single entry per participant
#duplicated(keep=False) yields a row-aligned boolean mask that flags every row whose MitreID occurs
#more than once (a mask built from value_counts() has one entry per unique ID, ordered by frequency,
#and therefore does not line up with the rows); an empty result means every MitreID is unique
data_part_info[data_part_info['MitreID'].duplicated(keep=False)]

Unnamed: 0,ParticipantID,MitreID,PrimaryUnit,SmartPhone,Sex,Shift,Wave


### final clean for MGT (job, health, and personality surveys) df

In [6]:
#read in csv from preprocessed MGT EMAs (pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_MGT = pd.read_csv(path_MGT_clean)

print('Original data_MGT shape:\n', data_MGT.shape, '\n')
#number of distinct participant IDs in 'Name' (unique() also counts a missing Name as one distinct value)
print('Original data_MGT unique IDs:\n', data_MGT['Name'].unique().shape, '\n')

#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called as its own
#statement after the label instead of being passed to print() (which would print "None")
print('Original data_MGT data types:')
data_MGT.info()
print()

Original data_MGT shape:
 (11498, 134) 

Original data_MGT unique IDs:
 (210,) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11498 entries, 0 to 11497
Columns: 134 entries, Unnamed: 0 to time_to_complete
dtypes: float64(65), int64(3), object(66)
memory usage: 11.8+ MB
Original data_MGT data types:
 None 



In [7]:
#rows whose participant id ('Name') is missing -- ideally this selection is empty
missing_name_rows = data_MGT['Name'].isna()
data_MGT[missing_name_rows]

Unnamed: 0.1,Unnamed: 0,Date,DayWeek,Name,Q_TotalDuration,Timesent,Timestamp,ResponseID,ResponseSet,StartDate2,EndDate2,Finished2,alc1,alc2_1,alc2_2,alc2_3,anxiety,bfid1,bfid10,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid_bfid1,bfid_bfid10,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,dalal1,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal_dalal1,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,ex1_1,ex2_1,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,pand1,pand10,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand_pand1,pand_pand10,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,sleep_1,stress,surveytype,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,work,time_to_complete
218,226,2018-03-27,"Tuesday, March 27th",,21,12:00pm,2019-06-08 14:29:37,9QTTBs3EWL8r601,Default Response Set,2018-03-27 14:29:37,2018-03-27 14:29:58,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,health,,,,,,,,,,21
248,257,2018-03-27,"Tuesday, March 27th",,36,12:00pm,2019-06-08 14:27:46,3OfADpYcM24lY6k,Default Response Set,2018-03-27 14:27:46,2018-03-27 14:28:22,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,job,,,,,,,,,,36
458,475,2018-03-27,"Tuesday, March 27th",,29,12:00pm,2019-06-08 14:31:26,31N24IYieb3lbDT,Default Response Set,2018-03-27 14:31:26,2018-03-27 14:31:55,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,personality,,,,,,,,,,29
5080,5161,2018-05-24,"Thursday, May 24th",,174,6:00am,2019-06-08 10:11:56,ApTnS4vpUobH3z3,Default Response Set,2018-05-24 10:11:56,2018-05-24 10:14:50,1.0,2.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,2.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,300.0,100.0,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,6.0,3.0,health,2.0,,,,,,,,,174
11364,11674,2018-01-30,"Tuesday, January 30th",,64,12:00pm,2019-06-08 15:00:02,3jYzEqSLjfvozpd,Default Response Set,2018-01-30 15:00:02,2018-01-30 15:01:07,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,personality,,,,,,,,,,64
11365,11675,2018-01-30,"Tuesday, January 30th",,9,12:00pm,2019-06-08 15:01:11,3Pv5a9x85mWl9HE,Default Response Set,2018-01-30 15:01:11,2018-01-30 15:01:21,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,personality,,,,,,,,,,9
11366,11676,2018-01-30,"Tuesday, January 30th",,10,12:00pm,2019-06-08 15:01:25,2SrgMgBfGLQVwsS,Default Response Set,2018-01-30 15:01:25,2018-01-30 15:01:35,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,personality,,,,,,,,,,10
11371,11682,2018-01-30,"Tuesday, January 30th",,200,12:00pm,2019-06-08 14:56:33,1QoRsaraOztSSVD,Default Response Set,2018-01-30 14:56:33,2018-01-30 14:59:53,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,health,,,,,,,,,,200
11413,11736,2018-01-30,"Tuesday, January 30th",,35,12:00pm,2019-06-08 14:30:30,DoDEvHFz4PZ2wJr,Default Response Set,2018-01-30 14:30:30,2018-01-30 14:31:06,1.0,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,9.0,,,4.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,4.0,job,,,,,,,,,2.0,35
11414,11737,2018-01-30,"Tuesday, January 30th",,1250,12:00pm,2019-06-08 14:32:18,2urExv37JEqbO4J,Default Response Set,2018-01-30 14:32:18,2018-01-30 14:53:09,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,job,,,,,,,,,1.0,1250


In [8]:
#rows with nan for 'Name' cannot be attributed to a participant, drop these
name_missing = data_MGT['Name'].isnull()
print(data_MGT.shape)
print(data_MGT[name_missing].shape)
#.copy() makes the filtered frame independent of the original, so the column assignments
#in later cells do not operate on a slice (SettingWithCopyWarning)
data_MGT = data_MGT[~name_missing].copy()
print(data_MGT.shape)

(11498, 134)
(10, 134)
(11488, 134)


In [9]:
#change dates from objects to datetimes
for date_col in ['Date', 'Timestamp', 'StartDate2', 'EndDate2']:
    data_MGT[date_col] = data_MGT[date_col].astype('datetime64[ns]')

#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called as its own
#statement after the label instead of being passed to print() (which would print "None")
print('Original data_MGT data types:')
data_MGT.info()
print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11488 entries, 0 to 11497
Columns: 134 entries, Unnamed: 0 to time_to_complete
dtypes: datetime64[ns](4), float64(65), int64(3), object(62)
memory usage: 11.8+ MB
Original data_MGT data types:
 None 



In [10]:
#make new column of only numeric corresponding to location (context 3) and activity (context 2) questions (code -1 for write in responses)
def _code_mgt_response(value):
    """Return ``value`` converted to float, or -1 when it cannot be converted.

    Values that ``float`` rejects (e.g. free-text write-in answers) are coded as -1.
    A missing value (NaN) converts without error and therefore stays NaN.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        #only conversion failures are coded; anything else (e.g. KeyboardInterrupt) still propagates
        return -1


activity_num = [_code_mgt_response(answer) for answer in data_MGT['context2']]
data_MGT['activity_num'] = activity_num

location_num = [_code_mgt_response(answer) for answer in data_MGT['context3']]
data_MGT['location_num'] = location_num

In [11]:
#compare the participant IDs present in part_info (MitreID) with those present in MGT (Name)
info_mitre_ids = set(data_part_info['MitreID'].unique())
mgt_name_ids = set(data_MGT['Name'].unique())

#IDs listed in part_info that never appear in MGT
in_part_not_MGT = info_mitre_ids.difference(mgt_name_ids)
print('in_part_not_MGT', in_part_not_MGT)
#IDs appearing in MGT that are not listed in part_info
in_MGT_not_part = mgt_name_ids.difference(info_mitre_ids)
print('in_MGT_not_part', in_MGT_not_part)

#number of distinct IDs on each side
n_mgt_ids = len(data_MGT['Name'].unique())
n_info_ids = len(data_part_info['MitreID'].unique())
print('MGT part length', n_mgt_ids)
print('Info part length', n_info_ids)

in_part_not_MGT {'SD1050', 'SD1081', 'SG1042'}
in_MGT_not_part set()
MGT part length 209
Info part length 212


In [12]:
#add the information contained in data_part_info to data_MGT
#for each participant, repeat their data_part_info row once per MGT row and place the two tables
#side by side (participant-info columns first, then the MGT columns)

data_MGT = data_MGT.sort_values(by=['Name'], ascending=True)

participants = data_MGT['Name'].unique()

#collect the per-participant pieces and concatenate once at the end:
#DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is quadratic
merged_pieces = []

for part in participants:
    mgt_rows = data_MGT[data_MGT['Name'] ==  part]
    info_rows = data_part_info[data_part_info['MitreID'] == part]
    #replicate the participant-info row(s) so there is one per MGT row of this participant
    df_part_long = pd.concat([info_rows]*len(mgt_rows), ignore_index=True).reset_index()
    data_MGT_part_int = pd.concat([df_part_long, mgt_rows.reset_index()], axis = 1)
    merged_pieces.append(data_MGT_part_int)

#pd.concat raises on an empty list, so fall back to an empty frame when there is nothing to join
data_MGT_part = pd.concat(merged_pieces) if merged_pieces else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_MGT and data_MGT_part are the same length:', data_MGT.shape[0] == data_MGT_part.shape[0])
print(data_MGT.shape[0])
print(data_MGT_part.shape[0])
#every joined row must pair an MGT 'Name' with the identical 'MitreID'
print('does the math make sense?', data_MGT_part.shape[0] == (data_MGT_part['Name'].values == data_MGT_part['MitreID'].values).sum())
data_MGT_part.head()

data_MGT and data_MGT_part are the same length: True
11488
11488
does the math make sense? True


Unnamed: 0.1,index,ParticipantID,MitreID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,index.1,Unnamed: 0,Date,DayWeek,Name,Q_TotalDuration,Timesent,Timestamp,ResponseID,ResponseSet,StartDate2,EndDate2,Finished2,alc1,alc2_1,alc2_2,alc2_3,anxiety,bfid1,bfid10,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid_bfid1,bfid_bfid10,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,dalal1,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal_dalal1,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,ex1_1,ex2_1,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,pand1,pand10,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand_pand1,pand_pand10,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,sleep_1,stress,surveytype,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,work,time_to_complete,activity_num,location_num
0,0,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,68,71,2018-03-07,"Wednesday, March 7th",SD1001,105,6:00pm,2019-06-08 19:00:57,1daim6pBHCE7w6s,Default Response Set,2018-03-07 19:00:57,2018-03-07 19:02:42,1.0,,,,,1,,,,,,,,,,,,,,,,,,,,,1,5,,,1,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,,,,,,,,,,,,3,job,,,,,,,,,2.0,105,5.0,1.0
1,1,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,143,149,2018-03-10,"Saturday, March 10th",SD1001,266,6:00pm,2019-06-08 18:04:54,3n6gw4wz2OLDqZ8,Default Response Set,2018-03-10 18:04:54,2018-03-10 18:09:20,1.0,,,,,1,,,,,,,,,,,,,,,,,,,,,3,10,,,2,,,0,,,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,,,,,,,,,,,,,,,,,,7.0,,7.0,7.0,6.0,4.0,1.0,,,,,,,,5.0,5.0,5.0,,,,3.0,1.0,3.0,3.0,3.0,3.0,4.0,1.0,1.0,1.0,,,,,,,,,,,,1,job,,,,,,,,,1.0,266,10.0,2.0
2,2,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,26,28,2018-03-06,"Tuesday, March 6th",SD1001,172,6:00am,2019-06-08 06:00:26,2TmZ6kWDC8cKZW8,Default Response Set,2018-03-06 06:00:26,2018-03-06 06:03:18,1.0,1.0,0.0,1.0,0.0,1,,,,,,,,,,,,,,,,,,,,,3,1,,,2,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,4.0,1.0,2.0,3.0,3.0,2.0,2.0,1.0,1.0,1.0,,,,,,,,,,,5.0,1,health,2.0,,,,,,,,,172,1.0,2.0
3,3,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,4,5,2018-03-05,"Monday, March 5th",SD1001,270,6:00pm,2019-06-08 18:00:29,2ZZV4xQBDzcgHEd,Default Response Set,2018-03-05 18:00:29,2018-03-05 18:05:00,1.0,,,,,1,,,,,,,,,,,,,,,,,,,,,1,8,,,2,,,0,,,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,7.0,7.0,7.0,7.0,7.0,1.0,1.0,,,,,,,,4.0,5.0,5.0,,,,3.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,1,job,,,,,,,,,1.0,270,8.0,2.0
4,4,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,190,198,2018-03-12,"Monday, March 12th",SD1001,105,6:00am,2019-06-08 06:03:10,2RWzVWbAsPEwplc,Default Response Set,2018-03-12 06:03:10,2018-03-12 06:04:55,1.0,2.0,,,,1,,,,,,,,,,,,,,,,,,,,,1,1,,,2,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,3.0,1.0,1.0,2.0,2.0,3.0,4.0,1.0,1.0,1.0,,,,,,,,,,,6.0,1,health,2.0,,,,,,,,,105,1.0,2.0


### final clean for psychological flexibility df

In [13]:
#read in csv from preprocessed psychological flexibility EMAs (pd.read_csv already returns a DataFrame)
data_PF = pd.read_csv(path_PF_clean)

print('Original data_PF_S3 shape:\n', data_PF.shape, '\n')
#number of distinct participant IDs
print('Original data_PF_S3 unique IDs:\n', data_PF['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_PF_S3 missing value counts:\n', data_PF.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called as its own
#statement after the label instead of being passed to print() (which would print "None")
print('Original data_PF_S3 data types:')
data_PF.info()
print()
#what is the participant non-response rate across the entire study? (share of rows without a completion timestamp)
print('Non-response rate for PF survey:\n', data_PF['completed_ts_utc'].isnull().sum() / data_PF.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0), based on whether 'results_updated' is present
data_PF['completed'] = np.where(data_PF['results_updated'].isnull(), 0, 1)

Original data_PF_S3 shape:
 (10450, 44) 

Original data_PF_S3 unique IDs:
 (211,) 

Original data_PF_S3 missing value counts:
 Unnamed: 0                0
survey_id                 0
participant_id            0
survey_type               0
survey_dt                 0
delivered_ts_utc          0
started_ts_utc         2736
started_ts_offset      2736
completed_ts_utc       2736
completed_ts_offset    2736
ingested_ts_utc        2736
results_updated        2736
activity               2750
pf_03                  2758
pf_04                  2754
pf_05                  2757
pf_06                  2777
pf_07                  2752
pf_08                  2759
pf_09                  2782
pf_10                  2753
pf_11                  2800
pf_12                  2754
pf_13                  2748
pf_14                  2774
pf_15                  2772
pf_mgt                 2740
exp_0                    89
exp_1                    89
exp_2                    89
exp_3                    89
exp_4

In [14]:
#rows without a participant id -- this selection is expected to be empty
pf_id_missing = data_PF['participant_id'].isna()
data_PF[pf_id_missing]

Unnamed: 0.1,Unnamed: 0,survey_id,participant_id,survey_type,survey_dt,delivered_ts_utc,started_ts_utc,started_ts_offset,completed_ts_utc,completed_ts_offset,ingested_ts_utc,results_updated,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,completed


In [15]:
#change dates from objects to datetimes
#always cast with an explicit unit: the unit-less 'datetime64' dtype is rejected by pandas >= 2.0
for date_col in ['survey_dt', 'delivered_ts_utc', 'started_ts_utc', 'completed_ts_utc', 'ingested_ts_utc']:
    data_PF[date_col] = data_PF[date_col].astype('datetime64[ns]')

#confirm change 
#DataFrame.info() prints its report and returns None, so it is not wrapped in print()
data_PF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10450 entries, 0 to 10449
Data columns (total 45 columns):
Unnamed: 0             10450 non-null int64
survey_id              10450 non-null object
participant_id         10450 non-null object
survey_type            10450 non-null object
survey_dt              10450 non-null datetime64[ns]
delivered_ts_utc       10450 non-null datetime64[ns]
started_ts_utc         7714 non-null datetime64[ns]
started_ts_offset      7714 non-null object
completed_ts_utc       7714 non-null datetime64[ns]
completed_ts_offset    7714 non-null object
ingested_ts_utc        7714 non-null datetime64[ns]
results_updated        7714 non-null object
activity               7700 non-null object
pf_03                  7692 non-null float64
pf_04                  7696 non-null float64
pf_05                  7693 non-null float64
pf_06                  7673 non-null float64
pf_07                  7698 non-null float64
pf_08                  7691 non-null float64
pf_0

In [16]:
#compute time (in seconds) between when survey is sent and when participant starts the survey
#.dt.total_seconds() returns a float column on every pandas version, whereas astype('timedelta64[s]')
#returns floats on pandas < 2.0 but a timedelta64[s] column on pandas >= 2.0;
#np.floor keeps the whole-second truncation of the original cast
data_PF['start_delay'] = np.floor((data_PF['started_ts_utc'] - data_PF['delivered_ts_utc']).dt.total_seconds())
#compute time (in seconds) between when participant starts the survey and when they complete it
data_PF['time_to_complete'] = np.floor((data_PF['completed_ts_utc'] - data_PF['started_ts_utc']).dt.total_seconds())

In [17]:
#make new column of only numeric corresponding to activity questions (code -1 for write in responses)
def _code_pf_activity(value):
    """Return ``value`` converted to float, or -1 when it cannot be converted.

    Values that ``float`` rejects (e.g. free-text write-in answers) are coded as -1.
    A missing value (NaN) converts without error and therefore stays NaN.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        #only conversion failures are coded; anything else (e.g. KeyboardInterrupt) still propagates
        return -1


activity_num = [_code_pf_activity(answer) for answer in data_PF['activity']]
data_PF['activity_num'] = activity_num

In [18]:
#compare the participant IDs present in part_info (ParticipantID) with those present in PF (participant_id)
info_participant_ids = set(data_part_info['ParticipantID'].unique())
pf_participant_ids = set(data_PF['participant_id'].unique())

#IDs listed in part_info that never appear in PF
in_part_not_PF = info_participant_ids.difference(pf_participant_ids)
print('in_part_not_PF', in_part_not_PF)
#IDs appearing in PF that are not listed in part_info
in_PF_not_part = pf_participant_ids.difference(info_participant_ids)
print('in_PF_not_part', in_PF_not_part)

#number of distinct IDs on each side
n_pf_ids = len(data_PF['participant_id'].unique())
n_info_ids = len(data_part_info['ParticipantID'].unique())
print('PF part length', n_pf_ids)
print('Info part length', n_info_ids)

in_part_not_PF {'9276d1e3-6954-460f-bd75-7f99fa01345c'}
in_PF_not_part set()
PF part length 211
Info part length 212


In [19]:
#add the information contained in data_part_info to data_PF
#for each participant, repeat their data_part_info row once per PF row and place the two tables
#side by side (participant-info columns first, then the PF columns)

data_PF = data_PF.sort_values(by=['participant_id'], ascending=True)

participants = data_PF['participant_id'].unique()

#collect the per-participant pieces and concatenate once at the end:
#DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is quadratic
merged_pieces = []

for part in participants:
    pf_rows = data_PF[data_PF['participant_id'] ==  part]
    info_rows = data_part_info[data_part_info['ParticipantID'] == part]
    #replicate the participant-info row(s) so there is one per PF row of this participant
    df_part_long = pd.concat([info_rows]*len(pf_rows), ignore_index=True).reset_index()
    df_part_long_int = pd.concat([df_part_long, pf_rows.reset_index()], axis = 1)
    merged_pieces.append(df_part_long_int)

#pd.concat raises on an empty list, so fall back to an empty frame when there is nothing to join
data_PF_part = pd.concat(merged_pieces) if merged_pieces else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_PF and data_part_info_long are the same length:', data_PF.shape[0] == data_PF_part.shape[0])
print(data_PF.shape[0])
print(data_PF_part.shape[0])
#every joined row must pair a PF 'participant_id' with the identical 'ParticipantID'
print('does the math make sense?', data_PF_part.shape[0] == (data_PF_part['participant_id'].values == data_PF_part['ParticipantID'].values).sum())
data_PF_part.head()

data_PF and data_part_info_long are the same length: True
10450
10450
does the math make sense? True


Unnamed: 0.1,index,ParticipantID,MitreID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,index.1,Unnamed: 0,survey_id,participant_id,survey_type,survey_dt,delivered_ts_utc,started_ts_utc,started_ts_offset,completed_ts_utc,completed_ts_offset,ingested_ts_utc,results_updated,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,completed,start_delay,time_to_complete,activity_num
0,0,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,0,0,65688b4e-620d-41bb-bb89-4714d01f7404,02581754-36cd-4b23-85ea-bf995c6dec83,psych_flex,2018-04-10,2018-04-10 07:14:07,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,
1,1,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,27,39,c132474d-d37e-4a12-8730-611fbe039f27,02581754-36cd-4b23-85ea-bf995c6dec83,psych_flex,2018-05-19,2018-05-19 11:18:26,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,
2,2,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,28,40,5e498834-14f6-4d79-8bff-48db2d4f5609,02581754-36cd-4b23-85ea-bf995c6dec83,psych_flex,2018-05-20,2018-05-20 08:23:22,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,
3,3,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,29,41,50b1fd79-90d0-4122-be8f-f70dcdf4af42,02581754-36cd-4b23-85ea-bf995c6dec83,psych_flex,2018-05-21,2018-05-21 10:33:51,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,
4,4,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,30,42,95d68aa5-3770-455a-9fb8-d670a190bdf9,02581754-36cd-4b23-85ea-bf995c6dec83,psych_flex,2018-05-22,2018-05-22 11:03:30,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,


In [20]:
#participant ids whose joined rows have no MitreID (i.e. no matching participant-info record)
mitre_missing = data_PF_part['MitreID'].isnull()
data_PF_part[mitre_missing]['participant_id'].unique()

array([], dtype=object)

### final clean for engage df

In [21]:
#read in csv from preprocessed engage EMAs (pd.read_csv already returns a DataFrame, no re-wrapping needed)
data_engage = pd.read_csv(path_engage_clean)

print('Original data_engage shape:\n', data_engage.shape, '\n')
#number of distinct participant IDs
print('Original data_engage unique IDs:\n', data_engage['participant_id'].unique().shape, '\n')
#how much missing data is there?
print('Original data_engage missing value counts:\n', data_engage.isnull().sum(), '\n')
#what is the data type of each column?
#DataFrame.info() writes its report itself and returns None, so it is called as its own
#statement after the label instead of being passed to print() (which would print "None")
print('Original data_engage data types:')
data_engage.info()
print()
#what is the participant non-response rate across the entire study? (share of rows without a completion timestamp)
print('Non-response rate for data_engage survey:\n', data_engage['completed_ts_utc'].isnull().sum() / data_engage.shape[0] * 100, '%')
#add a binary column for if survey was completed (1) or not (0), based on whether 'results_updated' is present
data_engage['completed'] = np.where(data_engage['results_updated'].isnull(), 0, 1)

data_engage.tail(20)

Original data_engage shape:
 (4178, 46) 

Original data_engage unique IDs:
 (211,) 

Original data_engage missing value counts:
 Unnamed: 0                0
survey_id                 0
participant_id            0
survey_type               0
survey_dt                 0
delivered_ts_utc          0
started_ts_utc         1114
started_ts_offset      1114
completed_ts_utc       1114
completed_ts_offset    1114
ingested_ts_utc        1114
results_updated        1114
engage_location        1118
engage_activity        1119
engage_3               1122
engage_4               1121
engage_5               1120
engage_6               1121
engage_7               1129
engage_8               1122
engage_9               1129
engage_10              1127
engage_11              1125
engage_12              1124
engage_13              1125
engage_14              1118
engage_15              1123
engage_16              1121
engage_17              1118
engage_18              1140
engage_19              1140
eng

Unnamed: 0.1,Unnamed: 0,survey_id,participant_id,survey_type,survey_dt,delivered_ts_utc,started_ts_utc,started_ts_offset,completed_ts_utc,completed_ts_offset,ingested_ts_utc,results_updated,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,completed
4158,14561,cd1643e1-ef48-467b-981d-31cbe8813d2f,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-07,2018-05-07 20:16:29+00:00,2018-05-07 20:30:46+00:00,"tzoffset(None, -25200)",2018-05-07 20:36:42+00:00,"tzoffset(None, -25200)",2018-05-07 20:36:42.675312,"{'16': 7, '12': 4, '9': 5, '7': 6, '3': 6, '21...",1.0,0.0,6.0,1.0,4.0,6.0,6.0,6.0,5.0,7.0,5.0,4.0,5.0,6.0,7.0,7.0,7.0,6.0,4.0,3.0,6.0,5.0,4.0,5.0,6.0,3.0,1.0,1.0,2.0,3.666667,5.916667,4.75,5.0,1.75,1
4159,14562,33667ace-c68b-42b8-b7ab-e106e0dccd5e,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-08,2018-05-08 21:40:38+00:00,2018-05-08 23:48:05+00:00,"tzoffset(None, -25200)",2018-05-08 23:53:59+00:00,"tzoffset(None, -25200)",2018-05-08 21:46:29.025841,"{'16': 7, '12': 4, '9': 4, '7': 6, '3': 6, '21...",At home,2.0,6.0,6.0,5.0,5.0,6.0,6.0,4.0,6.0,6.0,4.0,5.0,6.0,7.0,7.0,7.0,6.0,6.0,3.0,5.0,4.0,4.0,3.0,6.0,2.0,1.0,1.0,1.0,5.666667,5.75,5.0,4.25,1.25,1
4160,14567,9af563e3-e624-4308-9898-f66e2694d912,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-13,2018-05-13 23:37:16+00:00,2018-05-14 00:04:05+00:00,"tzoffset(None, -25200)",2018-05-14 00:08:04+00:00,"tzoffset(None, -25200)",2018-05-14 00:08:04.502468,"{'16': 7, '12': 4, '9': 4, '7': 4, '3': 5, '21...",2.0,2.0,5.0,6.0,5.0,4.0,4.0,6.0,4.0,6.0,6.0,4.0,5.0,6.0,7.0,7.0,6.0,3.0,4.0,3.0,5.0,4.0,3.0,4.0,4.0,1.0,1.0,1.0,4.0,5.333333,5.416667,3.75,3.75,1.75,1
4161,14571,fd4d433d-c89a-4fcf-9acb-561cde160290,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-17,2018-05-17 18:42:32+00:00,2018-05-17 20:08:38+00:00,"tzoffset(None, -25200)",2018-05-17 20:11:34+00:00,"tzoffset(None, -25200)",2018-05-17 20:11:35.103424,"{'16': 6, '12': 4, '9': 5, '7': 4, '3': 4, '21...",1.0,0.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,6.0,4.0,4.0,3.0,5.0,5.0,6.0,7.0,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,2.0,2.0,4.0,3.0,3.333333,4.75,3.5,4.0,2.75,1
4162,14573,93f05b83-9276-4e95-b5b7-8befc25d8454,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-19,2018-05-19 19:03:36+00:00,2018-05-19 19:56:13+00:00,"tzoffset(None, -25200)",2018-05-19 20:00:53+00:00,"tzoffset(None, -25200)",2018-05-19 20:00:54.209978,"{'16': 7, '12': 6, '9': 4, '7': 5, '3': 4, '21...",2.0,2.0,4.0,4.0,4.0,4.0,5.0,6.0,4.0,7.0,6.0,6.0,5.0,6.0,6.0,7.0,7.0,6.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,1.0,1.0,1.0,4.0,5.75,4.25,4.0,1.5,1
4163,14577,f191f6e7-a01e-4859-8daf-a205dca333bf,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-23,2018-05-23 18:39:27+00:00,2018-05-23 23:37:14+00:00,"tzoffset(None, -25200)",2018-05-23 23:43:49+00:00,"tzoffset(None, -25200)",2018-05-23 23:43:50.117777,"{'16': 7, '12': 4, '9': 4, '7': 4, '3': 4, '21...",1.0,0.0,4.0,4.0,5.0,4.0,4.0,5.0,4.0,6.0,6.0,4.0,5.0,7.0,7.0,7.0,6.0,5.0,6.0,3.0,4.0,4.0,4.0,4.0,5.0,3.0,2.0,2.0,2.0,4.333333,5.416667,4.5,4.25,2.25,1
4164,14581,706779f2-36a0-4240-940f-fe579d41e195,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-27,2018-05-27 20:27:59+00:00,2018-05-27 20:30:16+00:00,"tzoffset(None, -25200)",2018-05-27 20:36:20+00:00,"tzoffset(None, -25200)",2018-05-27 20:36:20.920250,"{'16': 7, '12': 6, '9': 4, '7': 5, '3': 4, '21...",Home\r\n,4.0,4.0,4.0,6.0,5.0,5.0,6.0,4.0,6.0,6.0,6.0,4.0,7.0,6.0,7.0,7.0,1.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,1.0,4.666667,5.75,2.75,4.0,2.5,1
4165,14584,cc23e774-7c13-40e1-a4c7-c2257391a429,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-05-30,2018-05-31 00:28:50+00:00,2018-05-31 00:41:53+00:00,"tzoffset(None, -25200)",2018-05-31 00:46:32+00:00,"tzoffset(None, -25200)",2018-05-31 00:46:33.159860,"{'16': 7, '12': 4, '9': 5, '7': 6, '3': 6, '21...",2.0,2.0,6.0,6.0,6.0,5.0,6.0,6.0,5.0,7.0,7.0,4.0,5.0,6.0,7.0,7.0,7.0,3.0,5.0,2.0,5.0,4.0,3.0,3.0,4.0,3.0,2.0,2.0,2.0,6.0,6.0,3.75,3.5,2.25,1
4166,14586,91e05d26-8a10-48a8-8c44-4d06289da678,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-06-01,2018-06-01 18:43:21+00:00,2018-06-01 20:55:18+00:00,"tzoffset(None, -25200)",2018-06-01 20:58:50+00:00,"tzoffset(None, -25200)",2018-06-01 20:58:50.487036,"{'16': 7, '12': 7, '9': 7, '7': 6, '3': 6, '21...",1.0,0.0,6.0,5.0,7.0,6.0,6.0,6.0,7.0,7.0,7.0,7.0,6.0,5.0,7.0,7.0,7.0,7.0,6.0,4.0,5.0,4.0,6.0,6.0,6.0,5.0,2.0,1.0,6.0,6.0,6.5,5.5,5.5,3.5,1
4167,14588,c19e298e-c36b-4043-b4a4-63667bca83a5,fdbd8c07-720c-4ae5-a3d8-aad56b54688c,engage_psycap,2018-06-03,2018-06-03 18:55:16+00:00,2018-06-03 20:34:13+00:00,"tzoffset(None, -25200)",2018-06-03 20:36:44+00:00,"tzoffset(None, -25200)",2018-06-03 20:36:45.143165,"{'16': 7, '12': 4, '9': 4, '7': 4, '3': 4, '21...",1.0,0.0,4.0,4.0,4.0,6.0,4.0,6.0,4.0,7.0,6.0,4.0,4.0,7.0,6.0,7.0,7.0,6.0,5.0,2.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,1.0,2.0,4.0,5.666667,4.5,4.75,2.0,1


In [22]:
#rows without a participant id -- this selection is expected to be empty
engage_id_missing = data_engage['participant_id'].isna()
data_engage[engage_id_missing]

Unnamed: 0.1,Unnamed: 0,survey_id,participant_id,survey_type,survey_dt,delivered_ts_utc,started_ts_utc,started_ts_offset,completed_ts_utc,completed_ts_offset,ingested_ts_utc,results_updated,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,completed


In [23]:
#change dates from objects to datetimes
#every column is cast with an explicit [ns] unit: the unit-less 'datetime64' alias that
#survey_dt used before is not accepted by newer pandas releases and was inconsistent
#with the other four timestamp columns
datetime_columns = ['survey_dt', 'delivered_ts_utc', 'started_ts_utc',
                    'completed_ts_utc', 'ingested_ts_utc']
for column in datetime_columns:
    data_engage[column] = data_engage[column].astype('datetime64[ns]')

#confirm change
#info() writes its report to stdout and returns None, so it is called directly
#(wrapping it in print() only adds a stray "None" line)
data_engage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4178 entries, 0 to 4177
Data columns (total 47 columns):
Unnamed: 0             4178 non-null int64
survey_id              4178 non-null object
participant_id         4178 non-null object
survey_type            4178 non-null object
survey_dt              4178 non-null datetime64[ns]
delivered_ts_utc       4178 non-null datetime64[ns]
started_ts_utc         3064 non-null datetime64[ns]
started_ts_offset      3064 non-null object
completed_ts_utc       3064 non-null datetime64[ns]
completed_ts_offset    3064 non-null object
ingested_ts_utc        3064 non-null datetime64[ns]
results_updated        3064 non-null object
engage_location        3060 non-null object
engage_activity        3059 non-null object
engage_3               3056 non-null float64
engage_4               3057 non-null float64
engage_5               3058 non-null float64
engage_6               3057 non-null float64
engage_7               3049 non-null float64
engage_8     

In [24]:
#compute time (in seconds) between when survey is sent and when participant starts the survey
#.dt.total_seconds() always yields float seconds (NaN where the survey was never started),
#whereas the result dtype of astype('timedelta64[s]') depends on the pandas version
#NOTE(review): the timestamps shown in this notebook are whole seconds, so the values match
#the previous floor-to-second cast -- confirm if sub-second timestamps can occur
data_engage['start_delay'] = (data_engage['started_ts_utc'] - data_engage['delivered_ts_utc']).dt.total_seconds()
#compute time (in seconds) between when participant starts the survey and when they complete it
data_engage['time_to_complete'] = (data_engage['completed_ts_utc'] - data_engage['started_ts_utc']).dt.total_seconds()

In [25]:
#make new column of only numeric corresponding to location and activity questions (code -1 for write in responses)
def _numeric_code_or_write_in(response, write_in_code=-1):
    """Return `response` as a float, or `write_in_code` when it is not numeric.

    Missing answers (NaN) convert to NaN and are therefore kept as missing;
    only values that float() rejects (free-text write-ins, None) get the code.
    """
    try:
        return float(response)
    except (TypeError, ValueError):
        #only conversion failures are treated as write-ins; any other error should surface
        return write_in_code


#Series.map calls the helper once per value (NaN included) and keeps the original row index
data_engage['activity_num'] = data_engage['engage_activity'].map(_numeric_code_or_write_in)
data_engage['location_num'] = data_engage['engage_location'].map(_numeric_code_or_write_in)

In [26]:
#check whether the participant IDs in the engage data and in part_info agree with each other
info_ids = set(data_part_info['ParticipantID'].unique())
engage_ids = set(data_engage['participant_id'].unique())

#IDs listed in part_info that never show up in the engage data
in_part_not_engage = info_ids.difference(engage_ids)
print('in_part_not_engage', in_part_not_engage)
#IDs found in the engage data that part_info does not know about
in_engage_not_part = engage_ids.difference(info_ids)
print('in_engage_not_part', in_engage_not_part)

print('Engage part length', data_engage['participant_id'].nunique(dropna=False))
print('Info part length', data_part_info['ParticipantID'].nunique(dropna=False))

in_part_not_engage {'9276d1e3-6954-460f-bd75-7f99fa01345c'}
in_engage_not_part set()
Engage part length 211
Info part length 212


In [27]:
#add the information contained in data_part_info to data_engage
#for each participant, repeat their data_part_info row once per engage survey row and
#place the two side by side, so every survey row carries the participant's info

data_engage = data_engage.sort_values(by=['participant_id'], ascending=True)

participants = data_engage['participant_id'].unique()

#collect the per-participant pieces and concatenate once at the end
#(growing a frame with DataFrame.append inside the loop is quadratic, and append is
#no longer available in newer pandas releases)
part_frames = []

for part in participants:
    engage_rows = data_engage[data_engage['participant_id'] == part]
    info_rows = data_part_info[data_part_info['ParticipantID'] == part]
    #replicate the participant info so it has as many rows as this participant's surveys
    df_part_long = pd.concat([info_rows] * len(engage_rows), ignore_index=True)
    df_part_long = df_part_long.reset_index()
    #both sides are re-indexed 0..n-1 so the column-wise concat lines up row by row
    df_part_long_int = pd.concat([df_part_long, engage_rows.reset_index()], axis=1)
    part_frames.append(df_part_long_int)

#pd.concat raises on an empty list, so fall back to an empty frame when there are no participants
data_engage_part = pd.concat(part_frames) if part_frames else pd.DataFrame()

#confirm the two data tables are now the same length
print('data_engage and data_part_info_long are the same length:', data_engage.shape[0] == data_engage_part.shape[0])
print(data_engage.shape[0])
print(data_engage_part.shape[0])
#every combined row must pair a survey with the info of the same participant
print('does the math make sense?', data_engage_part.shape[0] == (data_engage_part['participant_id'].values == data_engage_part['ParticipantID'].values).sum())
data_engage_part.head()

data_engage and data_part_info_long are the same length: True
4178
4178
does the math make sense? True


Unnamed: 0.1,index,ParticipantID,MitreID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,index.1,Unnamed: 0,survey_id,participant_id,survey_type,survey_dt,delivered_ts_utc,started_ts_utc,started_ts_offset,completed_ts_utc,completed_ts_offset,ingested_ts_utc,results_updated,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,completed,start_delay,time_to_complete,activity_num,location_num
0,0,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,0,3,45a45865-1431-4062-91c8-e4eae8db1578,02581754-36cd-4b23-85ea-bf995c6dec83,engage_psycap,2018-04-13,2018-04-13 11:58:35,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,
1,1,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,19,68,e8022d2d-4c61-4785-b1d1-a2fe254fc8fb,02581754-36cd-4b23-85ea-bf995c6dec83,engage_psycap,2018-06-17,2018-06-17 10:54:37,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,
2,2,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,18,65,86978b58-858b-4071-a330-446aca359eaa,02581754-36cd-4b23-85ea-bf995c6dec83,engage_psycap,2018-06-14,2018-06-14 06:35:25,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,
3,3,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,17,62,937c8e1e-8478-44f6-a498-e3868ef5190a,02581754-36cd-4b23-85ea-bf995c6dec83,engage_psycap,2018-06-11,2018-06-11 06:29:23,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,
4,4,02581754-36cd-4b23-85ea-bf995c6dec83,SG1025,8 West ICU,iPhone,Male,Night shift,2,16,57,c58089e5-0c26-490a-bb7a-f849e29e19f1,02581754-36cd-4b23-85ea-bf995c6dec83,engage_psycap,2018-06-06,2018-06-06 11:29:32,NaT,,NaT,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,


In [28]:
#participants whose rows ended up without a MitreID (an empty array is the expected result)
data_engage_part.loc[data_engage_part['MitreID'].isna(), 'participant_id'].unique()

array([], dtype=object)

### reconcile columns to keep and combine dfs

In [29]:
#bring the MGT column names in line with the other surveys
mgt_renames = {
    'Name': 'name',
    'StartDate2': 'date_time',
    'ResponseID': 'survey_id',
    'surveytype': 'survey_type',
    'Q_TotalDuration': 'time_to_complete',
}
data_MGT_part = data_MGT_part.rename(columns=mgt_renames)
#split the timestamp into separate time and date columns
mgt_date_time = data_MGT_part['date_time']
data_MGT_part['time'] = mgt_date_time.dt.time
data_MGT_part['date'] = mgt_date_time.dt.date

In [30]:
#bring the psych_flex column names in line with the other surveys
data_PF_part = data_PF_part.rename(columns={'survey_dt': 'date', 'delivered_ts_utc': 'date_time'})
pf_date_time = data_PF_part['date_time']
#Timesent and time both hold the clock time taken from date_time
data_PF_part['Timesent'] = pf_date_time.dt.time
data_PF_part['time'] = pf_date_time.dt.time
#date is overwritten with the calendar date taken from date_time
data_PF_part['date'] = pf_date_time.dt.date

In [31]:
#bring the engage column names in line with the other surveys
data_engage_part = data_engage_part.rename(columns={'survey_dt': 'date', 'delivered_ts_utc': 'date_time'})
engage_date_time = data_engage_part['date_time']
#Timesent and time both hold the clock time taken from date_time
data_engage_part['Timesent'] = engage_date_time.dt.time
data_engage_part['time'] = engage_date_time.dt.time
#date is overwritten with the calendar date taken from date_time
data_engage_part['date'] = engage_date_time.dt.date

In [32]:
#select final columns and organize for MGT
#numbered question columns are generated from their prefix; the resulting order is
#meta data, shared, job, health and personality questions

meta_data = [
    'MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave',
    'survey_id', 'survey_type', 'date_time', 'date', 'time', 'Timesent',
    'time_to_complete', 'activity_num', 'location_num',
]

shared_questions = (
    ['context1', 'context2', 'context2_10_TEXT', 'context2_TEXT', 'context3',
     'context3_7_TEXT', 'context3_TEXT', 'context4', 'context4_3_TEXT', 'context4_TEXT']
    + ['pand{}'.format(n) for n in range(1, 11)]
    + ['pand_pand{}'.format(n) for n in range(1, 11)]
    + ['anxiety', 'stress']
)

personality_questions = (
    ['bfid{}'.format(n) for n in range(1, 11)]
    + ['bfid_bfid{}'.format(n) for n in range(1, 11)]
)

job_questions = (
    ['work']
    + ['irbd{}'.format(n) for n in range(1, 8)]
    + ['irbd_irbd{}'.format(n) for n in range(1, 8)]
    + ['itpd{}'.format(n) for n in range(1, 4)]
    + ['itpd_itpd{}'.format(n) for n in range(1, 4)]
    + ['dalal{}'.format(n) for n in range(1, 17)]
    + ['dalal_dalal{}'.format(n) for n in range(1, 17)]
)

health_questions = (
    ['alc1', 'alc2_1', 'alc2_2', 'alc2_3', 'tob1']
    + ['tob2_{}'.format(n) for n in range(1, 8)]
    + ['ex1_1', 'ex2_1', 'sleep_1']
)

mgt_columns = meta_data + shared_questions + job_questions + health_questions + personality_questions
data_MGT_final = data_MGT_part[mgt_columns]
data_MGT_final.head()

Unnamed: 0,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,time_to_complete,time_to_complete.1,activity_num,location_num,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10
0,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,1daim6pBHCE7w6s,job,2018-03-07 19:00:57,2018-03-07,19:00:57,6:00pm,105,105,5.0,1.0,1,5,,,1,,,0,,,3.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,3,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,3n6gw4wz2OLDqZ8,job,2018-03-10 18:04:54,2018-03-10,18:04:54,6:00pm,266,266,10.0,2.0,3,10,,,2,,,0,,,3.0,3.0,3.0,3.0,3.0,4.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,1.0,7.0,,7.0,7.0,6.0,4.0,1.0,,,,,,,,5.0,5.0,5.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2TmZ6kWDC8cKZW8,health,2018-03-06 06:00:26,2018-03-06,06:00:26,6:00am,172,172,1.0,2.0,3,1,,,2,,,0,,,4.0,2.0,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,2.0,,,,,,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,
3,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2ZZV4xQBDzcgHEd,job,2018-03-05 18:00:29,2018-03-05,18:00:29,6:00pm,270,270,8.0,2.0,1,8,,,2,,,0,,,3.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,1.0,7.0,7.0,7.0,7.0,7.0,1.0,1.0,,,,,,,,4.0,5.0,5.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2RWzVWbAsPEwplc,health,2018-03-12 06:03:10,2018-03-12,06:03:10,6:00am,105,105,1.0,2.0,1,1,,,2,,,0,,,3.0,1.0,2.0,2.0,3.0,4.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,0.0,0.0,6.0,,,,,,,,,,,,,,,,,,,,


In [33]:
#deal with duplicate column
#the selection above returns two columns named 'time_to_complete' (shown as time_to_complete
#and time_to_complete.1 in the preview); keep only the first occurrence of each column name
#masking on columns.duplicated() avoids re-typing every column name by hand and is safe to
#re-run: once the duplicate is gone the mask keeps everything
data_MGT_final = data_MGT_final.loc[:, ~data_MGT_final.columns.duplicated()]
data_MGT_final.head()

Unnamed: 0,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,time_to_complete,activity_num,location_num,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10
0,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,1daim6pBHCE7w6s,job,2018-03-07 19:00:57,2018-03-07,19:00:57,6:00pm,105,5.0,1.0,1,5,,,1,,,0,,,3.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,3,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,3n6gw4wz2OLDqZ8,job,2018-03-10 18:04:54,2018-03-10,18:04:54,6:00pm,266,10.0,2.0,3,10,,,2,,,0,,,3.0,3.0,3.0,3.0,3.0,4.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,1.0,7.0,,7.0,7.0,6.0,4.0,1.0,,,,,,,,5.0,5.0,5.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2TmZ6kWDC8cKZW8,health,2018-03-06 06:00:26,2018-03-06,06:00:26,6:00am,172,1.0,2.0,3,1,,,2,,,0,,,4.0,2.0,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,2.0,,,,,,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,
3,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2ZZV4xQBDzcgHEd,job,2018-03-05 18:00:29,2018-03-05,18:00:29,6:00pm,270,8.0,2.0,1,8,,,2,,,0,,,3.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,1.0,7.0,7.0,7.0,7.0,7.0,1.0,1.0,,,,,,,,4.0,5.0,5.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,2RWzVWbAsPEwplc,health,2018-03-12 06:03:10,2018-03-12,06:03:10,6:00am,105,1.0,2.0,1,1,,,2,,,0,,,3.0,1.0,2.0,2.0,3.0,4.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,0.0,0.0,6.0,,,,,,,,,,,,,,,,,,,,


In [34]:
#order the MGT rows by wave, then MitreID, then date
mgt_sort_keys = ['Wave', 'MitreID', 'date']
data_MGT_final = data_MGT_final.sort_values(by=mgt_sort_keys)

In [35]:
#select final columns and organize for PF
#numbered question columns (pf_03..pf_15, exp_0..exp_13) are generated from their prefix

meta_data = [
    'MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave',
    'survey_id', 'survey_type', 'date_time', 'date', 'time', 'Timesent',
    'completed', 'start_delay', 'time_to_complete', 'activity_num',
]

questions = (
    ['activity']
    + ['pf_{:02d}'.format(n) for n in range(3, 16)]
    + ['pf_mgt']
    + ['exp_{}'.format(n) for n in range(0, 14)]
    + ['exp_neg', 'exp_pos', 'exp_neut']
)

pf_columns = meta_data + questions
data_PF_final = data_PF_part[pf_columns]
data_PF_final.head()

Unnamed: 0,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,completed,start_delay,time_to_complete,activity_num,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut
0,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,65688b4e-620d-41bb-bb89-4714d01f7404,psych_flex,2018-04-10 07:14:07,2018-04-10,07:14:07,07:14:07,0,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,c132474d-d37e-4a12-8730-611fbe039f27,psych_flex,2018-05-19 11:18:26,2018-05-19,11:18:26,11:18:26,0,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,5e498834-14f6-4d79-8bff-48db2d4f5609,psych_flex,2018-05-20 08:23:22,2018-05-20,08:23:22,08:23:22,0,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,50b1fd79-90d0-4122-be8f-f70dcdf4af42,psych_flex,2018-05-21 10:33:51,2018-05-21,10:33:51,10:33:51,0,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,95d68aa5-3770-455a-9fb8-d670a190bdf9,psych_flex,2018-05-22 11:03:30,2018-05-22,11:03:30,11:03:30,0,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#order the PF rows by wave, then MitreID, then date
pf_sort_keys = ['Wave', 'MitreID', 'date']
data_PF_final = data_PF_final.sort_values(by=pf_sort_keys)

In [37]:
#select final columns and organize for engage
#numbered question columns (engage_3..engage_29) are generated from their prefix

meta_data = [
    'MitreID', 'ParticipantID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave',
    'survey_id', 'survey_type', 'date_time', 'date', 'time', 'Timesent',
    'completed', 'start_delay', 'time_to_complete', 'activity_num', 'location_num',
]

questions = (
    ['engage_location', 'engage_activity']
    + ['engage_{}'.format(n) for n in range(3, 30)]
    + ['engage_mgt', 'psycap_mgt', 'support_mgt', 'challenge_mgt', 'hindrance_mgt']
)

engage_columns = meta_data + questions
data_engage_final = data_engage_part[engage_columns]
data_engage_final.head()

Unnamed: 0,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,completed,start_delay,time_to_complete,activity_num,location_num,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt
0,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,45a45865-1431-4062-91c8-e4eae8db1578,engage_psycap,2018-04-13 11:58:35,2018-04-13,11:58:35,11:58:35,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,e8022d2d-4c61-4785-b1d1-a2fe254fc8fb,engage_psycap,2018-06-17 10:54:37,2018-06-17,10:54:37,10:54:37,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,86978b58-858b-4071-a330-446aca359eaa,engage_psycap,2018-06-14 06:35:25,2018-06-14,06:35:25,06:35:25,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,937c8e1e-8478-44f6-a498-e3868ef5190a,engage_psycap,2018-06-11 06:29:23,2018-06-11,06:29:23,06:29:23,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SG1025,02581754-36cd-4b23-85ea-bf995c6dec83,8 West ICU,iPhone,Male,Night shift,2,c58089e5-0c26-490a-bb7a-f849e29e19f1,engage_psycap,2018-06-06 11:29:32,2018-06-06,11:29:32,11:29:32,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [38]:
#order the engage rows by wave, then MitreID, then date
engage_sort_keys = ['Wave', 'MitreID', 'date']
data_engage_final = data_engage_final.sort_values(by=engage_sort_keys)

In [39]:
#create final df with all 5 surveys
#this empty frame only fixes the column order of the combined table: engage columns first,
#then the PF columns not already present, then the MGT columns not already present
#the names are taken from the three final frames instead of being typed out again, so this
#template cannot drift out of sync with the column selections made above
final_columns = []
for survey_frame in (data_engage_final, data_PF_final, data_MGT_final):
    for column in survey_frame.columns:
        #keep the first position at which a shared (meta data) column appears
        if column not in final_columns:
            final_columns.append(column)

data_final = pd.DataFrame(columns=final_columns)

In [40]:
#stack the three survey tables under the column layout of data_final
#pd.concat replaces DataFrame.append, which is no longer available in newer pandas releases
#data_final.iloc[0:0] keeps only the (empty) column template, so re-running this cell does
#not stack the survey rows on top of an already filled data_final
data_final = pd.concat(
    [data_final.iloc[0:0], data_MGT_final, data_PF_final, data_engage_final],
    ignore_index=True,
    sort=False,
)
print(data_final.shape)
#the combined table must have exactly one row per row of the three inputs
print('Does the math make sense?', data_final.shape[0] == data_MGT_final.shape[0] + data_PF_final.shape[0] + data_engage_final.shape[0])
data_final.tail()

(26116, 204)
Does the math make sense? True


Unnamed: 0,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,completed,start_delay,time_to_complete,activity_num,location_num,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10
26111,SG1076,91297102-4775-4ea1-a10d-4530e3c2f0af,Clinical Laboratory,iPhone,Female,Night shift,3,c99b079e-6782-40cf-9f62-c64268ae217a,engage_psycap,2018-06-27 11:53:45,2018-06-27,11:53:45,11:53:45,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26112,SG1076,91297102-4775-4ea1-a10d-4530e3c2f0af,Clinical Laboratory,iPhone,Female,Night shift,3,a073fdba-62cc-46d6-bf85-482d8b51b8ce,engage_psycap,2018-07-01 12:17:31,2018-07-01,12:17:31,12:17:31,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26113,SG1076,91297102-4775-4ea1-a10d-4530e3c2f0af,Clinical Laboratory,iPhone,Female,Night shift,3,d6c8e77b-bd7a-48c1-adb7-38842968c263,engage_psycap,2018-07-03 10:38:34,2018-07-03,10:38:34,10:38:34,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26114,SG1076,91297102-4775-4ea1-a10d-4530e3c2f0af,Clinical Laboratory,iPhone,Female,Night shift,3,f82962eb-b302-4605-aaf3-6f65f188e088,engage_psycap,2018-07-11 09:57:37,2018-07-11,09:57:37,09:57:37,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26115,SG1076,91297102-4775-4ea1-a10d-4530e3c2f0af,Clinical Laboratory,iPhone,Female,Night shift,3,ec84057f-5543-4b50-ba61-488a60d726ab,engage_psycap,2018-07-13 06:00:12,2018-07-13,06:00:12,06:00:12,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### create final dfs with 70 rows per participant per survey type (70 corresponds to days in study)

In [41]:
#process wave 1 data: keep the wave 1 rows and order them by date
is_wave_1 = data_final['Wave'] == 1
wave_1 = data_final[is_wave_1].sort_values(by=['date'])
#wave 1 started on 3/5/2018, so anything dated 3/4/2018 or earlier is pilot data and is dropped
after_pilot = wave_1['date'] > dt.date(2018, 3, 4)
wave_1 = wave_1[after_pilot]
print(wave_1.shape)

(6296, 204)


In [42]:
# Build, for every wave-1 participant, a frame that has at least one row for
# every calendar date of the wave: the participant's survey rows on dates where
# surveys exist, and a placeholder row (metadata only) on dates without any.
#
# NOTE(review): periods=71 produces study days 0-70 (71 dates) although the
# surrounding text speaks of 70 days -- confirm which is intended.
wave_1_dates = pd.date_range('2018-03-05', periods=71, freq='D')

# Template: same columns as the survey data (all empty) plus the study date and
# its 0-based study-day number; row i corresponds to study day i.
wave_1_rows = pd.DataFrame(columns=wave_1.columns)
wave_1_rows['wave_study_date'] = wave_1_dates
wave_1_rows['wave_study_day'] = wave_1_rows.index

# participant-level columns copied onto placeholder rows
wave_1_meta_cols = ['MitreID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave']

# per-participant frames are collected in a list and concatenated once at the
# end (DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop is quadratic)
wave_1_frames = []

for participant in wave_1['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_1[wave_1['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    # loop-invariant: the participant's metadata values. Each is expected to be
    # a single value; assigning several values to the one-row placeholder below
    # raises, exactly as the original code did.
    part_meta = {col: data_part[col].unique() for col in wave_1_meta_cols}

    day_frames = []
    for study_day, study_date in enumerate(wave_1_dates):
        #get participant data that matches that date
        # (.copy() so the columns added below are written to an independent frame)
        data_int = data_part.loc[data_part['date'] == study_date.date()].copy()

        if data_int.shape[0] > 0:
            #some dates had more than one survey sent; every row gets the same
            #study date / study day
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = study_day
        else:
            #fill in row corresponding to date when no survey was sent
            # (row study_day of the template is the row for this date)
            data_int = wave_1_rows.iloc[[study_day]].copy()
            #fill in meta data for participant
            for col, values in part_meta.items():
                data_int[col] = values
            data_int['ParticipantID'] = participant

        day_frames.append(data_int)

    # reset_index() keeps the previous index as an 'index' column, as before
    data_part_long = pd.concat(day_frames).reset_index()
    wave_1_frames.append(data_part_long)

if wave_1_frames:
    final_wave1 = pd.concat(wave_1_frames, axis=0, ignore_index=True)
else:
    # no participants in this wave
    final_wave1 = pd.DataFrame()

6ceb4ef3-6578-45cc-bc02-fa97614313e1
(94, 204)
f2e8ab49-ce1b-45f4-9632-cf596f1196bb
(105, 204)
674e9815-7dd1-47f9-9916-c6e53d7e1a36
(141, 204)
a1395d16-e27d-4ff3-80ba-2bb22049c19d
(132, 204)
efc0dabd-1c17-4a63-89ca-21f65250e43f
(114, 204)
fcb14a4c-1ffa-4315-872a-a38833459885
(133, 204)
7b0d87b0-c099-44eb-9815-67d51d73db9a
(94, 204)
e027002a-cf96-4132-a4ac-2003e945258d
(141, 204)
803be457-cee4-4f3f-9540-ff574c57c697
(73, 204)
dcc3e9ff-9c05-4e17-9798-fd5114be77a9
(141, 204)
4e541136-0db1-4749-b4c1-8dc03df3d4a5
(133, 204)
a9dfbe4d-4076-48c7-a72b-342fe4c12514
(118, 204)
2a8f6d94-88d4-4d72-b304-782563903dca
(126, 204)
8b13d979-315f-4357-8f0e-7c12df0a6ca8
(113, 204)
0adb7679-9d26-46e7-a134-11da293910f3
(142, 204)
235be35e-4e50-4996-80d7-ce701d2dca4b
(114, 204)
a0746eca-480e-41ac-bc92-d31568b2bdaf
(135, 204)
7607c6de-7244-4ba1-ab7c-5c4c4d7151ad
(135, 204)
f596b3ca-7b25-4632-b986-7b44448d3f2f
(115, 204)
5bb7f10e-ea7d-4af3-adae-c0d92e8a700d
(139, 204)
14121de2-f38e-4906-9fbe-b613549623fd
(139, 

In [43]:
# Wave 2: pull this wave's surveys out of the combined frame, ordered by date
is_wave_2 = data_final['Wave'] == 2
wave_2 = data_final.loc[is_wave_2].sort_values(by=['date'])
# Wave 2 started on 4/9/2018; rows dated on or before 4/8/2018 are pilot data
# and are dropped here
wave_2_pilot_cutoff = dt.date(2018, 4, 8)
wave_2 = wave_2.loc[wave_2['date'] > wave_2_pilot_cutoff]
print(wave_2.shape)

(14112, 204)


In [44]:
# Build, for every wave-2 participant, a frame that has at least one row for
# every calendar date of the wave: the participant's survey rows on dates where
# surveys exist, and a placeholder row (metadata only) on dates without any.
#
# NOTE(review): periods=71 produces study days 0-70 (71 dates) although the
# surrounding text speaks of 70 days -- confirm which is intended.
wave_2_dates = pd.date_range('2018-04-09', periods=71, freq='D')

# Template: same columns as the survey data (all empty) plus the study date and
# its 0-based study-day number; row i corresponds to study day i.
wave_2_rows = pd.DataFrame(columns=wave_2.columns)
wave_2_rows['wave_study_date'] = wave_2_dates
wave_2_rows['wave_study_day'] = wave_2_rows.index

# participant-level columns copied onto placeholder rows
wave_2_meta_cols = ['MitreID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave']

# per-participant frames are collected in a list and concatenated once at the
# end (DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop is quadratic)
wave_2_frames = []

for participant in wave_2['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_2[wave_2['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    # loop-invariant: the participant's metadata values. Each is expected to be
    # a single value; assigning several values to the one-row placeholder below
    # raises, exactly as the original code did.
    part_meta = {col: data_part[col].unique() for col in wave_2_meta_cols}

    day_frames = []
    for study_day, study_date in enumerate(wave_2_dates):
        #get participant data that matches that date
        # (.copy() so the columns added below are written to an independent frame)
        data_int = data_part.loc[data_part['date'] == study_date.date()].copy()

        if data_int.shape[0] > 0:
            #some dates had more than one survey sent; every row gets the same
            #study date / study day
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = study_day
        else:
            #fill in row corresponding to date when no survey was sent
            # (row study_day of the template is the row for this date)
            data_int = wave_2_rows.iloc[[study_day]].copy()
            #fill in meta data for participant
            for col, values in part_meta.items():
                data_int[col] = values
            data_int['ParticipantID'] = participant

        day_frames.append(data_int)

    # reset_index() keeps the previous index as an 'index' column, as before
    data_part_long = pd.concat(day_frames).reset_index()
    wave_2_frames.append(data_part_long)

if wave_2_frames:
    final_wave2 = pd.concat(wave_2_frames, axis=0, ignore_index=True)
else:
    # no participants in this wave
    final_wave2 = pd.DataFrame()

3cc3da4a-9b07-4215-ad8c-7ef222571856
(131, 204)
658adbe4-781c-45f9-92a7-14912fcd0701
(140, 204)
481aca07-ac33-4f38-98b7-3405dc3d60dc
(138, 204)
300a4242-7c38-4d7a-8911-2914ef452ab3
(82, 204)
51adda7d-b627-48e0-98be-72386fc84661
(125, 204)
9b4822a9-8506-4026-8f8f-803410f95a52
(122, 204)
2d05ec74-7279-4b26-b3ec-d3a2e414ba43
(90, 204)
0a85fd46-fada-434c-9f7a-08b81f9ed8e7
(124, 204)
23a4808f-4429-41d7-adcf-c96742871613
(126, 204)
997c6352-6724-4985-b48c-9c3f08437964
(135, 204)
f983485d-f954-4693-9c2c-981710c06dd0
(138, 204)
b31b6e69-0384-4d5d-8eac-5f512862690b
(137, 204)
02581754-36cd-4b23-85ea-bf995c6dec83
(102, 204)
ba240e43-900d-4477-8718-b9487ed24d7d
(134, 204)
425f3be7-a142-4256-af48-25ecedd8c425
(131, 204)
5375b0e9-db92-4189-801a-1e9dbe1d9f61
(140, 204)
f30085f9-fdb0-49fb-b242-2ad8286b242c
(64, 204)
4ed3225f-868e-43c6-9e57-0329d9ff9e9d
(121, 204)
8c09fe40-2c9e-49a1-afc9-a37138658d3b
(101, 204)
2aae82c5-03f9-42dd-974d-88230a9b75f7
(132, 204)
29e729f1-575c-4627-bf5a-e7bc28fdda26
(116, 

In [45]:
# Wave 3: pull this wave's surveys out of the combined frame, ordered by date
is_wave_3 = data_final['Wave'] == 3
wave_3 = data_final.loc[is_wave_3].sort_values(by=['date'])
# Wave 3 started on 5/4/2018; rows dated on or before 5/3/2018 are pilot data
# and are dropped here
wave_3_pilot_cutoff = dt.date(2018, 5, 3)
wave_3 = wave_3.loc[wave_3['date'] > wave_3_pilot_cutoff]
print(wave_3.shape)

(5564, 204)


In [46]:
# Build, for every wave-3 participant, a frame that has at least one row for
# every calendar date of the wave: the participant's survey rows on dates where
# surveys exist, and a placeholder row (metadata only) on dates without any.
#
# NOTE(review): periods=71 produces study days 0-70 (71 dates) although the
# surrounding text speaks of 70 days -- confirm which is intended.
wave_3_dates = pd.date_range('2018-05-04', periods=71, freq='D')

# Template: same columns as the survey data (all empty) plus the study date and
# its 0-based study-day number; row i corresponds to study day i.
wave_3_rows = pd.DataFrame(columns=wave_3.columns)
wave_3_rows['wave_study_date'] = wave_3_dates
wave_3_rows['wave_study_day'] = wave_3_rows.index

# participant-level columns copied onto placeholder rows
wave_3_meta_cols = ['MitreID', 'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave']

# per-participant frames are collected in a list and concatenated once at the
# end (DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop is quadratic)
wave_3_frames = []

for participant in wave_3['ParticipantID'].unique():
    #get data corresponding to participant
    data_part = wave_3[wave_3['ParticipantID'] == participant]
    print(participant)
    print(data_part.shape)

    # loop-invariant: the participant's metadata values. Each is expected to be
    # a single value; assigning several values to the one-row placeholder below
    # raises, exactly as the original code did.
    part_meta = {col: data_part[col].unique() for col in wave_3_meta_cols}

    day_frames = []
    for study_day, study_date in enumerate(wave_3_dates):
        #get participant data that matches that date
        # (.copy() so the columns added below are written to an independent frame)
        data_int = data_part.loc[data_part['date'] == study_date.date()].copy()

        if data_int.shape[0] > 0:
            #some dates had more than one survey sent; every row gets the same
            #study date / study day
            data_int['wave_study_date'] = study_date
            data_int['wave_study_day'] = study_day
        else:
            #fill in row corresponding to date when no survey was sent
            # (row study_day of the template is the row for this date)
            data_int = wave_3_rows.iloc[[study_day]].copy()
            #fill in meta data for participant
            for col, values in part_meta.items():
                data_int[col] = values
            data_int['ParticipantID'] = participant

        day_frames.append(data_int)

    # reset_index() keeps the previous index as an 'index' column, as before
    data_part_long = pd.concat(day_frames).reset_index()
    wave_3_frames.append(data_part_long)

if wave_3_frames:
    final_wave3 = pd.concat(wave_3_frames, axis=0, ignore_index=True)
else:
    # no participants in this wave
    final_wave3 = pd.DataFrame()

a575b34b-0787-4762-a06a-5604de6d005b
(142, 204)
0271c478-a56a-4c09-ab91-9743184dd71b
(130, 204)
16812063-e5df-4657-b86c-5b55a0c9ffe6
(131, 204)
eb4e1be4-29de-4120-9727-0ce8041da479
(139, 204)
883aca61-af06-4c14-ba58-8d2e4b75bfda
(139, 204)
a1623554-43d6-4038-b28a-bd74a96b9c97
(137, 204)
967b5d47-3ff4-4364-b2e8-b2243c490d7d
(134, 204)
c7492565-48cd-4b52-af25-3aee61a391ad
(138, 204)
663de5e8-dfc8-42ad-8410-c196f6345bec
(141, 204)
2ec7547d-bd0a-4fbf-92a8-3a4e747a53f9
(141, 204)
3bdb1a29-023b-4fb5-8f49-6d285446179d
(104, 204)
0b45e9c1-eba5-46d3-b0a4-cdb5aa4dc736
(123, 204)
6f532efb-8e13-46ca-997d-2d934b5e5a37
(138, 204)
a2676b6d-a28f-46bf-b3f0-6a2e735ae251
(132, 204)
f5c8d104-4839-4aca-b2f5-8c2fc4b98af9
(141, 204)
f732e878-e538-49b8-97d3-49ca29e9a3f8
(140, 204)
e8606fff-0ac6-4832-bb79-323b89ebcf2e
(135, 204)
43d50656-b760-4700-8ff1-da45f17bba09
(130, 204)
e5594e40-9999-45b6-ac2f-bbd063395cd0
(85, 204)
e89b1ea7-a2ea-4f2f-ae5a-9a9d29af8639
(124, 204)
96a272fc-166f-4fe4-8aeb-3874264f3b54
(132

In [47]:
# Stack the three per-wave frames row-wise into one frame with a fresh 0..n-1 index
wave_frames = [final_wave1, final_wave2, final_wave3]
data_final_combined = pd.concat(wave_frames, ignore_index=True, axis=0)
print(data_final_combined.shape)
data_final_combined.head()

(26512, 207)


Unnamed: 0,index,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time,date,time,Timesent,completed,start_delay,time_to_complete,activity_num,location_num,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10,wave_study_date,wave_study_day
0,13506,SG1010,6ceb4ef3-6578-45cc-bc02-fa97614313e1,Floatpool,iPhone,Female,Night shift,1,b879875a-4b82-4cc4-9d14-e0f4d4039c88,psych_flex,2018-03-05 11:13:04,2018-03-05,11:13:04,11:13:04,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-05,0
1,2222,SG1010,6ceb4ef3-6578-45cc-bc02-fa97614313e1,Floatpool,iPhone,Female,Night shift,1,2as3VAYYs7s0faa,job,2018-03-05 08:49:45,2018-03-05,08:49:45,6:00am,,,107.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,1.0,,,0.0,,,5.0,2.0,3.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-05,0
2,2223,SG1010,6ceb4ef3-6578-45cc-bc02-fa97614313e1,Floatpool,iPhone,Female,Night shift,1,322PVRSqYIxEDc0,health,2018-03-06 18:01:07,2018-03-06,18:01:07,6:00pm,,,129.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,1.0,,,0.0,,,4.0,2.0,3.0,1.0,5.0,2.0,1.0,1.0,1.0,1.0,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,0.0,0.0,6.0,,,,,,,,,,,,,,,,,,,,,2018-03-06,1
3,22745,SG1010,6ceb4ef3-6578-45cc-bc02-fa97614313e1,Floatpool,iPhone,Female,Night shift,1,0926a41b-6890-4143-96ad-2dc0ea6f1f5d,engage_psycap,2018-03-06 13:24:18,2018-03-06,13:24:18,13:24:18,1.0,7922.0,168.0,-1.0,0.0,0.0,laying down,4.0,4.0,4.0,7.0,7.0,7.0,7.0,3.0,7.0,4.0,7.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,6.0,4.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,4.0,6.416667,6.5,3.25,1.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-06,1
4,13507,SG1010,6ceb4ef3-6578-45cc-bc02-fa97614313e1,Floatpool,iPhone,Female,Night shift,1,29159257-93a3-471f-8a3d-a9c34dc8a65c,psych_flex,2018-03-07 09:22:09,2018-03-07,09:22:09,09:22:09,1.0,19930.0,121.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,homework\r\n,5.0,4.0,,,,,,,,,,,,4.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-07,2


### final preprocess of combined data

In [57]:
# ParticipantIDs of the pilot participants, who are excluded from the final data
pilot_participant_ids = [
    '0adb7679-9d26-46e7-a134-11da293910f3',
    '803be457-cee4-4f3f-9540-ff574c57c697',
    '9519432e-bf05-4f6c-80d7-f61e930f14ba',
    'c7acafcf-f5dd-411e-b90c-988279852fb8',
    'fae38580-1e2d-4491-9de5-7b75703ba4ce',
]

# keep every row whose ParticipantID is not one of the pilot IDs
is_pilot = data_final_combined['ParticipantID'].isin(pilot_participant_ids)
data_no_pilot = data_final_combined[~is_pilot]

print(data_no_pilot.shape)

(25897, 207)


In [58]:
#make new column corresponding to the number in series of survey sent for each participant for each survey type
#for psych_flex and engage_psycap this will be count of total surveys sent to each participant
#for MGT surveys this will be a count of the surveys completed for each participant
#
# survey_count is 0-based: within each (MitreID, survey_type) pair the rows are
# numbered 0, 1, 2, ... in the sorted row order established below.

data_no_pilot = data_no_pilot.sort_values(by=['MitreID', 'survey_type', 'wave_study_day'])

part_id = data_no_pilot['MitreID'].unique()
survey_types = data_no_pilot['survey_type'].unique()

# Rows with a missing MitreID or survey_type belong to no group and keep NaN
# (e.g. placeholder rows that carry no survey_type).
has_keys = data_no_pilot['MitreID'].notna() & data_no_pilot['survey_type'].notna()

# One grouped pass instead of re-masking the whole frame for every
# participant x survey-type pair; cumcount numbers rows in their current order.
running_count = (
    data_no_pilot[has_keys]
    .groupby(['MitreID', 'survey_type'], sort=False)
    .cumcount()
)

# float column so that NaN can mark rows without a count
data_no_pilot['survey_count'] = np.nan
data_no_pilot.loc[has_keys, 'survey_count'] = running_count.values.astype('float64')

In [61]:
# Drop surveys beyond the allowed number for each survey type.
# survey_count starts at 0, so a limit of N keeps counts 0..N-1 (N surveys):
# e.g. psych_flex keeps counts up to 49 (50 surveys), engage_psycap up to 19 (20 surveys).
# Rows whose survey_type is not listed here are dropped as well.
survey_limits = {
    'engage_psycap': 20,
    'psych_flex': 50,
    'job': 30,
    'health': 40,
    'personality': 10,
}

# per-row limit; NaN for unlisted survey types, which makes the comparison False
limit_per_row = data_no_pilot['survey_type'].map(survey_limits)
final_data = data_no_pilot[data_no_pilot['survey_count'] < limit_per_row]

print(final_data.shape)

(25338, 208)


In [62]:
# Convert the columns that should hold numbers to a numeric dtype
numeric_columns = [
    'activity_num', 'location_num',
    'pf_mgt',
    'context1', 'context2', 'context3', 'context4',
    'stress', 'anxiety', 'work',
]

for column in numeric_columns:
    final_data[column] = pd.to_numeric(final_data[column])

In [63]:
# Weekday name of each wave_study_date, stored as an ordered categorical that
# runs Monday -> Sunday
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
weekday_names = final_data['wave_study_date'].dt.day_name()
final_data['day_of_week'] = pd.Categorical(weekday_names, categories=weekday_order, ordered=True)

In [64]:
#create new column corresponding to if they are working or not (as assessed by 'activity_num' == 0 in psych_flex and engage or 
# activity_num == 1 for MGT surveys)
#
# Coding of the new 'working' column (1.0 = working, 0.0 = not working, NaN = unknown):
#   psych_flex / engage_psycap : activity_num == 0          -> 1.0
#                                activity_num == -1 or > 0  -> 0.0
#   job / health / personality : activity_num == 1          -> 1.0
#                                activity_num > 1           -> 0.0
#   anything else (missing activity_num, other values, other survey types) -> NaN

survey_kind = final_data['survey_type']
activity = final_data['activity_num']

is_pf_or_engage = survey_kind.isin(['psych_flex', 'engage_psycap'])
is_mgt = survey_kind.isin(['job', 'health', 'personality'])

# Vectorized replacement for a row-by-row iterrows() loop; np.select takes the
# first matching condition per row, and comparisons with NaN are False so
# missing activity_num falls through to the NaN default.
working = np.select(
    [
        is_pf_or_engage & (activity == 0.0),
        is_pf_or_engage & ((activity == -1.0) | (activity > 0.0)),
        is_mgt & (activity == 1.0),
        is_mgt & (activity > 1.0),
    ],
    [1.0, 0.0, 1.0, 0.0],
    default=np.nan,
).tolist()

print(len(working))
final_data['working'] = working

25338


In [65]:
#create new column corresponding to if they are at work or not (as assessed by 'location_num' == 1 in engage or 
#location_num == 2 in MGT surveys; psych_flex did not ask about location)
#
# Coding of the new 'at_work' column (1.0 = at work, 0.0 = elsewhere, NaN = unknown):
#   engage_psycap              : location_num == 1 -> 1.0, any other number -> 0.0
#   job / health / personality : location_num == 2 -> 1.0, any other number -> 0.0
#   anything else (missing location_num, psych_flex, other survey types) -> NaN

survey_kind = final_data['survey_type']
location = final_data['location_num']

is_engage = survey_kind == 'engage_psycap'
is_mgt = survey_kind.isin(['job', 'health', 'personality'])

# Vectorized replacement for a row-by-row iterrows() loop; np.select takes the
# first matching condition per row, and comparisons with NaN are False so
# missing location_num falls through to the NaN default.
at_work = np.select(
    [
        is_engage & (location == 1.0),
        is_engage & ((location < 1.0) | (location > 1.0)),
        is_mgt & (location == 2.0),
        is_mgt & ((location > 2.0) | (location < 2.0)),
    ],
    [1.0, 0.0, 1.0, 0.0],
    default=np.nan,
).tolist()

print(len(at_work))
final_data['at_work'] = at_work

25338


### Add in demog, pre, post data and clean

In [66]:
#add demog, pre, post info 

# NOTE(review): absolute, machine-specific path -- consider moving it next to
# the other path_* definitions at the top of the notebook.
path_demog_prepost = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/Clinical projects/TILES/final_data/Demog, PRE, PST survey composites.csv'

#read in data_demog_prepost, contains demographic information and pre/post questionnaires
# (read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed)
data_demog_prepost = pd.read_csv(path_demog_prepost)
print('Original demog_prepost shape:\n', data_demog_prepost.shape, '\n')
#replace blanks with nans (only cells that are exactly a single space are replaced)
data_demog_prepost = data_demog_prepost.replace(' ', np.nan)
#ensure no replicate ID (eg one row per participant in study)
print('Original demog_prepost unique IDs:\n', data_demog_prepost['ID'].unique().shape, '\n')
print('Original demog_prepost missing value couts:\n', data_demog_prepost.isnull().sum(), '\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line rather than passed to print() (which would print "None")
print('Original demog_prepost data types:')
data_demog_prepost.info()
print()
data_demog_prepost.head()

Original demog_prepost shape:
 (212, 179) 

Original demog_prepost unique IDs:
 (212,) 

Original demog_prepost missing value couts:
 ID                                                                0
date_time                                                         0
shipley.vocab                                                     0
shipley.abs                                                       0
irb                                                               0
itp                                                               0
ocb                                                               0
inter.deviance                                                    0
org.deviance                                                      0
extraversion                                                      0
agreeableness                                                     0
conscientiousness                                                 0
neuroticism                                       

Unnamed: 0,ID,date_time,shipley.vocab,shipley.abs,irb,itp,ocb,inter.deviance,org.deviance,extraversion,agreeableness,conscientiousness,neuroticism,openness,pos.affect,neg.affect,stai.trait,audit,gats.status,gats.quantity,gats.quantity.sub,ipaq,psqi,GenInst,gender,age,bornUS,country,lang,englyrs,educ,jobstat,occup,occup_TEXT,supervise,quantsup,size,duration,income,record_id,redcap_event_name,demographics_timestamp,race,ethnic,relationship,pregnant,children,housing,household___1,household___2,household___3,household___4,household___5,household___6,household___7,currentposition,position_other,certifications,nurseyears,shift,hours,overtime,commute_type,commute_time,extrajob,extrahours,student,demographics_complete,rand_36_item_sf_health_survey_instrument_version_1_timestamp,rand_36_item_sf_health_survey_instrument_version_1_complete,satisfaction_with_life_scale_swls_timestamp,satisfaction_with_life_scale_swls_complete,perceived_stress_scale_pss_timestamp,perceived_stress_scale_pss_complete,mpfi24_timestamp,mpfi24_01,mpfi24_02,mpfi24_03,mpfi24_04,mpfi24_05,mpfi24_06,mpfi24_07,mpfi24_08,mpfi24_09,mpfi24_10,mpfi24_11,mpfi24_12,mpfi24_13,mpfi24_14,mpfi24_15,mpfi24_16,mpfi24_17,mpfi24_18,mpfi24_19,mpfi24_20,mpfi24_21,mpfi24_22,mpfi24_23,mpfi24_24,General_Health,Physical_Functioning,Limits_Physical,Emotional_Wellbeing,Limits_Emotional,Social_Functioning,Pain,energy,fatigue,LifeSatisfaction,Stress,WAAQ,Flexibility,Inflexibility,Acceptance,Awareness,Self_as_Context,Defusion,Values,Action,Avoidance,LackofAwareness,Self_as_Content,Fusion,LackofValues,Inaction,Engagement,Engage_Vigor,Engage_Dedication,Engage_Absorbtion,PsyCap,Psycap_Hope,Psycap_Efficacy,Psycap_Reslilience,Psycap_Optimism,challengestressors,Hindrancestressors,poststudy_survey_timestamp_post,General_Health_post,Physical_Functioning_post,Limits_Physical_post,Emotional_Wellbeing_post,Limits_Emotional_post,Social_Functioning_post,Pain_post,energy_post,fatigue_post,LifeSatisfaction_post,Stress_post,WAAQ_post,Flexibility
_post,Inflexibility_post,Acceptance_post,Awareness_post,Self_as_Context_post,Defusion_post,Values_post,Action_post,Avoidance_post,LackofAwareness_post,Self_as_Content_post,Fusion_post,LackofValues_post,Inaction_post,Engagement_post,Engage_Vigor_post,Engage_Dedication_post,Engage_Absorbtion_post,PsyCap_post,Psycap_Hope_post,Psycap_Efficacy_post,Psycap_Reslilience_post,Psycap_Optimism_post,challengestressors_post,Hindrancestressors_post,PsyFlexTot,PsyFlexSDTot,Context_Neg_Tot,Context_Pos_Tot,Context_All_Tot
0,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52.0,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100.0,75.0,100,50,60,4.4,1.75,5.57142857142857,4.0,2.33333333333333,4.0,4.0,4.0,4.5,3.5,4.0,3.5,2,3,2.5,1,2.0,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5.0,5.0,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0543928293220421,1.66666666666667,1.5,2.2
1,SD1002,2/22/2018,32,12,49,5.0,89,7,12,4.666667,4.416667,3.416667,1.5,4.083333,45,11,23,1.0,never,0.0,,17880.0,6.0,1,2,56,2,Viet Nam,2,38.0,4,2,10,,1,2.0,4,10,7,11,prestudy_survey_arm_1,2018-02-27 11:33:22,3,0,3,0,0,1,0,0,0,1,0,0,0,1,,6,12,1,36,24,2,3,0,0,9,2,2018-02-27 11:37:47,2,2018-02-27 11:38:30,2,2018-02-27 11:41:05,2,2018-02-27 11:43:21,5,5,6,6,6,6,5,5,5,5,5,5,5,5,2,2,2,2,1,1,1,1,1,1,95,95,100,85,100.0,100.0,90,90,80,5.2,1.125,6.28571428571429,5.33333333333333,2.0,5.0,6.0,6.0,5.0,5.0,5.0,5.0,2,2,1.0,1,1.0,6.0,6.0,6.0,6.0,5.75,5.25,6.0,6.0,6.0,4.375,1.25,2018-05-20 15:28:02,90.0,100.0,100.0,60.0,100.0,100.0,80.0,60.0,60.0,4.2,1.85714285714286,6.0,4.91666666666667,2.83333333333333,5.0,5.0,5.0,5.0,4.5,5.0,5.0,3.5,2.0,2.5,2.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,3.875,3.25,4.27315541601256,0.292940115660798,1.28571428571429,1.47916666666667,1.63265306122449
2,SD1003,2/23/2018,35,17,49,4.666667,52,11,19,2.75,4.333333,4.666667,2.416667,3.833333,32,13,40,1.0,never,0.0,,9522.0,7.0,1,2,25,1,,1,,4,2,10,,2,,4,2,4,9,prestudy_survey_arm_1,2018-02-26 21:23:59,3,0,1,0,0,2,0,0,0,0,1,0,0,1,,7,1,1,36,12,2,1,0,0,9,2,2018-02-26 21:26:22,2,2018-02-26 21:26:36,2,2018-02-26 21:27:24,2,2018-02-26 21:29:24,3,3,5,5,6,5,5,5,5,4,4,5,6,6,2,2,3,3,3,1,2,2,1,2,85,100,100,60,100.0,87.5,90,50,70,6.0,1.0,4.57142857142857,4.58333333333333,2.75,3.0,5.0,5.5,5.0,4.5,4.5,6.0,2,3,2.0,2,1.5,4.77777777777778,4.33333333333333,5.0,5.0,4.41666666666667,4.75,3.0,5.0,5.0,3.75,2.25,2018-05-18 14:53:27,85.0,100.0,100.0,50.0,66.6666666666667,75.0,90.0,60.0,40.0,6.0,1.375,5.57142857142857,4.83333333333333,2.91666666666667,4.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,3.0,3.5,2.0,2.0,3.55555555555556,3.0,4.0,3.66666666666667,4.5,5.0,3.0,5.0,5.0,4.5,3.0,4.25384615384615,0.5348048070283,5.7,3.26,8.96
3,SD1004,2/23/2018,27,14,46,4.0,60,10,26,2.25,4.166667,3.416667,3.416667,3.75,24,14,47,5.0,never,0.0,,7224.0,8.0,1,2,26,1,,1,,3,2,10,,2,,4,7,2,19,prestudy_survey_arm_1,2018-03-01 12:54:07,6,1,1,0,0,2,1,0,0,0,0,0,0,2,,7,5,1,36,0,6,1,0,0,9,2,2018-03-01 12:58:25,2,2018-03-01 12:59:04,2,2018-03-01 13:00:40,2,2018-03-01 13:03:28,3,4,4,4,4,4,3,4,5,4,5,6,4,4,3,3,2,2,1,1,1,1,2,2,85,95,100,40,66.6666666666667,100.0,90,40,40,4.8,1.5,4.28571428571429,4.16666666666667,2.16666666666667,3.5,4.0,4.0,3.5,4.5,5.5,4.0,3,2,1.0,1,2.0,2.66666666666667,2.33333333333333,3.33333333333333,2.33333333333333,3.5,3.75,1.66666666666667,4.66666666666667,4.0,4.125,2.625,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.29600301659125,0.613328341010413,1.0,1.0,1.03030303030303
4,SD1005,2/23/2018,19,12,39,4.0,63,7,15,3.083333,3.916667,2.666667,3.416667,2.75,31,23,42,2.0,current,1.0,1.0,23676.0,8.0,1,2,31,2,mexico,2,25.0,5,2,10,,2,,4,10,4,8,prestudy_survey_arm_1,2018-02-24 21:27:46,1,1,2,0,2,2,1,0,1,1,0,0,0,2,,7,10,1,36,0,2,2,0,0,9,2,2018-02-24 21:37:49,2,2018-02-24 21:38:47,2,2018-02-24 21:40:55,2,2018-02-24 21:45:48,3,4,4,3,4,4,3,3,5,5,5,5,4,3,2,2,1,1,2,2,1,1,1,3,80,100,100,50,66.6666666666667,62.5,100,40,60,5.2,1.875,6.85714285714286,4.0,1.91666666666667,3.5,3.5,4.0,3.0,5.0,5.0,3.5,2,1,2.0,1,2.0,4.33333333333333,3.66666666666667,5.0,4.33333333333333,5.58333333333333,5.5,5.66666666666667,5.66666666666667,5.5,4.625,3.0,2018-06-04 18:40:36,70.0,100.0,100.0,30.0,0.0,25.0,90.0,30.0,30.0,4.0,2.375,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,3.0,3.79487179487179,0.471055719765996,4.20833333333333,2.91666666666667,7.125


In [67]:
#add the information contained in data_demog_prepost to final_data
#for each participant, the matching data_demog_prepost row(s) are repeated once per
#survey row of that participant and placed side by side with those survey rows
#(e.g. replicate data_demog_prepost so same length as final_data for each participant)

participants = final_data['MitreID'].unique()

# per-participant frames are collected and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and growing a frame inside a loop
# is quadratic)
demog_frames = []

for part in participants:
    # survey rows and demographic row(s) of this participant
    surveys_part = final_data[final_data['MitreID'] ==  part]
    demog_part = data_demog_prepost[data_demog_prepost['ID'] == part]

    # repeat the demographic row(s) once per survey row
    df_part_long = pd.concat([demog_part]*len(surveys_part), ignore_index=True)
    # both sides get a fresh 0..n-1 index (the old one is kept as a column) so
    # that the column-wise concat pairs rows by position
    df_part_long = df_part_long.reset_index()
    psych_flex_demog_int = pd.concat([df_part_long, surveys_part.reset_index()], axis = 1)
    demog_frames.append(psych_flex_demog_int)

if demog_frames:
    final_data_complete = pd.concat(demog_frames)
else:
    # final_data had no participants
    final_data_complete = pd.DataFrame()

#confirm the two data tables are now the same length
print('final_data and final_data_complete are the same length:', final_data.shape[0] == final_data_complete.shape[0])
print(final_data.shape[0])
print(final_data_complete.shape[0])
#every row's demographic ID should equal its survey MitreID
print('does the math make sense?', final_data_complete.shape[0] == (final_data_complete['ID'].values == final_data_complete['MitreID'].values).sum())
final_data_complete.head()

final_data and final_data_complete are the same length: True
25338
25338
does the math make sense? True


Unnamed: 0,index,ID,date_time,shipley.vocab,shipley.abs,irb,itp,ocb,inter.deviance,org.deviance,extraversion,agreeableness,conscientiousness,neuroticism,openness,pos.affect,neg.affect,stai.trait,audit,gats.status,gats.quantity,gats.quantity.sub,ipaq,psqi,GenInst,gender,age,bornUS,country,lang,englyrs,educ,jobstat,occup,occup_TEXT,supervise,quantsup,size,duration,income,record_id,redcap_event_name,demographics_timestamp,race,ethnic,relationship,pregnant,children,housing,household___1,household___2,household___3,household___4,household___5,household___6,household___7,currentposition,position_other,certifications,nurseyears,shift,hours,overtime,commute_type,commute_time,extrajob,extrahours,student,demographics_complete,rand_36_item_sf_health_survey_instrument_version_1_timestamp,rand_36_item_sf_health_survey_instrument_version_1_complete,satisfaction_with_life_scale_swls_timestamp,satisfaction_with_life_scale_swls_complete,perceived_stress_scale_pss_timestamp,perceived_stress_scale_pss_complete,mpfi24_timestamp,mpfi24_01,mpfi24_02,mpfi24_03,mpfi24_04,mpfi24_05,mpfi24_06,mpfi24_07,mpfi24_08,mpfi24_09,mpfi24_10,mpfi24_11,mpfi24_12,mpfi24_13,mpfi24_14,mpfi24_15,mpfi24_16,mpfi24_17,mpfi24_18,mpfi24_19,mpfi24_20,mpfi24_21,mpfi24_22,mpfi24_23,mpfi24_24,General_Health,Physical_Functioning,Limits_Physical,Emotional_Wellbeing,Limits_Emotional,Social_Functioning,Pain,energy,fatigue,LifeSatisfaction,Stress,WAAQ,Flexibility,Inflexibility,Acceptance,Awareness,Self_as_Context,Defusion,Values,Action,Avoidance,LackofAwareness,Self_as_Content,Fusion,LackofValues,Inaction,Engagement,Engage_Vigor,Engage_Dedication,Engage_Absorbtion,PsyCap,Psycap_Hope,Psycap_Efficacy,Psycap_Reslilience,Psycap_Optimism,challengestressors,Hindrancestressors,poststudy_survey_timestamp_post,General_Health_post,Physical_Functioning_post,Limits_Physical_post,Emotional_Wellbeing_post,Limits_Emotional_post,Social_Functioning_post,Pain_post,energy_post,fatigue_post,LifeSatisfaction_post,Stress_post,WAAQ_post,Flexi
bility_post,Inflexibility_post,Acceptance_post,Awareness_post,Self_as_Context_post,Defusion_post,Values_post,Action_post,Avoidance_post,LackofAwareness_post,Self_as_Content_post,Fusion_post,LackofValues_post,Inaction_post,Engagement_post,Engage_Vigor_post,Engage_Dedication_post,Engage_Absorbtion_post,PsyCap_post,Psycap_Hope_post,Psycap_Efficacy_post,Psycap_Reslilience_post,Psycap_Optimism_post,challengestressors_post,Hindrancestressors_post,PsyFlexTot,PsyFlexSDTot,Context_Neg_Tot,Context_Pos_Tot,Context_All_Tot,level_0,index.1,MitreID,ParticipantID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,survey_id,survey_type,date_time.1,date,time,Timesent,completed,start_delay,time_to_complete,activity_num,location_num,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,activity,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,
dalal_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10,wave_study_date,wave_study_day,survey_count,day_of_week,working,at_work
0,0,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2,5189,21938,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,36ce098b-6905-4258-858a-8bd8a5fba129,engage_psycap,2018-03-07 22:24:24,2018-03-07,22:24:24,22:24:24,1.0,212.0,229.0,7.0,0.0,0.0,7.0,6.0,3.0,3.0,6.0,7.0,6.0,6.0,5.0,6.0,5.0,5.0,5.0,6.0,6.0,7.0,6.0,4.0,5.0,6.0,6.0,5.0,5.0,6.0,2.0,2.0,1.0,2.0,4.0,5.833333,5.25,5.5,1.75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-07,2,0.0,Wednesday,0.0,0.0
1,1,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2,5192,21939,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,b2559df1-6379-49f7-ba40-e3f286fbb0c1,engage_psycap,2018-03-09 00:40:35,2018-03-09,00:40:35,00:40:35,1.0,106.0,2189.0,7.0,0.0,0.0,7.0,6.0,3.0,5.0,6.0,6.0,6.0,6.0,6.0,5.0,6.0,6.0,5.0,6.0,6.0,7.0,6.0,5.0,5.0,6.0,6.0,5.0,4.0,6.0,4.0,2.0,5.0,2.0,4.666667,5.916667,5.5,5.25,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-09,4,1.0,Friday,0.0,0.0
2,2,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2,5199,21940,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,0465420b-495c-41ba-8a31-6195431d33cf,engage_psycap,2018-03-12 22:08:17,2018-03-12,22:08:17,22:08:17,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-12,7,2.0,Monday,,
3,3,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2,5203,21941,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,554c06ab-7f7e-4f34-9457-cf2ca5c43f69,engage_psycap,2018-03-15 23:32:46,2018-03-15,23:32:46,23:32:46,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-15,10,3.0,Thursday,,
4,4,SD1001,2/20/2018,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,prestudy_survey_arm_1,2018-02-21 12:42:29,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,2018-02-21 12:49:43,2,2018-02-21 12:51:05,2,2018-02-21 12:55:30,2,2018-02-21 13:02:33,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2,5207,21942,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,7 South ICU,Android,Male,Day shift,1,1d9c54d4-ddb2-4777-b5d3-687d19ce697a,engage_psycap,2018-03-19 19:34:09,2018-03-19,19:34:09,19:34:09,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-03-19,14,4.0,Monday,,


In [85]:
# Some survey timestamps hold the placeholder '[not completed]' instead of a
# date. Swap it for NaN (frame-wide, exactly as before) so the timestamp
# columns can be converted to datetimes; NaN becomes NaT after conversion.
final_data_complete = final_data_complete.replace('[not completed]', np.nan)

# Change the survey timestamp columns from objects to datetimes.
timestamp_cols = [
    'demographics_timestamp',
    'rand_36_item_sf_health_survey_instrument_version_1_timestamp',
    'satisfaction_with_life_scale_swls_timestamp',
    'perceived_stress_scale_pss_timestamp',
    'mpfi24_timestamp',
]
for timestamp_col in timestamp_cols:
    final_data_complete[timestamp_col] = final_data_complete[timestamp_col].astype('datetime64[ns]')

# Compute the time taken to complete the demographics surveys: elapsed time from
# the demographics timestamp to the mpfi24 timestamp, stored as float seconds.
# .dt.total_seconds() is used instead of .astype('timedelta64[s]') because the
# latter only yields float seconds on pandas < 2.0; on newer versions it keeps a
# timedelta dtype, which would change how this column is stored and exported.
final_data_complete['time_to_complete_demogs'] = (
    final_data_complete['mpfi24_timestamp'] - final_data_complete['demographics_timestamp']
).dt.total_seconds()

In [99]:
# Organize variables: select the columns to keep and put them in their final
# order. Indexing with a list of names returns only those columns, in the order
# listed, so any column not named here is dropped from final_data_complete
# (presumably intentional for leftovers such as 'index' / 'level_0' -- confirm).
# Every name must already exist in the frame (including the derived
# 'time_to_complete_demogs'); a missing name raises KeyError.
final_data_complete = final_data_complete[['MitreID', 'ParticipantID', 'ID', 
        'PrimaryUnit', 'SmartPhone', 'Sex', 'Shift', 'Wave','GenInst',
       'gender', 'age', 'bornUS', 'country', 'lang', 'englyrs', 'educ',
       'jobstat', 'occup', 'occup_TEXT', 'supervise', 'quantsup', 'size',
       'duration', 'income', 'record_id', 
        'race', 'ethnic', 'relationship',
       'pregnant', 'children', 'housing', 'household___1',
       'household___2', 'household___3', 'household___4', 'household___5',
       'household___6', 'household___7', 'currentposition',
       'position_other', 'certifications', 'nurseyears', 'shift', 'hours',
       'overtime', 'commute_type', 'commute_time', 'extrajob',
       'extrahours', 'student', 'demographics_complete', 'time_to_complete_demogs',
       'survey_id', 'survey_type', 'date', 'time', 'Timesent',
       'completed', 'start_delay', 'time_to_complete',
       'location_num', 'wave_study_date', 'wave_study_day',
       'survey_count', 'day_of_week', 'working', 'at_work', 
        'engage_location', 'engage_activity', 'engage_3',
       'engage_4', 'engage_5', 'engage_6', 'engage_7', 'engage_8',
       'engage_9', 'engage_10', 'engage_11', 'engage_12', 'engage_13',
       'engage_14', 'engage_15', 'engage_16', 'engage_17', 'engage_18',
       'engage_19', 'engage_20', 'engage_21', 'engage_22', 'engage_23',
       'engage_24', 'engage_25', 'engage_26', 'engage_27', 'engage_28',
       'engage_29', 'engage_mgt', 'psycap_mgt', 'support_mgt',
       'challenge_mgt', 'hindrance_mgt', 'activity', 'activity_num', 'pf_03', 'pf_04',
       'pf_05', 'pf_06', 'pf_07', 'pf_08', 'pf_09', 'pf_10', 'pf_11',
       'pf_12', 'pf_13', 'pf_14', 'pf_15', 'pf_mgt', 'exp_0', 'exp_1',
       'exp_2', 'exp_3', 'exp_4', 'exp_5', 'exp_6', 'exp_7', 'exp_8',
       'exp_9', 'exp_10', 'exp_11', 'exp_12', 'exp_13', 'exp_neg',
       'exp_pos', 'exp_neut', 'context1', 'context2', 'context2_10_TEXT',
       'context2_TEXT', 'context3', 'context3_7_TEXT', 'context3_TEXT',
       'context4', 'context4_3_TEXT', 'context4_TEXT', 'pand1', 'pand2',
       'pand3', 'pand4', 'pand5', 'pand6', 'pand7', 'pand8', 'pand9',
       'pand10', 'pand_pand1', 'pand_pand2', 'pand_pand3', 'pand_pand4',
       'pand_pand5', 'pand_pand6', 'pand_pand7', 'pand_pand8',
       'pand_pand9', 'pand_pand10', 'anxiety', 'stress', 'work', 'irbd1',
       'irbd2', 'irbd3', 'irbd4', 'irbd5', 'irbd6', 'irbd7', 'irbd_irbd1',
       'irbd_irbd2', 'irbd_irbd3', 'irbd_irbd4', 'irbd_irbd5',
       'irbd_irbd6', 'irbd_irbd7', 'itpd1', 'itpd2', 'itpd3',
       'itpd_itpd1', 'itpd_itpd2', 'itpd_itpd3', 'dalal1', 'dalal2',
       'dalal3', 'dalal4', 'dalal5', 'dalal6', 'dalal7', 'dalal8',
       'dalal9', 'dalal10', 'dalal11', 'dalal12', 'dalal13', 'dalal14',
       'dalal15', 'dalal16', 'dalal_dalal1', 'dalal_dalal2',
       'dalal_dalal3', 'dalal_dalal4', 'dalal_dalal5', 'dalal_dalal6',
       'dalal_dalal7', 'dalal_dalal8', 'dalal_dalal9', 'dalal_dalal10',
       'dalal_dalal11', 'dalal_dalal12', 'dalal_dalal13', 'dalal_dalal14',
       'dalal_dalal15', 'dalal_dalal16', 'alc1', 'alc2_1', 'alc2_2',
       'alc2_3', 'tob1', 'tob2_1', 'tob2_2', 'tob2_3', 'tob2_4', 'tob2_5',
       'tob2_6', 'tob2_7', 'ex1_1', 'ex2_1', 'sleep_1', 'bfid1', 'bfid2',
       'bfid3', 'bfid4', 'bfid5', 'bfid6', 'bfid7', 'bfid8', 'bfid9',
       'bfid10', 'bfid_bfid1', 'bfid_bfid2', 'bfid_bfid3', 'bfid_bfid4',
       'bfid_bfid5', 'bfid_bfid6', 'bfid_bfid7', 'bfid_bfid8',
       'bfid_bfid9', 'bfid_bfid10',
       'demographics_timestamp',
       'rand_36_item_sf_health_survey_instrument_version_1_timestamp',
       'satisfaction_with_life_scale_swls_timestamp',
       'perceived_stress_scale_pss_timestamp', 'mpfi24_timestamp', 'shipley.vocab', 'shipley.abs', 'irb',
       'itp', 'ocb', 'inter.deviance', 'org.deviance', 'extraversion',
       'agreeableness', 'conscientiousness', 'neuroticism', 'openness',
       'pos.affect', 'neg.affect', 'stai.trait', 'audit', 'gats.status',
       'gats.quantity', 'gats.quantity.sub', 'ipaq', 'psqi',
       'mpfi24_01', 'mpfi24_02', 'mpfi24_03', 'mpfi24_04', 'mpfi24_05',
       'mpfi24_06', 'mpfi24_07', 'mpfi24_08', 'mpfi24_09', 'mpfi24_10',
       'mpfi24_11', 'mpfi24_12', 'mpfi24_13', 'mpfi24_14', 'mpfi24_15',
       'mpfi24_16', 'mpfi24_17', 'mpfi24_18', 'mpfi24_19', 'mpfi24_20',
       'mpfi24_21', 'mpfi24_22', 'mpfi24_23', 'mpfi24_24',
       'General_Health', 'Physical_Functioning', 'Limits_Physical',
       'Emotional_Wellbeing', 'Limits_Emotional', 'Social_Functioning',
       'Pain', 'energy', 'fatigue', 'LifeSatisfaction', 'Stress', 'WAAQ',
       'Flexibility', 'Inflexibility', 'Acceptance', 'Awareness',
       'Self_as_Context', 'Defusion', 'Values', 'Action', 'Avoidance',
       'LackofAwareness', 'Self_as_Content', 'Fusion', 'LackofValues',
       'Inaction', 'Engagement', 'Engage_Vigor', 'Engage_Dedication',
       'Engage_Absorbtion', 'PsyCap', 'Psycap_Hope', 'Psycap_Efficacy',
       'Psycap_Reslilience', 'Psycap_Optimism', 'challengestressors',
       'Hindrancestressors', 'poststudy_survey_timestamp_post',
       'General_Health_post', 'Physical_Functioning_post',
       'Limits_Physical_post', 'Emotional_Wellbeing_post',
       'Limits_Emotional_post', 'Social_Functioning_post', 'Pain_post',
       'energy_post', 'fatigue_post', 'LifeSatisfaction_post',
       'Stress_post', 'WAAQ_post', 'Flexibility_post',
       'Inflexibility_post', 'Acceptance_post', 'Awareness_post',
       'Self_as_Context_post', 'Defusion_post', 'Values_post',
       'Action_post', 'Avoidance_post', 'LackofAwareness_post',
       'Self_as_Content_post', 'Fusion_post', 'LackofValues_post',
       'Inaction_post', 'Engagement_post', 'Engage_Vigor_post',
       'Engage_Dedication_post', 'Engage_Absorbtion_post', 'PsyCap_post',
       'Psycap_Hope_post', 'Psycap_Efficacy_post',
       'Psycap_Reslilience_post', 'Psycap_Optimism_post',
       'challengestressors_post', 'Hindrancestressors_post', 'PsyFlexTot',
       'PsyFlexSDTot', 'Context_Neg_Tot', 'Context_Pos_Tot',
       'Context_All_Tot']]

In [100]:
# Tally the number of rows recorded for each survey type, most frequent first
final_data_complete.loc[:, 'survey_type'].value_counts()

psych_flex       10134
health            5525
job               4832
engage_psycap     4050
personality        797
Name: survey_type, dtype: int64

In [101]:
# Print the frame's (rows, columns) dimensions, then preview its first five rows
print((len(final_data_complete.index), len(final_data_complete.columns)))
final_data_complete.head(n=5)

(25338, 384)


Unnamed: 0,MitreID,ParticipantID,ID,PrimaryUnit,SmartPhone,Sex,Shift,Wave,GenInst,gender,age,bornUS,country,lang,englyrs,educ,jobstat,occup,occup_TEXT,supervise,quantsup,size,duration,income,record_id,race,ethnic,relationship,pregnant,children,housing,household___1,household___2,household___3,household___4,household___5,household___6,household___7,currentposition,position_other,certifications,nurseyears,shift,hours,overtime,commute_type,commute_time,extrajob,extrahours,student,demographics_complete,time_to_complete_demogs,survey_id,survey_type,date,time,Timesent,completed,start_delay,time_to_complete,location_num,wave_study_date,wave_study_day,survey_count,day_of_week,working,at_work,engage_location,engage_activity,engage_3,engage_4,engage_5,engage_6,engage_7,engage_8,engage_9,engage_10,engage_11,engage_12,engage_13,engage_14,engage_15,engage_16,engage_17,engage_18,engage_19,engage_20,engage_21,engage_22,engage_23,engage_24,engage_25,engage_26,engage_27,engage_28,engage_29,engage_mgt,psycap_mgt,support_mgt,challenge_mgt,hindrance_mgt,activity,activity_num,pf_03,pf_04,pf_05,pf_06,pf_07,pf_08,pf_09,pf_10,pf_11,pf_12,pf_13,pf_14,pf_15,pf_mgt,exp_0,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_neg,exp_pos,exp_neut,context1,context2,context2_10_TEXT,context2_TEXT,context3,context3_7_TEXT,context3_TEXT,context4,context4_3_TEXT,context4_TEXT,pand1,pand2,pand3,pand4,pand5,pand6,pand7,pand8,pand9,pand10,pand_pand1,pand_pand2,pand_pand3,pand_pand4,pand_pand5,pand_pand6,pand_pand7,pand_pand8,pand_pand9,pand_pand10,anxiety,stress,work,irbd1,irbd2,irbd3,irbd4,irbd5,irbd6,irbd7,irbd_irbd1,irbd_irbd2,irbd_irbd3,irbd_irbd4,irbd_irbd5,irbd_irbd6,irbd_irbd7,itpd1,itpd2,itpd3,itpd_itpd1,itpd_itpd2,itpd_itpd3,dalal1,dalal2,dalal3,dalal4,dalal5,dalal6,dalal7,dalal8,dalal9,dalal10,dalal11,dalal12,dalal13,dalal14,dalal15,dalal16,dalal_dalal1,dalal_dalal2,dalal_dalal3,dalal_dalal4,dalal_dalal5,dalal_dalal6,dalal_dalal7,dalal_dalal8,dalal_dalal9,dalal
_dalal10,dalal_dalal11,dalal_dalal12,dalal_dalal13,dalal_dalal14,dalal_dalal15,dalal_dalal16,alc1,alc2_1,alc2_2,alc2_3,tob1,tob2_1,tob2_2,tob2_3,tob2_4,tob2_5,tob2_6,tob2_7,ex1_1,ex2_1,sleep_1,bfid1,bfid2,bfid3,bfid4,bfid5,bfid6,bfid7,bfid8,bfid9,bfid10,bfid_bfid1,bfid_bfid2,bfid_bfid3,bfid_bfid4,bfid_bfid5,bfid_bfid6,bfid_bfid7,bfid_bfid8,bfid_bfid9,bfid_bfid10,demographics_timestamp,rand_36_item_sf_health_survey_instrument_version_1_timestamp,satisfaction_with_life_scale_swls_timestamp,perceived_stress_scale_pss_timestamp,mpfi24_timestamp,shipley.vocab,shipley.abs,irb,itp,ocb,inter.deviance,org.deviance,extraversion,agreeableness,conscientiousness,neuroticism,openness,pos.affect,neg.affect,stai.trait,audit,gats.status,gats.quantity,gats.quantity.sub,ipaq,psqi,mpfi24_01,mpfi24_02,mpfi24_03,mpfi24_04,mpfi24_05,mpfi24_06,mpfi24_07,mpfi24_08,mpfi24_09,mpfi24_10,mpfi24_11,mpfi24_12,mpfi24_13,mpfi24_14,mpfi24_15,mpfi24_16,mpfi24_17,mpfi24_18,mpfi24_19,mpfi24_20,mpfi24_21,mpfi24_22,mpfi24_23,mpfi24_24,General_Health,Physical_Functioning,Limits_Physical,Emotional_Wellbeing,Limits_Emotional,Social_Functioning,Pain,energy,fatigue,LifeSatisfaction,Stress,WAAQ,Flexibility,Inflexibility,Acceptance,Awareness,Self_as_Context,Defusion,Values,Action,Avoidance,LackofAwareness,Self_as_Content,Fusion,LackofValues,Inaction,Engagement,Engage_Vigor,Engage_Dedication,Engage_Absorbtion,PsyCap,Psycap_Hope,Psycap_Efficacy,Psycap_Reslilience,Psycap_Optimism,challengestressors,Hindrancestressors,poststudy_survey_timestamp_post,General_Health_post,Physical_Functioning_post,Limits_Physical_post,Emotional_Wellbeing_post,Limits_Emotional_post,Social_Functioning_post,Pain_post,energy_post,fatigue_post,LifeSatisfaction_post,Stress_post,WAAQ_post,Flexibility_post,Inflexibility_post,Acceptance_post,Awareness_post,Self_as_Context_post,Defusion_post,Values_post,Action_post,Avoidance_post,LackofAwareness_post,Self_as_Content_post,Fusion_post,LackofValues_post,Inaction_post,Engagement_post,Engage_Vigor_p
ost,Engage_Dedication_post,Engage_Absorbtion_post,PsyCap_post,Psycap_Hope_post,Psycap_Efficacy_post,Psycap_Reslilience_post,Psycap_Optimism_post,challengestressors_post,Hindrancestressors_post,PsyFlexTot,PsyFlexSDTot,Context_Neg_Tot,Context_Pos_Tot,Context_All_Tot
0,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,1204.0,36ce098b-6905-4258-858a-8bd8a5fba129,engage_psycap,2018-03-07,22:24:24,22:24:24,1.0,212.0,229.0,0.0,2018-03-07,2,0.0,Wednesday,0.0,0.0,0.0,7.0,6.0,3.0,3.0,6.0,7.0,6.0,6.0,5.0,6.0,5.0,5.0,5.0,6.0,6.0,7.0,6.0,4.0,5.0,6.0,6.0,5.0,5.0,6.0,2.0,2.0,1.0,2.0,4.0,5.833333,5.25,5.5,1.75,,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-21 12:42:29,2018-02-21 12:49:43,2018-02-21 12:51:05,2018-02-21 12:55:30,2018-02-21 13:02:33,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2
1,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,1204.0,b2559df1-6379-49f7-ba40-e3f286fbb0c1,engage_psycap,2018-03-09,00:40:35,00:40:35,1.0,106.0,2189.0,0.0,2018-03-09,4,1.0,Friday,0.0,0.0,0.0,7.0,6.0,3.0,5.0,6.0,6.0,6.0,6.0,6.0,5.0,6.0,6.0,5.0,6.0,6.0,7.0,6.0,5.0,5.0,6.0,6.0,5.0,4.0,6.0,4.0,2.0,5.0,2.0,4.666667,5.916667,5.5,5.25,3.25,,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-21 12:42:29,2018-02-21 12:49:43,2018-02-21 12:51:05,2018-02-21 12:55:30,2018-02-21 13:02:33,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2
2,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,1204.0,0465420b-495c-41ba-8a31-6195431d33cf,engage_psycap,2018-03-12,22:08:17,22:08:17,0.0,,,,2018-03-12,7,2.0,Monday,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-21 12:42:29,2018-02-21 12:49:43,2018-02-21 12:51:05,2018-02-21 12:55:30,2018-02-21 13:02:33,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2
3,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,1204.0,554c06ab-7f7e-4f34-9457-cf2ca5c43f69,engage_psycap,2018-03-15,23:32:46,23:32:46,0.0,,,,2018-03-15,10,3.0,Thursday,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-21 12:42:29,2018-02-21 12:49:43,2018-02-21 12:51:05,2018-02-21 12:55:30,2018-02-21 13:02:33,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2
4,SD1001,e3e5e4aa-5950-4f1f-915c-c67598965b03,SD1001,7 South ICU,Android,Male,Day shift,1,1,1,53,2,Guatemala,2,52,4,2,11,,2,,4,10,7,7,7,1,2,0,0,1,1,1,1,0,0,0,0,7,,7,30,1,36,0,2,3,0,0,9,2,1204.0,1d9c54d4-ddb2-4777-b5d3-687d19ce697a,engage_psycap,2018-03-19,19:34:09,19:34:09,0.0,,,,2018-03-19,14,4.0,Monday,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-02-21 12:42:29,2018-02-21 12:49:43,2018-02-21 12:51:05,2018-02-21 12:55:30,2018-02-21 13:02:33,31,9,47,5.0,55,18,22,2.75,3.166667,3.166667,2.916667,3.5,24,19,38,4.0,never,0.0,,23454.0,6.0,4,4,4,4,4,4,4,5,4,3,4,4,3,4,2,2,3,3,3,2,1,1,2,2,80,100,100,55,100,75,100,50,60,4.4,1.75,5.57142857142857,4,2.33333333333333,4,4,4,4.5,3.5,4,3.5,2,3,2.5,1,2,3.11111111111111,2.66666666666667,3.33333333333333,3.33333333333333,4.5,4.75,3.33333333333333,5,5,3.5,3.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0.0543928293220421,1.66666666666667,1.5,2.2


In [102]:
# Export the combined frame as CSV (path is relative to the working directory).
# The row index is written as the first, unnamed column (the to_csv default,
# spelled out here); read it back with index_col=0 to avoid an 'Unnamed: 0' column.
final_data_complete.to_csv(path_or_buf='final_data_complete.csv', index=True)

In [103]:
# Also persist the frame as a pickle file next to the CSV export
final_data_complete.to_pickle(path='final_data_complete.pkl')