In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read each raw CSV export from the data/ directory into its own DataFrame.
# The unpacking order matches the order of the paths listed below.
users, qs, motion, sound, location = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/USERS.csv',
        'data/QUESTIONNAIRE.csv',
        'data/MOTION.csv',
        'data/SOUND.csv',
        'data/LOCATION.csv',
    )
)

# User ids below this value belong to test users and are dropped from every table.
FIRST_REAL_USER_ID = 30

# Keep only rows belonging to real users. Note that the users table keys on
# 'userID' while every other table keys on 'user'.
users = users[users['userID'] >= FIRST_REAL_USER_ID]
qs = qs[qs['user'] >= FIRST_REAL_USER_ID]
motion = motion[motion['user'] >= FIRST_REAL_USER_ID]
sound = sound[sound['user'] >= FIRST_REAL_USER_ID]
location = location[location['user'] >= FIRST_REAL_USER_ID]

# Keep only questionnaire entries with a valid (strictly positive) Result1 and Result2.
# Result1 and Result2 are the SF36 questionnaire summary values: the Physical
# Component Score and the Mental Component Score. They are summary measures
# computed from the other 36 items at the time the questionnaire is completed
# in the App.
qs = qs.loc[qs['Result1'].gt(0) & qs['Result2'].gt(0)]

# Attach user attributes to each valid questionnaire entry.
# The users key column is first renamed to 'user' so both frames share a join key.
# A left join from qs keeps every questionnaire row, so users without valid
# questionnaire data do not appear in the result.
users = users.rename(columns={'userID': 'user'})
valid_questionnaire = qs.merge(users, on='user', how='left')

### Motion File Processing:

##### Users who have uploaded fewer than 40 hours of movement data (i.e. hours during which the phone was not stationary) will be removed

In [38]:
# Count the non-null f2 values per user. f2 is NaN when there was no activity, and
# count() skips NaN, so this is the number of motion rows with activity per user
# (one row is treated as one hour here -- TODO confirm against the MOTION.csv schema).
NUM_ACTIVE_DAYS_NEEDED = 5
ACTIVE_HOURS_PER_DAY = 8  # presumably the active hours expected per day -- TODO confirm
MIN_ACTIVE_HOURS = ACTIVE_HOURS_PER_DAY * NUM_ACTIVE_DAYS_NEEDED
entryCounts = motion.groupby(by='user')[['f2']].count()

# Select users with at least MIN_ACTIVE_HOURS hours of non-stationary motion data.
# The comparison is inclusive (>=) so that a user with exactly the minimum is kept,
# matching the "at least" wording of the message printed below.
validmotion_Users = entryCounts[entryCounts['f2'] >= MIN_ACTIVE_HOURS].reset_index()
print(f"Motion file has {validmotion_Users.shape[0]} users with at least {MIN_ACTIVE_HOURS} hours of active motion data")

# Left join the valid user ids with the full motion file; in effect this removes
# the rows of non-valid users from the motion dataframe.
validmotion = pd.merge(validmotion_Users['user'], motion, on='user', how='left')

Motion file has 500 users with at least 40 hours of active motion data
