In [140]:

import numpy as np
import pandas as pd

### Data processing


In [141]:
# Load the data; the first CSV column holds the row index.
path = 'dataset_mood_smartphone.csv'
df = pd.read_csv(path, header=0, sep=',', index_col=0)

# Split the timestamp column into a date part and a time part at the first
# space. `n` is passed by keyword and `expand=True` returns one column per
# part: unpacking the `.str` accessor and passing `n` positionally no longer
# work in current pandas. The reindex guarantees both parts exist (as NaN)
# even when no timestamp contains a space.
timestamp_parts = (
    df['time'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
)
date, time = timestamp_parts[0], timestamp_parts[1]

# Replace the combined column by the two new ones, placed right after `id`.
df = df.drop(columns=['time'])
df.insert(1, 'date', date)
df.insert(2, 'time', time)

# Sort on id and date.
df = df.sort_values(by=['id', 'date'])

# Unique patient ids and measured variables (in order of first appearance).
ids = df.id.unique()
inputs = df.variable.unique()


In [142]:
def mergedays(df, mean_variables=('mood', 'circumplex.arousal', 'circumplex.valence')):
    """Aggregate the long-format records to one row per (id, date, variable).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``date``, ``variable`` and ``value``.
        Other columns (e.g. ``time``) are ignored, and the column order does
        not matter.
    mean_variables : iterable of str, optional
        Names of the variables whose daily value is the *mean* of the day's
        records. Every other variable is *summed* over the day.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', 'date', 'variable', 'value']`` with a fresh 0..n-1
        index. Rows follow the order of first appearance in ``df``: patients
        first, then days within a patient, then variables within a day.
        Records whose id, date or variable is missing are skipped.
    """
    columns = ['id', 'date', 'variable', 'value']
    mean_variables = set(mean_variables)

    # Collect plain tuples and build the frame once at the end; growing a
    # DataFrame one row at a time copies it on every insert.
    records = []
    # sort=False keeps the groups in order of first appearance.
    for patientid, patientdata in df.groupby('id', sort=False):
        for day, daydata in patientdata.groupby('date', sort=False):
            # Select the 'value' column by name so the aggregation never
            # touches the string columns and does not depend on column order.
            for var, values in daydata.groupby('variable', sort=False)['value']:
                value = values.mean() if var in mean_variables else values.sum()
                records.append((patientid, day, var, value))

    return pd.DataFrame(records, columns=columns)

In [143]:
# Status report on the raw data (run BEFORE the NA rows are removed).
print("STATISTICS")
print("The amount of unique ids are: ", len(ids), "\n", ids)
print("\nThe variables used are:\n", inputs)

# Known data-quality findings, reported as fixed text.
print("\nErrors are within:  circumplex.valence 156 NA & circumplex.arousal 46 NA")
print("The count and amount of data inputs do not match")

# Number of records for each patient.
print("\nHow many data points per id:")
for patient_id in ids:
    patient_rows = df.loc[df['id'] == patient_id]
    print(patient_id, " amount of data inputs: ", len(patient_rows))

# Number of records and summary statistics for each variable.
print("\nHow often are the variables used:")
for variable_name in inputs:
    variable_rows = df.loc[df['variable'] == variable_name]
    print(variable_name, "\nAmount of data inputs: ", len(variable_rows))
    print(variable_rows.describe(), "\n")
    
    


STATISTICS
The amount of unique ids are:  27 
 ['AS14.01' 'AS14.02' 'AS14.03' 'AS14.05' 'AS14.06' 'AS14.07' 'AS14.08'
 'AS14.09' 'AS14.12' 'AS14.13' 'AS14.14' 'AS14.15' 'AS14.16' 'AS14.17'
 'AS14.19' 'AS14.20' 'AS14.23' 'AS14.24' 'AS14.25' 'AS14.26' 'AS14.27'
 'AS14.28' 'AS14.29' 'AS14.30' 'AS14.31' 'AS14.32' 'AS14.33']

The variables used are:
 ['call' 'sms' 'mood' 'circumplex.arousal' 'circumplex.valence' 'activity'
 'screen' 'appCat.builtin' 'appCat.communication' 'appCat.entertainment'
 'appCat.other' 'appCat.social' 'appCat.unknown' 'appCat.utilities'
 'appCat.finance' 'appCat.office' 'appCat.travel' 'appCat.weather'
 'appCat.game']

Errors are within:  circumplex.valence 156 NA & circumplex.arousal 46 NA
The count and amount of data inputs do not match

How many data points per id:
AS14.01  amount of data inputs:  21999
AS14.02  amount of data inputs:  14581
AS14.03  amount of data inputs:  14425
AS14.05  amount of data inputs:  15745
AS14.06  amount of data inputs:  18092
AS14.0

In [144]:
# Remove every record that has a missing entry in any column
# (dropna defaults: operate on rows, drop when any value is NA).
df = df.dropna()

###Check to see the cleaning result
# print("\nHow often are the variables used:")
# for j in inputs:
#     print(j, "\nAmount of data inputs: ", len(df.loc[df['variable'] == j]))
#     print(df.loc[df['variable'] == j].describe(),"\n")
###


In [145]:
# Aggregate the cleaned records with mergedays and rebind `df` to the result,
# so from here on `df` has the columns id, date, variable, value.
df = mergedays(df)
# Prints the complete merged table.
print(df)

# TODO: correlation analysis
# TODO: plots


            id        date              variable        value
0      AS14.01  2014-02-17                  call     2.000000
1      AS14.01  2014-02-18                  call     1.000000
2      AS14.01  2014-02-19                  call     7.000000
3      AS14.01  2014-02-19                   sms     2.000000
4      AS14.01  2014-02-20                  call     2.000000
5      AS14.01  2014-02-20                   sms     3.000000
6      AS14.01  2014-02-21                   sms     1.000000
7      AS14.01  2014-02-22                  call     2.000000
8      AS14.01  2014-02-22                   sms     1.000000
9      AS14.01  2014-02-25                  call     3.000000
10     AS14.01  2014-02-26                  mood     6.250000
11     AS14.01  2014-02-26    circumplex.arousal    -0.250000
12     AS14.01  2014-02-26    circumplex.valence     0.750000
13     AS14.01  2014-02-26                  call     1.000000
14     AS14.01  2014-02-26                   sms     2.000000
15     A