In [197]:
# Feb. 2021
# Use the depresjon dataset to forecast health monitor activity based on existing health monitor activity
# depresjon dataset link: https://datasets.simula.no/depresjon/
# Goal: Predict whether someone is depressed based on their health monitor activity using a Random Forest
# TODO later: experiment with different class balancing techniques to understand them better

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
from datetime import datetime
import random
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
import keras
from sklearn.metrics import mean_squared_error

import tensorflow as tf

In [2]:
os.getcwd()

'/Users/Claire/code/github.com/claire.y.yang/predicting-depression/nb'

### Data Preprocessing

In [3]:
condition_directory = "../data/condition"
control_directory = "../data/control/"

condition_dict = {}
control_dict = {}

for filename in os.listdir(condition_directory):
    if filename.endswith(".csv"): 
        condition_file_path = os.path.join(condition_directory, filename)
        df_condition = pd.read_csv(condition_file_path)
        subj = filename.split(".")[0]
        condition_dict[subj] = df_condition
        continue
    else:
        continue
        
for filename in os.listdir(control_directory):
    if filename.endswith(".csv"): 
        control_file_path = os.path.join(control_directory, filename)
        df_control = pd.read_csv(control_file_path)
        subj = filename.split(".")[0]
        control_dict[subj] = df_control
        continue
    else:
        continue
        

In [4]:
# First, create a column in each df denoting the Y, M, d, H, M for the condition data
for cond in condition_dict:
    cond_df = condition_dict[cond]
    years = []
    months = []
    days = []
    hours = []
    minutes = []
    for idx, row in cond_df.iterrows():
        timestamp = row['timestamp']
        parsed_timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        years.append(parsed_timestamp.year)
        months.append(parsed_timestamp.month)
        days.append(parsed_timestamp.day)
        hours.append(parsed_timestamp.hour)
        minutes.append(parsed_timestamp.minute)
        
    cond_df['Year'] = years
    cond_df['Month'] = months
    cond_df['Day'] = days
    cond_df['Hour'] = hours
    cond_df['Minute'] = minutes


In [5]:
# Get rid of days in which we do not have the full 24 hours of data for the condition data

new_condition_dict = {}
for cond in condition_dict:
    cond_df = condition_dict[cond]
    cur_ymd = (9999, 1, 1)
    days_to_be_removed = []
    
    for idx, row in cond_df.iterrows():
        ymd = (row['Year'], row['Month'], row['Day'])
        if cur_ymd != ymd:
            # Check if the hour and minute are equal to 0, 0
            if row['Hour'] != 0:
                # Then we remove all rows containing data of this day
                days_to_be_removed.append(ymd)
            cur_ymd = ymd
        
        # If it's the last row, and the hour and minute do not equal 23, 59, then remove the whole day
        if idx == len(cond_df) - 1:
            if row['Hour'] != 23 and row['Minute'] != 59:
                days_to_be_removed.append(ymd)
        
#     print("Beforehand Month, Day list:", set(cond_df['Month']), set(cond_df['Day']))
    # Now actually remove the day in cond_df
    for day in days_to_be_removed:
        cond_df = cond_df[cond_df['Day'] != day[2]]
        
    new_condition_dict[cond] = cond_df
        
#     print("Afterward Month, Day list:", set(cond_df['Month']), set(cond_df['Day']))

In [8]:
# First, create a column in each df denoting the Y, M, d, H, M for the control data
for cond in control_dict:
    control_df = control_dict[cond]
    years = []
    months = []
    days = []
    hours = []
    minutes = []
    for idx, row in control_df.iterrows():
        timestamp = row['timestamp']
        parsed_timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        years.append(parsed_timestamp.year)
        months.append(parsed_timestamp.month)
        days.append(parsed_timestamp.day)
        hours.append(parsed_timestamp.hour)
        minutes.append(parsed_timestamp.minute)
        
    control_df['Year'] = years
    control_df['Month'] = months
    control_df['Day'] = days
    control_df['Hour'] = hours
    control_df['Minute'] = minutes


In [9]:
# Get rid of days in which we do not have the full 24 hours of data for the control data

new_control_dict = {}
for cond in control_dict:
    control_df = control_dict[cond]
    cur_ymd = (9999, 1, 1)
    days_to_be_removed = []
    
    for idx, row in control_df.iterrows():
        ymd = (row['Year'], row['Month'], row['Day'])
        if cur_ymd != ymd:
            # Check if the hour and minute are equal to 0, 0
            if row['Hour'] != 0:
                # Then we remove all rows containing data of this day
                days_to_be_removed.append(ymd)
            cur_ymd = ymd
        
        # If it's the last row, and the hour and minute do not equal 23, 59, then remove the whole day
        if idx == len(control_df) - 1:
            if row['Hour'] != 23 and row['Minute'] != 59:
                days_to_be_removed.append(ymd)

    # Now actually remove the day in cond_df
    for day in days_to_be_removed:
        control_df = control_df[control_df['Day'] != day[2]]
        
    new_control_dict[cond] = control_df

In [56]:
# Downsample so that it's for each half hour for each value
# Then, create a dictionary of vectors for each condition
# we will then use this to create the train, test sets

summed_condition_dict = {}

for cond in new_condition_dict:
    new_cond_df = new_condition_dict[cond]
    new_cond_df['timestamp'] = pd.to_datetime(new_cond_df['timestamp'], errors='coerce')
    summed_cond_dict = dict(new_cond_df.set_index('timestamp').resample("30T").sum()['activity'])
    
    list_vectors = []
    cur_day = 0
    cur_day_vector = []
    
    for time in summed_cond_dict:
        day = time.day
        activity_30_min = summed_cond_dict[time]
        if day != cur_day:
            if cur_day != 0:
                list_vectors.append(cur_day_vector)
            # It's a new day, so just reset + append to a new cur_day_vector
            cur_day_vector = [activity_30_min]
            cur_day = day
        else:
            cur_day_vector.append(activity_30_min)
            
    list_vectors.append(cur_day_vector) # append the last cur_day_vector
#     print(len(list_vectors))
#     print(len(list_vectors[1]))
    
    summed_condition_dict[cond] = list_vectors
    
# list_vectors should be 48 x n dimensional

In [57]:
# Downsample so that it's for each half hour for each value
# Then, create a dictionary of vectors for each condition
# we will then use this to create the train, test sets

summed_control_dict = {}
summed_final_control_dict = {}

for cond in new_control_dict:
    new_control_df = new_control_dict[cond]
    new_control_df['timestamp'] = pd.to_datetime(new_control_df['timestamp'], errors='coerce')
    summed_control_dict = dict(new_control_df.set_index('timestamp').resample("30T").sum()['activity'])
    
    list_vectors = []
    cur_day = 0
    cur_day_vector = []
    
    for time in summed_control_dict:
        day = time.day
        activity_30_min = summed_control_dict[time]
        if day != cur_day:
            if cur_day != 0:
                list_vectors.append(cur_day_vector)
            # It's a new day, so just reset + append to a new cur_day_vector
            cur_day_vector = [activity_30_min]
            cur_day = day
        else:
            cur_day_vector.append(activity_30_min)
            
    list_vectors.append(cur_day_vector) # append the last cur_day_vector
    print(len(list_vectors))
    print(len(list_vectors[1]))
    
    summed_final_control_dict[cond] = list_vectors
    
# list_vectors should be 48 x n dimensional

32
48
14
48
14
48
21
48
35
48
21
48
21
48
35
48
45
48
16
48
14
48
14
48
35
48
14
48
18
48
21
48
13
48
22
48
35
48
35
48
19
48
16
48
14
48
16
48
16
48
14
48
16
48
14
48
19
48
23
48
14
48
19
48


In [133]:
# Now, create a function that will split the dictionary into different vectors, with input look_back
# We also truncate all of the data points so that we only have 14 different days worth of data for each subject

def split_input(dictionary, group_type):
    X_y_list = []
    
    for subj in dictionary:
        list_vectors = dictionary[subj]
        list_vectors = list_vectors[:14]
        if group_type == "condition":
            encode = 0
        else:
            encode = 1
        X_y_list.append((list_vectors, encode))
    return X_y_list
            
condition_data_list = split_input(summed_condition_dict, "condition")
control_data_list = split_input(summed_final_control_dict, "control")

In [134]:
# for i in range(0, len(control_data_list)):
#     print(len(control_data_list[i][0]))
    
# the first index goes through the different samples
# the second index controls whether it's the list of vectors or the group_type

In [211]:
# Now, we should do a random split of the sample, so that it's 75% training, 25% testing

def create_random_split(data_list):
    data = [i for i in range(0, len(data_list))]
    random.shuffle(data)

    train_split = int(len(data_list) * 0.75)

    train_indices = data[:train_split]
    testing_indices = data[train_split:]

    train_X = []
    train_y = []
    test_X = []
    test_y = []

    for i in range(0, len(data_list)):
        if i in train_indices:
            train_X.append(data_list[i][0])
            train_y.append(data_list[i][1])
        elif i in testing_indices:
            test_X.append(data_list[i][0])
            test_y.append(data_list[i][1])
    return train_X, train_y, test_X, test_y

control_train_X, control_train_y, control_test_X, control_test_y = create_random_split(control_data_list)
condition_train_X, condition_train_y, condition_test_X, condition_test_y = create_random_split(condition_data_list)

In [226]:
def convert_to_np_array(train_X, train_y, test_X, test_y):
    train_X = np.array(train_X)
    train_y = np.array(train_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)
    
    return train_X, train_y, test_X, test_y

control_train_X, control_train_y, control_test_X, control_test_y = convert_to_np_array(control_train_X, control_train_y, control_test_X, control_test_y)
condition_train_X, condition_train_y, condition_test_X, condition_test_y = convert_to_np_array(condition_train_X, condition_train_y, condition_test_X, condition_test_y)

In [227]:
train_X = []
train_y = []

test_X = []
test_y = []

for i in range(0, len(control_train_X)):
    train_X.append(control_train_X[i])
    train_y.append(control_train_y[i])
for j in range(0, len(condition_train_X)):
    train_X.append(condition_train_X[j])
    train_y.append(control_train_y[j])
    
for i in range(0, len(control_test_X)):
    test_X.append(control_test_X[i])
    test_y.append(control_test_y[i])
for j in range(0, len(condition_test_X)):
    test_X.append(condition_test_X[j])
    test_y.append(control_test_y[j])
    

In [228]:
train_X, train_y, test_X, test_y = convert_to_np_array(train_X, train_y, test_X, test_y)


In [229]:
train_X

array([list([[140, 184, 178, 148, 923, 111, 92, 0, 190, 137, 2218, 3889, 24040, 18221, 25530, 11151, 7851, 11456, 10603, 11456, 15104, 14867, 14735, 10198, 18045, 5319, 13312, 14408, 11587, 12485, 11107, 14832, 9157, 12425, 16530, 14381, 3838, 7688, 26354, 14381, 29957, 22732, 6235, 10380, 17414, 7438, 783, 0], [246, 96, 502, 0, 414, 58, 5, 446, 91, 181, 0, 3978, 22466, 23012, 15474, 9610, 13181, 15594, 16249, 15047, 11869, 20437, 23687, 25810, 18678, 10667, 14819, 22484, 24317, 19340, 20256, 19225, 11206, 9215, 9646, 13174, 29944, 7721, 8591, 6245, 10806, 7424, 15027, 2519, 1899, 2256, 7891, 10913], [381, 32, 22, 20, 155, 15, 149, 65, 24, 524, 692, 854, 15, 76, 484, 6298, 9916, 29411, 17006, 28322, 33571, 21300, 12450, 19916, 17963, 11659, 6079, 14997, 11452, 7810, 9086, 5171, 8990, 7128, 9365, 13733, 13115, 4631, 5774, 21242, 7839, 8214, 2883, 4059, 7733, 8564, 0, 8], [9, 1161, 333, 609, 79, 821, 1631, 32, 99, 215, 453, 1354, 236, 267, 14, 880, 365, 6183, 17849, 21781, 17249, 33778, 

### Create model

In [230]:
# Use Keras to create a deep learning model
# TODO: try out different layers to understand it better and compare to using decision tree ensemble in scikit
model = Sequential()
# the input dim is 14x48 for each sample
model.add(keras.Input(shape=(1,14,48)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
print(model.summary())

# Use Keras
# I referenced this: https://stackabuse.com/solving-sequence-problems-with-lstm-in-keras-part-2/
# This is a many-to-many sequence problem with multiple features
model = Sequential()
# encoder layer
model.add(LSTM(16, activation='relu', input_shape=(14, 48), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print(model.summary())

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_40 (Dense)             (None, 1, 14, 32)         1568      
_________________________________________________________________
dense_41 (Dense)             (None, 1, 14, 1)          33        
Total params: 1,601
Trainable params: 1,601
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 14, 16)            4160      
_________________________________________________________________
dropout_12 (Dropout)         (None, 14, 16)            0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 32)                6272      
_______________________

In [232]:
train_X_tensor = tf.convert_to_tensor(train_X)
type(train_X_tensor)

TypeError: Cannot convert value <class 'list'> to a TensorFlow DType.

In [198]:
# fit the keras model on the dataset
model.fit(tf.convert_to_tensor(train_X), tf.convert_to_tensor(train_y), epochs=150, batch_size=10)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [None]:
# evaluate the keras model
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))