In [1]:
import pandas
import matplotlib.pyplot as plt
import numpy
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# Seed NumPy's global random generator before any other work is done.
numpy.random.seed(7)

# Read the accelerometer recordings, then show the table size and rows 1-4.
# Per the original note, consecutive samples are separated by about 60 ms.
data = pandas.read_csv("../data/interim/accelData.csv")
print(data.shape)
print(data[1:5])
# Optional quick look at the first samples of the three acceleration axes:
#plt.plot(data[['accelerationX','accelerationY','accelerationZ']][1:100])
#plt.show()

# Clean up the Activity and Social labels.
# Work on an explicit copy: a plain assignment would only alias `data`, so the
# label rewrites below would silently modify the raw frame as well.
cleandata = data.copy()

# Missing Activity labels and the OFF / TEC / TDT codes are all folded into 'Other'.
activity_is_other = (cleandata['Activity'].isnull()
                     | cleandata['Activity'].isin(['OFF', 'TEC', 'TDT']))
cleandata.loc[activity_is_other, 'Activity'] = 'Other'
# Missing Social labels become 'Other' too.
cleandata.loc[cleandata['Social'].isnull(), 'Social'] = 'Other'

# Drop every row that still has a missing value in any column.
cleandata = cleandata[cleandata.notnull().all(axis=1)]

# Hold out two complete sessions as the test set; all other sessions are used
# for training. The session list is written once so both splits cannot drift apart.
held_out_sessions = ['case1-day1-session1-teacher1', 'case2-day3-session1-teacher2']
is_held_out = cleandata['session'].isin(held_out_sessions)
train = cleandata[~is_held_out]
test = cleandata[is_held_out]

# Break each split into three parts: bookkeeping columns (session + timestamp),
# the acceleration features (X) and the Activity target (Y).
id_columns = ['session', 'timestamp']
accel_columns = ['accelerationX', 'accelerationY', 'accelerationZ']

times_train = train[id_columns]
times_test = test[id_columns]

X_train = train[accel_columns].astype(float)
X_test = test[accel_columns].astype(float)

# Activity is the target. The original note says "Social is 8" -- presumably the
# column position of the alternative Social target; TODO confirm.
Y_train = train['Activity']
Y_test = test['Activity']

# One hot encoding of the response variable (using dummy variables)
from keras.utils.np_utils import to_categorical

# Learn the label -> integer mapping from the TRAINING labels only and reuse it
# for the test labels. Fitting a second time on the test labels would assign
# different integers to the same activity whenever the two splits do not contain
# exactly the same set of classes, silently misaligning the one-hot columns.
encoder = LabelEncoder()
encoder.fit(Y_train)
n_classes = len(encoder.classes_)

encoded_Y_train = encoder.transform(Y_train)
# transform() raises ValueError if the test split holds a label never seen in
# training, which is preferable to a silent mismatch.
encoded_Y_test = encoder.transform(Y_test)

# Convert integers to dummy variables (i.e. one hot encoded). The class count is
# passed explicitly so both matrices always have the same number of columns, even
# if the highest-numbered class is absent from one split.
dummy_y_train = to_categorical(encoded_Y_train, n_classes)
dummy_y_test = to_categorical(encoded_Y_test, n_classes)

# Sanity check on matrix dimensions, after dropping null/NaN rows:
# training bookkeeping, training features, test labels, one-hot test labels.
for checked_shape in (times_train.shape, X_train.shape,
                      Y_test.shape, dummy_y_test.shape):
    print(checked_shape)

# Normalize the dataset: rescale every acceleration axis into [0, 1].
# The scaler learns its range from the training features only; the same
# mapping is then applied to both the training and the test features.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# To inspect the effect, print X_train[1:5] before and X_train[1:5,:] after this step.

Using Theano backend.


(505693, 9)
   Unnamed: 0  accelerationX  accelerationY  accelerationZ  timestamp  \
1         621         -0.306          6.704          7.834       60.0   
2          63         -0.383          6.771          6.799      121.0   
3         642         -0.162          6.416          7.010      183.0   
4         655         -0.249          6.445          6.962      241.0   

                        session  timestamp.orig Activity Social  
1  case1-day1-session1-teacher1   1433229445693      NaN    NaN  
2  case1-day1-session1-teacher1   1433229445754      TDT    CLS  
3  case1-day1-session1-teacher1   1433229445816      TDT    CLS  
4  case1-day1-session1-teacher1   1433229445874      TDT    CLS  
(411852, 2)
(411852, 3)
(93841,)
(93841, 5)


In [2]:
# The LSTM input must be 3-D: [samples, time steps, features].
# Each row becomes a sequence of exactly one time step.
n_train, n_features = X_train.shape
trainX = X_train.reshape(n_train, 1, n_features)
testX = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
print(trainX.shape)

# Create the single-layer LSTM classifier (fitting happens in a later cell).
batch_size = 1
# Width of the output layer follows the one-hot label matrix instead of a
# hard-coded 5, so it stays correct if the set of activity classes changes.
n_classes = dummy_y_train.shape[1]

model = Sequential()
# Stateful LSTM: the hidden state is carried from one batch to the next until
# reset_states() is called, so a fixed batch_input_shape is required.
model.add(LSTM(20, batch_input_shape=(batch_size, 1, X_train.shape[1]),
               stateful=True))
# softmax (not sigmoid): each sample has exactly one activity, and categorical
# cross-entropy expects the outputs to form a probability distribution over classes.
model.add(Dense(n_classes, activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print.
model.summary()

from keras.callbacks import ModelCheckpoint

# Checkpoint callback: watch the validation accuracy and write the weights to
# `filepath` whenever a new maximum is reached (best model only).
filepath = "acc.weights--1lstm.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', mode='max',
                             save_best_only=True, verbose=1)
callbacks_list = [checkpoint]

# Store the architecture next to the weights, serialized as JSON.
model_json = model.to_json()
with open("acc.model--1lstm.json", "w") as json_file:
    json_file.write(model_json)

# Number of passes over the training sessions, and the distinct session ids.
nb_epochs = 1
sessions = numpy.unique(times_train['session'])
print(sessions)

(411852, 1, 3)
____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
lstm_1 (LSTM)                      (1, 20)             1920        lstm_input_1[0][0]               
____________________________________________________________________________________________________
dense_1 (Dense)                    (1, 5)              105         lstm_1[0][0]                     
Total params: 2025
____________________________________________________________________________________________________
None
['case1-day1-session2-teacher1' 'case1-day1-session3-teacher1'
 'case1-day1-session4-teacher1' 'case2-day1-session1-teacher2'
 'case2-day1-session2-teacher2' 'case2-day2-session1-teacher2'
 'case2-day2-session2-teacher2' 'case2-day3-session2-teacher2'
 'case2-day4-session1-teacher2' 'case2-day4-session2-teacher2']


In [4]:

# Manually run the epochs, feeding one recording session at a time and
# resetting the LSTM state between sessions.
for i in range(nb_epochs):
    # Single epoch. Remember to not shuffle the data: order within a session matters.
    for session in sessions:
        # Row positions of this session inside the training arrays (computed once).
        session_rows = numpy.where(times_train.session == session)[0]
        sessionX = trainX[session_rows, :, :]
        sessionY = dummy_y_train[session_rows, :]
        print('%s %s %s' % (session, sessionX.shape, sessionY.shape))
        print('%s %s' % (testX.shape, dummy_y_test.shape))
        history = model.fit(sessionX, sessionY, validation_data=(testX,dummy_y_test),
                            nb_epoch=1, batch_size=batch_size, shuffle=False,
                            verbose=1, callbacks=callbacks_list)
        # Each session is a separate recording. Without this reset the stateful
        # LSTM would start the next session from the state left behind by this
        # session's training and by the validation pass over the test data.
        model.reset_states()
    # Start the evaluation from a clean state even if no session was processed.
    model.reset_states()
    # Estimate model performance, and reset states!
    # NOTE(review): the test arrays hold two held-out sessions back to back and
    # are evaluated in one pass, so state still flows across that boundary.
    testScore = model.evaluate(testX, dummy_y_test, batch_size=batch_size,
                                verbose=0)
    model.reset_states()
    print('Test score after epoch of whole dataset:')
    print(testScore)

# Estimate model performance, and reset states!
testScore = model.evaluate(testX, dummy_y_test, batch_size=batch_size,
                            verbose=0)
model.reset_states()
print('Test score:')
print(testScore)

case1-day1-session2-teacher1 (40739, 1, 3) (40739, 5)
(93841, 1, 3) (93841, 5)
Train on 40739 samples, validate on 93841 samples
Epoch 1/1
case1-day1-session3-teacher1 (38226, 1, 3) (38226, 5)
(93841, 1, 3) (93841, 5)
Train on 38226 samples, validate on 93841 samples
Epoch 1/1
case1-day1-session4-teacher1 (35156, 1, 3) (35156, 5)
(93841, 1, 3) (93841, 5)
Train on 35156 samples, validate on 93841 samples
Epoch 1/1
case2-day1-session1-teacher2 (28453, 1, 3) (28453, 5)
(93841, 1, 3) (93841, 5)
Train on 28453 samples, validate on 93841 samples
Epoch 1/1
case2-day1-session2-teacher2 (36861, 1, 3) (36861, 5)
(93841, 1, 3) (93841, 5)
Train on 36861 samples, validate on 93841 samples
Epoch 1/1
case2-day2-session1-teacher2 (44040, 1, 3) (44040, 5)
(93841, 1, 3) (93841, 5)
Train on 44040 samples, validate on 93841 samples
Epoch 1/1
case2-day2-session2-teacher2 (44581, 1, 3) (44581, 5)
(93841, 1, 3) (93841, 5)
Train on 44581 samples, validate on 93841 samples
Epoch 1/1
case2-day3-session2-teacher

# 2-layer LSTM?


In [6]:
# Create the two-layer (stacked) LSTM classifier (fitting happens in a later cell).
batch_size = 1
# Width of the output layer follows the one-hot label matrix instead of a
# hard-coded 5, so it stays correct if the set of activity classes changes.
n_classes = dummy_y_train.shape[1]

model = Sequential()
# Stateful LSTMs: hidden state is carried across batches until reset_states().
# The first layer returns the full sequence so the second LSTM receives 3-D input.
model.add(LSTM(20, batch_input_shape=(batch_size, 1, X_train.shape[1]),
               stateful=True, return_sequences=True))
model.add(LSTM(20, stateful=True))
# softmax (not sigmoid): each sample has exactly one activity, and categorical
# cross-entropy expects the outputs to form a probability distribution over classes.
model.add(Dense(n_classes, activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print.
model.summary()

from keras.callbacks import ModelCheckpoint

# Checkpoint callback: watch the validation accuracy and write the weights to
# `filepath` whenever a new maximum is reached (best model only).
filepath = "acc.weights--2lstm.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', mode='max',
                             save_best_only=True, verbose=1)
callbacks_list = [checkpoint]

# Store the architecture next to the weights, serialized as JSON.
model_json = model.to_json()
with open("acc.model--2lstm.json", "w") as json_file:
    json_file.write(model_json)

# Number of passes over the training sessions, and the distinct session ids.
nb_epochs = 1
sessions = numpy.unique(times_train['session'])

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
lstm_4 (LSTM)                      (1, 1, 20)          1920        lstm_input_3[0][0]               
____________________________________________________________________________________________________
lstm_5 (LSTM)                      (1, 20)             3280        lstm_4[0][0]                     
____________________________________________________________________________________________________
dense_3 (Dense)                    (1, 5)              105         lstm_5[0][0]                     
Total params: 5305
____________________________________________________________________________________________________
None


In [7]:

# Manually run the epochs, feeding one recording session at a time and
# resetting the LSTM state between sessions.
for i in range(nb_epochs):
    # Single epoch. Remember to not shuffle the data: order within a session matters.
    for session in sessions:
        # Row positions of this session inside the training arrays (computed once).
        session_rows = numpy.where(times_train.session == session)[0]
        sessionX = trainX[session_rows, :, :]
        sessionY = dummy_y_train[session_rows, :]
        print('%s %s %s' % (session, sessionX.shape, sessionY.shape))
        print('%s %s' % (testX.shape, dummy_y_test.shape))
        history = model.fit(sessionX, sessionY, validation_data=(testX,dummy_y_test),
                            nb_epoch=1, batch_size=batch_size, shuffle=False,
                            verbose=1, callbacks=callbacks_list)
        # Each session is a separate recording. Without this reset the stateful
        # LSTMs would start the next session from the state left behind by this
        # session's training and by the validation pass over the test data.
        model.reset_states()
    # Start the evaluation from a clean state even if no session was processed.
    model.reset_states()
    # Estimate model performance, and reset states!
    # NOTE(review): the test arrays hold two held-out sessions back to back and
    # are evaluated in one pass, so state still flows across that boundary.
    testScore = model.evaluate(testX, dummy_y_test, batch_size=batch_size,
                                verbose=0)
    model.reset_states()
    print('Test score after epoch of whole dataset:')
    print(testScore)

# Estimate model performance, and reset states!
testScore = model.evaluate(testX, dummy_y_test, batch_size=batch_size,
                            verbose=0)
model.reset_states()
print('Test score:')
print(testScore)

case1-day1-session2-teacher1 (40739, 1, 3) (40739, 5)
(93841, 1, 3) (93841, 5)
Train on 40739 samples, validate on 93841 samples
Epoch 1/1
case1-day1-session3-teacher1 (38226, 1, 3) (38226, 5)
(93841, 1, 3) (93841, 5)
Train on 38226 samples, validate on 93841 samples
Epoch 1/1
 5192/38226 [===>..........................] - ETA: 363s - loss: 6.8705 - acc: 0.5576

KeyboardInterrupt: 

... something may not be working correctly, as all epochs and all LSTM models seem to give the same test score!

# For comparison: a random forest (RF) on the accelerometer dataset

Both on the **raw data**, and on the **10s data** (accel features only)

In [None]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier

# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 100)

# Fit the forest on the raw per-sample acceleration features against the
# Activity labels. These are the X_train / Y_train built in the first cell;
# the string labels are passed to the classifier as-is.
forest = forest.fit(X_train, Y_train)

# Score the fitted trees on the held-out sessions. score() needs both the
# features and the true labels, and returns the mean accuracy.
# TODO: the 10 s windowed feature set is not built in this notebook, so only
# the raw-data baseline is computed here.
print('Accuracy on test data (RAW): %s' % forest.score(X_test, Y_test))