# Load data

In [1]:
import pandas as pd
import numpy as np
from utils import * 

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the three event-log CSVs (the full log plus the train and test files) in one pass.
data, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/new_data.csv',
        '../input/train.csv',
        '../input/test.csv',
    )
)

In [4]:
# Run every frame through transformDf and rebind the same names to the results.
# NOTE(review): re-running this cell feeds already-transformed frames back into
# transformDf; restart the kernel (or reload the CSVs) before executing it again.
data, train, test = map(transformDf, (data, train, test))

In [5]:
# List the distinct activity labels in the full log; the vocabularies below are built from them.
data['ActivityID'].unique()

array(['1.0', '8.0', '6.0', '3.0', '9.0', '2.0', '4.0', '5.0', '7.0'], dtype=object)

In [6]:
# Peek at the distinct TimeSinceMidnight values (looks like seconds — TODO confirm in transformDf).
data['TimeSinceMidnight'].unique()

array([ 60938.,  60953.,  62152., ...,  77531.,   1811.,   7660.])

In [7]:
# Sanity check: the train and test row counts should add up to the full log's row count.
data.shape, train.shape, test.shape #13710, 9181, 4529

((13710, 7), (9181, 7), (4529, 7))

# Parameters

Parameters for preprocessing

In [8]:
# One group per CaseID over the full log; the global parameters below are derived from it.
groupByCase = data.groupby(['CaseID'])

In [9]:
# define the denominators for normalization: the mean Duration and the mean CumDuration
# of the full log (`data`); the same two values are reused for the train, test and
# full-log tensors and for the time target
divisor = data['Duration'].mean()
divisor2 = data['CumDuration'].mean()

In [10]:
# find the length of the longest case in the full log; this value is the time-step
# dimension of every input tensor and of the model's Input layer
maxlen = findLongestLength(groupByCase)
maxlen

15

In [11]:
# define the number of features per time step: one per distinct activity label plus 5 more
# NOTE(review): the 5 extra channels are presumably the time-based features that
# vectorizeInput fills in — confirm against utils before changing this constant
num_features = len(data['ActivityID'].unique()) + 5
num_features

14

In [12]:
# Input vocabulary: every distinct activity label found in the full log.
unique_chars = data['ActivityID'].unique().tolist()
# Target vocabulary: the same labels followed by the 'EOS' marker.
target_chars = list(unique_chars)
target_chars.append('EOS')

In [13]:
# Build the label -> index lookups for the input and the target vocabulary.
chartoindice, targetchartoindice = (
    char_indice_dict(vocabulary) for vocabulary in (unique_chars, target_chars)
)

In [14]:
# Show the target lookup; 'EOS' should be the last index because it was appended last.
targetchartoindice

{'1.0': 0,
 '2.0': 5,
 '3.0': 3,
 '4.0': 6,
 '5.0': 7,
 '6.0': 2,
 '7.0': 8,
 '8.0': 1,
 '9.0': 4,
 'EOS': 9}

# Train

## Input

In [15]:
# One group per CaseID, restricted to the training split.
train_groupByCase = train.groupby(['CaseID'])

In [16]:
# Extract five parallel sequences from the training cases (kept for inspection only;
# the model input itself is built by vectorizeInput below).
# NOTE(review): presumably the activity prefixes plus four time-feature sequences — see getFeature in utils.
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(train_groupByCase)

In [17]:
# All five sequences must have the same length (one entry per training sample).
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(9181, 9181, 9181, 9181, 9181)

In [18]:
# Encode the training cases as the model input tensor.
# Bind the result to a split-specific name: `X` is rebound to the full-log tensor
# further down, while the save and model cells read `X_train`.
X_train = vectorizeInput(train_groupByCase, maxlen, num_features, chartoindice, divisor, divisor2)
# Keep the short name as an alias so the inspection cell below still works.
X = X_train

In [19]:
# Expect (number of training samples, maxlen, num_features).
X.shape

(9181, 15, 14)

## Output

In [20]:
# Extract five parallel target sequences from the training cases; `next_chars` holds
# the next activity label (or 'EOS') for each sample, as the preview below shows.
# NOTE(review): the four *_t* sequences are presumably the matching time targets — see getOutput in utils.
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(train_groupByCase)

In [21]:
# All five target sequences must have the same length as the inputs above.
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(9181, 9181, 9181, 9181, 9181)

### y_a

In [22]:
# Preview the first ten next-activity targets of the training split.
next_chars[:10]

['8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0']

In [23]:
# One-hot encode the next-activity targets of the training cases.
# Bind the result to a split-specific name: `y_a` is rebound to the full-log targets
# further down, while the save and model cells read `y_a_train`.
y_a_train = one_hot_encode(train_groupByCase, targetchartoindice)
# Keep the short name as an alias so the inspection cells below still work.
y_a = y_a_train

In [24]:
# Expect (number of training samples, len(target_chars)).
y_a.shape

(9181, 10)

In [25]:
# Preview the first ten one-hot rows; compare with next_chars[:10] and targetchartoindice.
y_a[:10]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

### y_t 

In [26]:
# Convert the time targets to a NumPy array.
# NOTE(review): the y_t cell below is computed from the groupby object, not from this array.
next_chars_t = np.asarray(next_chars_t)

In [27]:
# Build the normalized time target for the training cases, scaled with `divisor`.
# Bind the result to a split-specific name: `y_t` is rebound to the full-log targets
# further down, while the save and model cells read `y_t_train`.
y_t_train = nomalize(train_groupByCase, divisor)
# Keep the short name as an alias so the inspection cells below still work.
y_t = y_t_train

In [28]:
# Display the normalized time targets (NumPy abbreviates the long array).
y_t

array([  7.11185218e-05,   8.24970111e-01,   0.00000000e+00, ...,
         7.11185218e-05,   4.12487426e-04,   0.00000000e+00])

In [29]:
# Expect a 1-D array with one value per training sample.
y_t.shape

(9181,)

# Test

In [None]:
# One group per CaseID, restricted to the test split.
test_groupByCase = test.groupby(['CaseID'])

In [None]:
# Keep only the test cases that contain more than one event.
# The qualifying groups are collected first and concatenated once: DataFrame.append
# copied the whole frame on every iteration and no longer exists in pandas >= 2.0.
multi_event_groups = [group for _, group in test_groupByCase if group.shape[0] > 1]
if multi_event_groups:
    df_test = pd.concat(multi_event_groups)
else:
    # pd.concat rejects an empty list, so fall back to an empty frame with the expected columns.
    df_test = pd.DataFrame(columns=['CaseID', 'ActivityID', 'CompleteTimestamp', 'Duration', 'CumDuration', 'TimeSinceMidnight', 'WeekDay'])

In [None]:
# If the row count equals test.shape[0], no test case consists of a single event.
df_test.shape #No case with one activity

In [None]:
# Extract the same five parallel sequences for the test cases.
# This rebinds `sentences` and friends, which previously held the training-split values.
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(test_groupByCase)

In [None]:
# All five sequences must have the same length (one entry per test sample).
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

In [None]:
# Encode the test cases with the same parameters (maxlen, vocabulary, divisors) as the training tensor.
X_test = vectorizeInput(test_groupByCase, maxlen, num_features, chartoindice, divisor, divisor2)
# Build the matching test targets as well: the save cell and the prediction checks
# read `y_a_test` and `y_t_test`, which no other cell assigns. The calls mirror the
# ones used for the training split (same helpers, same lookup and divisor).
y_a_test = one_hot_encode(test_groupByCase, targetchartoindice)
y_t_test = nomalize(test_groupByCase, divisor)

In [None]:
# Expect (number of test samples, maxlen, num_features).
X_test.shape

In [None]:
from itertools import groupby

In [None]:
# Bucket the test sequences by length: sort by len, then collect each run of
# equal-length sequences into its own list (shortest bucket first).
sentences_by_length = sorted(sentences, key=len)
bList = [list(same_length) for _, same_length in groupby(sentences_by_length, key=len)]

In [None]:
# Number of distinct sequence lengths among the test samples.
len(bList)

In [None]:
# The bucket holding the shortest test sequences.
bList[0]

## Input

In [None]:
# Extract the five parallel sequences for the full log (groupByCase covers `data`).
# This rebinds `sentences` and friends, which previously held the test-split values.
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = getFeature(groupByCase)

In [None]:
# All five sequences must have the same length (one entry per sample of the full log).
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

In [None]:
# Encode the full log as an input tensor.
# NOTE: this rebinds `X`, which until now referred to the training-split tensor.
X = vectorizeInput(groupByCase, maxlen, num_features, chartoindice, divisor, divisor2)

In [None]:
# Expect (number of samples in the full log, maxlen, num_features).
X.shape

## Output

In [None]:
# Extract the five parallel target sequences for the full log.
# This rebinds `next_chars` and friends, which previously held the training-split values.
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = getOutput(groupByCase)

In [None]:
# All five target sequences must have the same length as the full-log inputs.
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

### y_a

One-hot encode for y_a

In [None]:
# Preview the first ten next-activity targets of the full log.
next_chars[:10]

In [None]:
# One-hot encode the next-activity targets of the full log.
# NOTE: this rebinds `y_a`, which until now referred to the training-split targets.
y_a = one_hot_encode(groupByCase, targetchartoindice)

In [None]:
# Expect (number of samples in the full log, len(target_chars)).
y_a.shape

In [None]:
# Preview the first ten one-hot rows; compare with next_chars[:10] and targetchartoindice.
y_a[:10]

### y_t

Scale/Normalize data. This can be done by using [sklearn](http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler)

In [None]:
# Convert the full-log time targets to a NumPy array so that .shape is available below.
next_chars_t = np.asarray(next_chars_t)

In [None]:
# Expect a 1-D array with one value per sample of the full log.
next_chars_t.shape

In [None]:
# Build the normalized time target for the full log, scaled with `divisor`.
# The helper is called with the groupby object, exactly like the executed training-split
# call `nomalize(train_groupByCase, divisor)` and like the sibling one_hot_encode call;
# the previous argument (the raw next_chars_t array) did not match that call signature.
# NOTE: this rebinds `y_t`, which until now referred to the training-split targets.
y_t = nomalize(groupByCase, divisor)

In [None]:
# Display the normalized full-log time targets (NumPy abbreviates the long array).
y_t

In [None]:
# Expect a 1-D array with one value per sample of the full log.
y_t.shape

# Save data

In [None]:
import pickle

In [None]:
# Persist the preprocessing parameters as consecutive pickles in a single file.
# Readers must call pickle.load in exactly this order:
# maxlen, num_features, chartoindice, targetchartoindice, divisor, divisor2.
with open('../input/parameters.pkl', 'wb') as f:
    for parameter in (maxlen, num_features, chartoindice, targetchartoindice, divisor, divisor2):
        pickle.dump(parameter, f, protocol=2)

In [None]:
# Persist the tensors as nine consecutive pickles in a single file; readers must call
# pickle.load in exactly this order: full log (X, y_a, y_t), train split, test split.
# NOTE(review): this cell requires X_train, y_a_train, y_t_train, y_a_test and y_t_test
# to exist in the kernel; make sure earlier cells define them before running it.
with open('../input/preprocessed_data.pkl', 'wb') as f:
    # full log
    pickle.dump(X, f, protocol=2)
    pickle.dump(y_a, f, protocol=2)
    pickle.dump(y_t, f, protocol=2)
    # training split
    pickle.dump(X_train, f, protocol=2)
    pickle.dump(y_a_train, f, protocol=2)
    pickle.dump(y_t_train, f, protocol=2)
    # test split
    pickle.dump(X_test, f, protocol=2)
    pickle.dump(y_a_test, f, protocol=2)
    pickle.dump(y_t_test, f, protocol=2)

# Test model

In [None]:
# Point the generic names at the training-split arrays so the model cell below fits on them.
X, y_a, y_t = X_train, y_a_train, y_t_train

In [None]:
from keras.models import Sequential, Model
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers import Input
from keras.utils.data_utils import get_file
from keras.regularizers import WeightRegularizer
from keras.optimizers import Nadam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers.normalization import BatchNormalization
from theano.ifelse import ifelse

In [None]:
# Build, compile and fit the two-headed network: a shared LSTM layer feeds one
# LSTM branch for next-activity classification and one for time regression.
# The keyword names used here (consume_less, init, dropout_W, nb_epoch, input=, output=)
# belong to the Keras version imported above; they must be renamed if Keras is upgraded.
print('Build model...')
# One sample is a (maxlen, num_features) matrix, matching the tensors built by vectorizeInput.
main_input = Input(shape=(maxlen, num_features), name='main_input')
# train a 2-layer LSTM with one shared layer
l1 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=True, dropout_W=0.2)(main_input) # the shared layer; returns the full sequence so the branches can consume it
b1 = BatchNormalization()(l1)
l2_1 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=False, dropout_W=0.2)(b1) # the layer specialized in activity prediction
b2_1 = BatchNormalization()(l2_1)
l2_2 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=False, dropout_W=0.2)(b1) # the layer specialized in time prediction
b2_2 = BatchNormalization()(l2_2)
# Activity head: one softmax unit per target label (activities plus 'EOS').
act_output = Dense(len(target_chars), activation='softmax', init='glorot_uniform', name='act_output')(b2_1)
# Time head: a single linear unit predicting the normalized time target.
time_output = Dense(1, init='glorot_uniform', name='time_output')(b2_2)

# Output order matters: predict() returns [activity probabilities, time prediction].
model = Model(input=[main_input], output=[act_output, time_output])

opt = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)

# Losses are keyed by the output layer names defined above.
model.compile(loss={'act_output':'categorical_crossentropy', 'time_output':'mae'}, optimizer=opt)
# All three callbacks watch val_loss, which is computed on the validation_split of fit().
early_stopping = EarlyStopping(monitor='val_loss', patience=42)
# NOTE(review): presumably 'output_files/models/' has to exist before training starts — verify.
model_checkpoint = ModelCheckpoint('output_files/models/model_{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0)

# X, y_a and y_t are whatever the generic names currently point at (the cell above
# rebinds them to the training split).
# NOTE(review): batch_size is tied to maxlen (the sequence length) — confirm this is intended.
model.fit(X, {'act_output':y_a, 'time_output':y_t}, validation_split=0.2, verbose=2, callbacks=[early_stopping, model_checkpoint, lr_reducer], batch_size=maxlen, nb_epoch=500)

# Test prediction

In [None]:
from keras.models import load_model

In [None]:
# set parameters
# NOTE(review): predict_size is not read by any of the cells that follow.
predict_size = 1

# load model, set this to the model generated by train.py
# The file name follows the checkpoint pattern model_{epoch}-{val_loss}.h5, so it must
# be updated to a checkpoint that actually exists after each training run.
# This rebinds `model`, replacing the in-memory model fitted in the previous section.
#model = load_model('output_files/models/model_168-0.50.h5')
model = load_model('output_files/models/model_31-0.95.h5')

In [None]:
# Run the loaded model on the test tensor; with two outputs, `pred` is a list
# [activity probabilities, time prediction] in the order given to Model(output=...).
pred = model.predict(X_test, verbose=0)

In [None]:
# Activity head: expect (number of test samples, len(target_chars)).
pred[0].shape

In [None]:
# Time head: expect (number of test samples, 1) because the Dense layer has a single unit.
pred[1].shape

In [None]:
# Multiply by the normalization divisor to view the predicted times on the original
# Duration scale (presumably the inverse of what nomalize applies — confirm in utils).
pred[1]*divisor

In [None]:
# Ground-truth time targets for the test split, for comparison with pred[1].
# NOTE(review): these are presumably still normalized, whereas the cell above rescales the prediction.
y_t_test

In [None]:
# Predicted activity probabilities; columns follow the indices in targetchartoindice.
pred[0]

In [None]:
# Ground-truth one-hot activity targets for the test split, for comparison with pred[0].
y_a_test