In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the event log (one row per case event: case id, activity id, timestamp and timing features).
data = pd.read_csv('../input/test.csv')

In [4]:
data.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,3087.0,1.0,2010-12-07 18:32:31,0.0,0.0,66751.0,1.0
1,3087.0,8.0,2010-12-11 00:35:42,280991.0,280991.0,2142.0,5.0
2,3087.0,6.0,2010-12-13 23:50:04,256462.0,537453.0,85804.0,0.0
3,3088.0,1.0,2011-02-24 17:23:53,0.0,0.0,62633.0,3.0
4,3088.0,1.0,2011-02-24 23:34:42,22249.0,22249.0,84882.0,3.0


In [5]:
data.shape

(4529, 7)

In [6]:
# Cast every column except the timestamp to float.
# `astype` converts a whole column in one vectorized step instead of
# calling Python's float() once per element as `apply(float)` does.
for col in data.columns:
    if col != 'CompleteTimestamp':
        data[col] = data[col].astype(float)

In [7]:
# Activity ids are used as categorical labels from here on, so keep them as
# strings (the float values become labels such as '1.0').
data['ActivityID'] = data['ActivityID'].astype(str)

In [8]:
data.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp,Duration,CumDuration,TimeSinceMidnight,WeekDay
0,3087.0,1.0,2010-12-07 18:32:31,0.0,0.0,66751.0,1.0
1,3087.0,8.0,2010-12-11 00:35:42,280991.0,280991.0,2142.0,5.0
2,3087.0,6.0,2010-12-13 23:50:04,256462.0,537453.0,85804.0,0.0
3,3088.0,1.0,2011-02-24 17:23:53,0.0,0.0,62633.0,3.0
4,3088.0,1.0,2011-02-24 23:34:42,22249.0,22249.0,84882.0,3.0


In [9]:
data['ActivityID'].unique()

array(['1.0', '8.0', '6.0', '9.0', '2.0', '3.0', '4.0', '5.0', '7.0'], dtype=object)

# Utils

In [12]:
# Group the events by case; the cells below iterate over these groups to
# build one set of sequences per case.
groupByCase = data.groupby(['CaseID'])

In [13]:
# Number of events in the longest case (never less than 1) ...
maxlen = max([1] + [group.shape[0] for case, group in groupByCase])

# ... plus one extra slot for the end-of-sequence (EOS) marker.
maxlen = maxlen + 1
print(maxlen)

13


In [14]:
# Width of one timestep vector: a one-hot slot per distinct activity plus the
# 5 numeric features written after the one-hot part when X is filled.
num_features = data['ActivityID'].unique().size + 5
num_features

14

In [15]:
# Input vocabulary: the distinct activity labels, in order of first appearance.
unique_chars = data['ActivityID'].unique().tolist()
# Target vocabulary: the same labels followed by the end-of-sequence marker.
target_chars = [*unique_chars, 'EOS']

In [16]:
unique_chars, target_chars

(['1.0', '8.0', '6.0', '9.0', '2.0', '3.0', '4.0', '5.0', '7.0'],
 ['1.0', '8.0', '6.0', '9.0', '2.0', '3.0', '4.0', '5.0', '7.0', 'EOS'])

In [17]:
# Map each input activity label to its one-hot column index.
chartoindice = {char: indice for indice, char in enumerate(unique_chars)}

In [18]:
# Map each target label (activities plus 'EOS') to its one-hot column index.
targetchartoindice = {char: indice for indice, char in enumerate(target_chars)}

In [19]:
chartoindice, targetchartoindice

({'1.0': 0,
  '2.0': 4,
  '3.0': 5,
  '4.0': 6,
  '5.0': 7,
  '6.0': 2,
  '7.0': 8,
  '8.0': 1,
  '9.0': 3},
 {'1.0': 0,
  '2.0': 4,
  '3.0': 5,
  '4.0': 6,
  '5.0': 7,
  '6.0': 2,
  '7.0': 8,
  '8.0': 1,
  '9.0': 3,
  'EOS': 9})

# Input

In [20]:
def getList(df):
    '''Return every non-empty prefix of the values in ``df``.

    ``df`` must provide ``tolist()`` (e.g. a pandas Series). For values
    ``[a, b, c]`` the result is ``[[a], [a, b], [a, b, c]]``; an empty
    input yields an empty list.
    '''
    values = df.tolist()
    return [values[:end] for end in range(1, len(values) + 1)]

In [21]:
# One growing list of prefixes per source column. Entry i of every list
# describes the same prefix of the same case.
sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4 = [], [], [], [], []
prefix_columns = (
    ('ActivityID', sentences),
    ('Duration', sentences_t),
    ('CumDuration', sentences_t2),
    ('TimeSinceMidnight', sentences_t3),
    ('WeekDay', sentences_t4),
)
for case, group in groupByCase:
    for column, collected in prefix_columns:
        # Append all prefixes of this case's column to the matching list.
        collected.extend(getList(group[column]))

In [22]:
len(sentences), len(sentences_t), len(sentences_t2), len(sentences_t3), len(sentences_t4)

(9181, 9181, 9181, 9181, 9181)

In [23]:
# Normalisation constants: mean duration and mean cumulative duration.
divisor = data['Duration'].mean()
divisor2 = data['CumDuration'].mean()

# X[i] is prefix i, left-padded with all-zero timesteps up to maxlen.
# Each timestep vector is: one-hot activity | position | duration |
# cumulative duration | time since midnight | weekday.
X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
n_chars = len(unique_chars)
for i, sentence in enumerate(sentences):
    leftpad = maxlen - len(sentence)
    sentence_t, sentence_t2 = sentences_t[i], sentences_t2[i]
    sentence_t3, sentence_t4 = sentences_t3[i], sentences_t4[i]
    for t, char in enumerate(sentence):
        step = X[i, t + leftpad]  # view into X, so writes land in X itself
        if char in unique_chars:
            step[chartoindice[char]] = 1
        step[n_chars] = t + 1                          # 1-based position in the prefix
        step[n_chars + 1] = sentence_t[t] / divisor    # duration relative to the mean
        step[n_chars + 2] = sentence_t2[t] / divisor2  # cumulative duration relative to the mean
        step[n_chars + 3] = sentence_t3[t] / 86400     # seconds since midnight as a fraction of a day
        step[n_chars + 4] = sentence_t4[t] / 7         # weekday scaled by 7

# Output

In [24]:
def getNextActivity(df):
    '''Return, for each event in ``df``, the activity that follows it.

    The result has the same length as the input: the values shifted left
    by one position, with 'EOS' (end of sentence) as the entry for the
    last event.
    '''
    activities = df.tolist()
    return activities[1:] + ['EOS']

In [25]:
def getNextTime(df):
    '''Return, for each event in ``df``, the time value of the event after it.

    The result has the same length as the input: the values shifted left
    by one position, with 0 as the entry for the last event (which has no
    successor).
    '''
    times = df.tolist()
    return times[1:] + [0]

In [26]:
# Targets aligned with the prefix lists: entry i holds what follows prefix i.
next_chars, next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = [], [], [], [], []
next_time_columns = (
    ('Duration', next_chars_t),
    ('CumDuration', next_chars_t2),
    ('TimeSinceMidnight', next_chars_t3),
    ('WeekDay', next_chars_t4),
)
for case, group in groupByCase:
    # Next activity label (or 'EOS' after the last event of the case).
    next_chars.extend(getNextActivity(group['ActivityID']))
    # Next value of each numeric column (or 0 after the last event of the case).
    for column, collected in next_time_columns:
        collected.extend(getNextTime(group[column]))

In [27]:
len(next_chars), len(next_chars_t), len(next_chars_t2), len(next_chars_t3), len(next_chars_t4)

(9181, 9181, 9181, 9181, 9181)

## y_a

One-hot encode the next-activity targets (`y_a`).

In [28]:
next_chars[:10]

['8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0', '6.0', 'EOS', '8.0']

```python
# Alternative way to build y_a, using scikit-learn's OneHotEncoder.
# NOTE(review): this snippet sits in a markdown fence, so it appears to be kept
# for reference only and is not executed — confirm.

# Map every next-activity label to its integer index.
next_chars_indice = [targetchartoindice[act] for act in next_chars]
# Reshape to a single column so the encoder receives a 2-D array without warning.
next_chars_indice = np.asarray(next_chars_indice).reshape(-1,1)

next_chars_indice[:10]

encoder = OneHotEncoder()
data_feature_one_hot_encoded = encoder.fit_transform(next_chars_indice)

# Convert the encoder result to a dense array.
y_a = data_feature_one_hot_encoded.toarray()
y_a

# NOTE(review): the shape recorded below does not match the (9181, 10) shown for
# the loop-built y_a later in this notebook — presumably from another run; confirm.
#y_a.shape (13710, 9)
```

In [29]:
# One-hot next-activity targets: row i gets a 1 in the column of next_chars[i].
y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
for row in range(len(sentences)):
    label = next_chars[row]
    if label in target_chars:
        y_a[row, targetchartoindice[label]] = 1

In [30]:
y_a.shape

(9181, 10)

In [31]:
y_a

array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]], dtype=float32)

## y_t

Standardize the next-event durations (`y_t`) with scikit-learn's `StandardScaler`; see the [sklearn preprocessing guide](http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler).

In [32]:
# Turn the list of next-event durations into a NumPy array.
next_chars_t = np.asarray(next_chars_t)

In [33]:
# Reshape to a single column (n_samples, 1) before handing it to the scaler.
next_chars_t = next_chars_t.reshape(-1, 1)

In [34]:
# Fit the scaler on the next-event durations built from this file.
# NOTE(review): the input file is named test.csv — if a scaler was already
# fitted on training data, that one should presumably be reused here; confirm.
scaler = StandardScaler().fit(next_chars_t)

In [35]:
# Standardized next-event durations; still a single-column 2-D array at this point.
y_t = scaler.transform(next_chars_t) 

In [39]:
# Flatten the column back to a 1-D vector with one value per sample.
y_t = y_t.reshape(len(next_chars_t))

In [40]:
y_t.shape

(9181,)

# Save data

In [41]:
import pickle

In [43]:
# Write X, y_a and y_t as three consecutive, independent pickle records in one
# file, using the highest available pickle protocol.
with open('../input/test_data.pkl', 'wb') as f:
    for array in (X, y_a, y_t):
        pickle.dump(array, f, protocol=pickle.HIGHEST_PROTOCOL)

# Test

In [36]:
from keras.models import Sequential, Model
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers import Input
from keras.utils.data_utils import get_file
from keras.regularizers import WeightRegularizer
from keras.optimizers import Nadam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers.normalization import BatchNormalization
from theano.ifelse import ifelse

Using Theano backend.


In [37]:
# Build, compile and train a two-output network: a softmax over the next
# activity (including 'EOS') and a single-value regression of the next time.
# NOTE(review): keywords such as `init`, `consume_less`, `dropout_W`, `nb_epoch`
# and `Model(input=..., output=...)` look like the Keras 1.x API — confirm the
# installed Keras version before re-running.
print('Build model...')
# One sample is a left-padded prefix of maxlen timesteps with num_features values each.
main_input = Input(shape=(maxlen, num_features), name='main_input')
# train a 2-layer LSTM with one shared layer
l1 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=True, dropout_W=0.2)(main_input) # the shared layer
b1 = BatchNormalization()(l1)
# Both task-specific LSTMs read the same normalised output (b1) of the shared layer.
l2_1 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=False, dropout_W=0.2)(b1) # the layer specialized in activity prediction
b2_1 = BatchNormalization()(l2_1)
l2_2 = LSTM(100, consume_less='gpu', init='glorot_uniform', return_sequences=False, dropout_W=0.2)(b1) # the layer specialized in time prediction
b2_2 = BatchNormalization()(l2_2)
# Activity head: one softmax unit per target label.
act_output = Dense(len(target_chars), activation='softmax', init='glorot_uniform', name='act_output')(b2_1)
# Time head: a single unit with no activation specified.
time_output = Dense(1, init='glorot_uniform', name='time_output')(b2_2)

model = Model(input=[main_input], output=[act_output, time_output])

# Nadam optimizer with gradient values clipped at 3.
opt = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)

# One loss per named output: cross-entropy for the activity, mean absolute error for the time.
model.compile(loss={'act_output':'categorical_crossentropy', 'time_output':'mae'}, optimizer=opt)
# All three callbacks monitor val_loss: stop after 42 epochs of patience,
# checkpoint only the best model, and halve the learning rate after 10 epochs of patience.
early_stopping = EarlyStopping(monitor='val_loss', patience=42)
# NOTE(review): presumably the output_files/models/ directory must already exist — verify.
model_checkpoint = ModelCheckpoint('output_files/models/model_{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0)

# Train for up to 500 epochs, holding out 20% of the samples for validation;
# the batch size is set to maxlen.
model.fit(X, {'act_output':y_a, 'time_output':y_t}, validation_split=0.2, verbose=2, callbacks=[early_stopping, model_checkpoint, lr_reducer], batch_size=maxlen, nb_epoch=500)

Build model...


  "flatten outdim parameter is deprecated, use ndim instead.")


Train on 10968 samples, validate on 2742 samples
Epoch 1/500


KeyboardInterrupt: 

In [38]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 15, 14)        0                                            
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 15, 100)       46000       main_input[0][0]                 
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (None, 15, 100)       400         lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 100)           80400       batchnormalization_1[0][0]       
___________________________________________________________________________________________