# Packages

In [432]:
#pip install keras
#pip install keras.utils
#pip install tensorflow

In [66]:
import numpy as np
import pandas as pd

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Masking
from keras.layers import Dropout
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

import pickle
import itertools

np.set_printoptions(suppress=True)

# Raw data

In [41]:
seqFile = 'sequences.pkl'
labelFile = 'labels.pkl'
timeFile = 'times.pkl'

sequences = np.array(pickle.load(open(seqFile, 'rb')), dtype='object')
labels = np.array(pickle.load(open(labelFile, 'rb')), dtype='float32')
times = np.array(pickle.load(open(timeFile, 'rb'), encoding='latin1'), dtype='object')

# Convert raw data to pandas DataFrame

In [8]:
seq_idx = np.arange(len(sequences))
seq_idx = np.concatenate([list(itertools.repeat(x, len(y))) for x,y in zip(seq_idx, sequences)])

In [9]:
d = {'id': seq_idx, 
     'values': np.concatenate(sequences), 
     'times': np.concatenate(times)}

d = pd.DataFrame(d)

d.head()

Unnamed: 0,id,values,times
0,0,79,50
1,0,44,57
2,0,89,77
3,0,24,124
4,0,36,199


# Extract sequences and time duration feature from DF

In [12]:
sequences = np.array(d.groupby(['id'], sort=False).values.apply(list).tolist(), dtype=object)
times = np.array(d.groupby(['id'], sort=False).times.apply(list).tolist(), dtype=object)

# Pad sequences

In [20]:
seqs_padded = sequence.pad_sequences(sequences)
seqs_padded[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 79, 44,
       89, 24, 36, 14, 10, 10,  5, 59, 66, 91, 62, 11, 60, 54, 22, 61, 41,
       51, 52, 21, 12,  1,  7, 33, 30, 79, 78, 40, 48, 98, 56, 39, 85,  8,
       84, 51, 23, 13, 22, 26, 37, 66, 47, 21, 72, 67, 16,  3, 22, 87, 88,
        6, 56, 85,  8,  8, 76,  7, 47, 23, 14, 84, 95, 22, 78, 46, 48, 16,
       22, 34], dtype=int32)

# Pad time duration feature

In [18]:
times_padded = sequence.pad_sequences(times)
times_padded = np.array(times_padded)
times_padded[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   50,
         57,   77,  124,  199,  209,  262,  361,  457,  473,  561,  633,
        642,  651,  663,  759,  840,  925,  955,  958, 1019, 1069, 1137,
       1148, 1208, 1255, 1292, 1309, 1407, 1445, 1495, 1593, 1681, 1779,
       1818, 1823, 1920, 1995, 2078, 2114, 2159, 2178, 2241, 2286, 2348,
       2436, 2479, 2491, 2514, 2525, 2579, 2656, 2726, 2814, 2903, 2964,
       3061, 3089, 3103, 3175, 3245, 3248, 3305, 3315, 3318, 3347, 3396,
       3493, 3585, 3666, 3747, 3790], dtype=int32)

# One-hot encode sequences

In [21]:
seqs_encoded = to_categorical(seqs_padded)
seqs_encoded[0]

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
seqs_encoded.shape
# n sequences = 100   len(sequences)
# max length = 104    np.max([np.max(i) for i in sequences])
# unique ints = 100   len(np.unique(np.concatenate(sequences)))

(100, 104, 100)

# Append time duration feature to encoded sequences

In [54]:
def append_time(ohe, time):
    out = [np.append(i, j) for i,j in zip(ohe, time)]
    return(out)

seqs_with_time = np.array([append_time(seqs_encoded[i], times_padded[i]) for i in range(len(seqs_encoded))])

In [55]:
seqs_with_time[0]

array([[   1.,    0.,    0., ...,    0.,    0.,    0.],
       [   1.,    0.,    0., ...,    0.,    0.,    0.],
       [   1.,    0.,    0., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,    0., 3666., 3666.],
       [   0.,    0.,    0., ...,    0., 3747., 3747.],
       [   0.,    0.,    0., ...,    0., 3790., 3790.]])

# Create LSTM

In [56]:
samples, timesteps, features = seqs_with_time.shape

In [63]:
model = Sequential()
model.add(Masking(mask_value = 0., input_shape=(timesteps, features)))
model.add(LSTM(100, return_sequences=True, dropout=0.2))
model.add(layers.Flatten())
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_4 (Masking)         (None, 104, 102)          0         
                                                                 
 lstm_4 (LSTM)               (None, 104, 100)          81200     
                                                                 
 flatten_1 (Flatten)         (None, 10400)             0         
                                                                 
Total params: 81,200
Trainable params: 81,200
Non-trainable params: 0
_________________________________________________________________


In [64]:
model2 = Sequential()
model2.add(Dense(1, input_shape=(1,), activation='sigmoid'))

In [67]:
merged = Concatenate([model, model2])

In [57]:
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 104, 102)          0         
                                                                 
 dropout_2 (Dropout)         (None, 104, 102)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               81200     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 81,301
Trainable params: 81,301
Non-trainable params: 0
_________________________________________________________________


# Fit, evaluate, and predict

In [68]:
model.fit(seqs_with_time, labels, epochs=3, batch_size=10)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [59]:
model.evaluate(seqs_with_time, labels, verbose=0)

[0.2918994426727295, 0.9200000166893005]

In [60]:
model.predict(seqs_with_time)[0:5]

array([[0.05783281],
       [0.05624515],
       [0.06183383],
       [0.05807522],
       [0.05601501]], dtype=float32)

In [None]:
seqs = np.array(d.groupby(['id'], sort=False).values.apply(list).tolist())
ts = np.array(d.groupby(['id'], sort=False).times.apply(list).tolist())

  seqs = np.array(d.groupby(['id'], sort=False).values.apply(list).tolist())
  ts = np.array(d.groupby(['id'], sort=False).times.apply(list).tolist())


In [None]:
ts[0]

[50,
 57,
 77,
 124,
 199,
 209,
 262,
 361,
 457,
 473,
 561,
 633,
 642,
 651,
 663,
 759,
 840,
 925,
 955,
 958,
 1019,
 1069,
 1137,
 1148,
 1208,
 1255,
 1292,
 1309,
 1407,
 1445,
 1495,
 1593,
 1681,
 1779,
 1818,
 1823,
 1920,
 1995,
 2078,
 2114,
 2159,
 2178,
 2241,
 2286,
 2348,
 2436,
 2479,
 2491,
 2514,
 2525,
 2579,
 2656,
 2726,
 2814,
 2903,
 2964,
 3061,
 3089,
 3103,
 3175,
 3245,
 3248,
 3305,
 3315,
 3318,
 3347,
 3396,
 3493,
 3585,
 3666,
 3747,
 3790]