# Model LSTM (Incomplete)


## Imports

In [None]:
import pandas as pd
import posixpath
from data_mining_project import data, PROJECT_PATH, DATA_PATH, OUTPUT_PATH
import numpy as np
import matplotlib as plt
import plotly.express as px
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Load Data

In [None]:
# Build the path to the preprocessed CSV inside OUTPUT_PATH and load it.
file_name = "preprocessed_data.csv"
file_path = posixpath.join(OUTPUT_PATH, file_name)
data_df = data.load_data_csv(file_path)

# Presumably these columns hold string-encoded sequences that the helper
# parses back into sequences of the given element type (going by the
# helper's name) -- confirm against data.reformat_str_to_list.
int_sequence_cols = [
    "events_sequence",
    "seconds_to_incident_sequence",
    "dj_ac_state_sequence",
    "dj_dc_state_sequence",
    "ac_dc_prob_timestamp",
]
float_sequence_cols = ["train_kph_sequence"]
data_df = data.reformat_str_to_list(data_df, cols=int_sequence_cols, col_type=int)
data_df = data.reformat_str_to_list(data_df, cols=float_sequence_cols, col_type=float)

# These two columns are not used anywhere below in this notebook.
data_df.drop(columns=["ac_dc_prob_num", "ac_dc_prob"], inplace=True)
data_df

## Add timestamp_diff column

In [None]:
def compute_timestamp_diff(x):
    """Return the element-wise difference between consecutive entries of *x*.

    Args:
        x: One-dimensional sequence of numbers (NumPy array, list or tuple).

    Returns:
        A float64 NumPy array with the same length as *x*. Entry 0 is always
        0.0 (there is no predecessor); entry ``i`` is ``x[i] - x[i-1]``.
        Empty input yields an empty array.
    """
    # Accept any array-like so callers are not forced to pre-convert lists.
    x = np.asarray(x)
    diffs = np.zeros(x.shape[0])
    # np.diff does the subtraction in native code instead of a Python loop;
    # it yields len(x) - 1 values, which fill every slot after the first.
    diffs[1:] = np.diff(x)
    return diffs
    
# Compute the per-step deltas for every row, store them as the second column,
# then remove the column they were derived from.
timestamp_diff = data_df["seconds_to_incident_sequence"].apply(compute_timestamp_diff)
data_df.insert(loc=1, column="timestamp_diff", value=timestamp_diff)
data_df.drop(columns=["seconds_to_incident_sequence"], inplace=True)
data_df

In [None]:
# Remove the auxiliary sequence columns (including the timestamp_diff column
# added above) so they are not part of the array built in the next cell.
unused_feature_cols = [
    "timestamp_diff",
    "train_kph_sequence",
    "dj_ac_state_sequence",
    "dj_dc_state_sequence",
    "ac_dc_prob_timestamp",
]
data_df.drop(columns=unused_feature_cols, inplace=True)

In [None]:
# Convert the remaining DataFrame into a NumPy array. Later cells treat the
# last column as the label (Y) and every other column as features (X).
XY = data_df.to_numpy()
XY.shape

## Padding Data

In [None]:
# Features are every column except the last one (the label).
X = XY[:, :-1]

# Longest sequence in the data set, measured on the first feature column.
# NOTE(review): assumes all feature columns of a row share the same length --
# confirm if more than one sequence column is kept.
max_len = max(row[0].shape[0] for row in X)

n_rows, n_cols = X.shape[0], X.shape[1]
new_X = np.empty((n_rows, n_cols, max_len), dtype=object)

# Pad each row's sequences at the end ("post") so they all have max_len steps.
for i, row in enumerate(X):
    new_X[i] = pad_sequences(row, maxlen=max_len, padding="post")

X = new_X
X.shape

## Converting event types into a consecutive integer encoding

In [None]:
# Map every distinct value found in X to its rank among the sorted unique
# values, giving a dense 0..N-1 vocabulary.
# NOTE(review): the padding value added by pad_sequences only keeps code 0 if
# it is the smallest value in X; the Embedding layer used later masks code 0,
# so confirm no real event ends up below the padding value.
unique_events = np.unique(X)
event_type_mapping = {event: code for code, event in enumerate(unique_events)}
encode_event = np.vectorize(event_type_mapping.get)
X = encode_event(X)

# Vocabulary size after encoding.
print(np.unique(X).shape[0])

## Concatenating columns

In [None]:
# Flatten the (sample, column, step) array into one long sequence per sample
# by laying the columns end to end.
n_samples, n_columns, n_steps = X.shape
X = X.reshape((n_samples, n_columns * n_steps))
X.shape

In [None]:
# Display the flattened, integer-encoded feature matrix.
X

## Converting labels to one-hot encoding
### First, map each incident type to a consecutive integer (the one-hot step itself is currently commented out)

In [None]:
# Labels live in the last column of XY.
Y = XY[:, -1]

# Assign each distinct incident type its rank among the sorted unique labels.
incident_types = np.unique(Y)
incident_type_mapping = dict(zip(incident_types, range(incident_types.shape[0])))
Y = np.array([incident_type_mapping[label] for label in Y])

# One-hot conversion is left disabled: labels stay as integer class indices.
#Y = np.eye(np.unique(Y).shape[0])[Y]
Y.shape

## Train model

### Split data into sequences of size S (overlapping when step < S)

In [None]:
def split_seq(X, S, step):
    """Cut every row of a 2-D array into windows of length ``S``.

    Args:
        X: Two-dimensional NumPy array of shape (rows, sequence length).
        S: Number of elements in each window.
        step: Distance between the start positions of consecutive windows.

    Returns:
        NumPy array holding, for each row of ``X``, the stacked windows
        ``row[j:j + S]`` for ``j = 0, step, 2*step, ...`` while a full window
        still fits.
    """
    print(X.shape)
    # Start offsets are the same for every row, so compute them once.
    starts = range(0, X.shape[1] - S + 1, step)
    windows_per_row = [
        np.array([row[start:start + S] for start in starts])
        for row in X
    ]
    return np.array(windows_per_row)

# Window length; it is also passed as the step, so windows do not overlap.
S = 16

# Training split: the first 75% of the rows, cast to float64 and windowed.
n_train_rows = int(X.shape[0] * .75)
X_tr = split_seq(X[:n_train_rows, :].astype("float64"), S=S, step=S)

# Matching 75% slice of the labels.
n_train_labels = int(Y.shape[0] * .75)
Y_tr = Y[:n_train_labels].astype("float64")

print(X_tr.shape)
print(Y_tr.shape)

In [None]:
def train_model(X_tr, Y_tr, num_classes, voc_size):
    """Build, compile and fit an Embedding -> LSTM -> softmax classifier.

    Args:
        X_tr: Training inputs made of integer token ids. Id 0 is masked by
            the embedding layer (``mask_zero=True``).
        Y_tr: One integer class index per training sample, as required by the
            sparse categorical cross-entropy loss.
        num_classes: Number of output classes (width of the softmax layer).
        voc_size: Vocabulary size passed to the embedding as ``input_dim``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=voc_size, output_dim=128, mask_zero=True))
    model.add(LSTM(units=128, activation="relu", return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation="softmax"))
    optimizer = Adam(learning_rate=0.001)
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Stop early once training accuracy has not improved for 5 epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=5)
    # The keyword accepted by fit() is `callbacks` (plural); the previous
    # `callback=` spelling made fit() raise a TypeError before training began.
    model.fit(X_tr, Y_tr, epochs=20, batch_size=32,
              verbose=1, callbacks=[early_stopping])

    return model
# NOTE(review): this replaces the windowed X_tr produced by split_seq earlier
# in the notebook with a plain slice of the first 75% of X, and `:-1` also
# drops the last column of X -- confirm both are intended.
X_tr = X[:int(X.shape[0]*0.75), :-1].astype("float64")
# Vocabulary size: number of distinct encoded values in the whole of X.
voc_size = np.unique(X).shape[0]
# Number of distinct label values in Y.
num_classes = np.unique(Y).shape[0]
model = train_model(X_tr, Y_tr, num_classes=num_classes, voc_size=voc_size)

## Test model

In [None]:
def test_model(X_ts, model):
    """Run ``model.predict`` on ``X_ts`` and return its output unchanged.

    Args:
        X_ts: Inputs to score.
        model: Object exposing a ``predict`` method (the trained model).

    Returns:
        Whatever ``model.predict(X_ts)`` returns.
    """
    return model.predict(X_ts)