In [1]:
import re
import numpy as np
import pandas as pd
from string import punctuation
from os import listdir
from nltk.corpus import stopwords
from pickle import dump, load
from keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from keras.layers import Attention, Lambda, Bidirectional
from keras.layers import GRU, Dense, Dropout, Flatten, Input, Embedding, concatenate, Bidirectional, Attention , Lambda 
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.callbacks import EarlyStopping
from keras.models import Model, load_model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from keras import backend as K
from imblearn.under_sampling import RandomUnderSampler

## Develop LSTM Model 

In [2]:
# load a clean dataset
def load_dataset(filename):
	"""Load and return the object stored in a pickle file.

	Args:
		filename: Path of the pickle file to read.

	Returns:
		Whatever object was pickled into ``filename``.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# SECURITY: unpickling can execute arbitrary code; only load files
	# that come from a trusted source.
	# The context manager closes the handle even when unpickling fails.
	with open(filename, 'rb') as handle:
		return load(handle)

In [3]:
# fit a tokenizer
def create_tokenizer(lines):
	"""Return a new ``Tokenizer`` that has been fitted on ``lines``.

	Args:
		lines: The texts passed to ``Tokenizer.fit_on_texts``.

	Returns:
		The fitted ``Tokenizer`` instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [4]:
# calculate the maximum document length
def max_length(lines):
	"""Return the largest whitespace-separated token count among ``lines``.

	Args:
		lines: Iterable of strings.

	Returns:
		The token count of the longest string.

	Raises:
		ValueError: If ``lines`` is empty.
	"""
	token_counts = (len(text.split()) for text in lines)
	return max(token_counts)

In [5]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
	"""Convert texts to integer sequences and pad them to ``length``.

	Args:
		tokenizer: Object providing ``texts_to_sequences`` (a fitted tokenizer).
		lines: The texts to encode.
		length: Value passed as ``maxlen`` to ``pad_sequences``.

	Returns:
		The result of ``pad_sequences`` called with ``padding='post'``.
	"""
	return pad_sequences(
		tokenizer.texts_to_sequences(lines),
		maxlen=length,
		padding='post',
	)

In [6]:
def define_model(length, vocab_size, embedding_dim=100, lstm_units=100,
                 dropout_rate=0.3, num_classes=3, plot_file='LSTMModel.png'):
    """Build, compile, summarize and plot the LSTM + attention classifier.

    Args:
        length: Number of tokens per (padded) input sequence.
        vocab_size: Size of the embedding vocabulary.
        embedding_dim: Width of the embedding vectors.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout rate applied to the LSTM output.
        num_classes: Number of units in the softmax output layer.
        plot_file: Path the architecture diagram is written to.

    Returns:
        A compiled ``Model`` with two outputs: the softmax ``main_output``
        and the pooled attention tensor. Only ``main_output`` has a loss.
    """
    # Input and embedding
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)

    # Sequence encoder; return_sequences=True keeps one vector per timestep
    lstm1 = LSTM(lstm_units, return_sequences=True)(embedding)
    drop = Dropout(dropout_rate)(lstm1)

    # Self-attention: the dropped-out LSTM sequence is both query and value.
    atten = Attention()([drop, drop])
    # NOTE(review): Attention() is called without return_attention_scores, so
    # by the documented Keras default `atten` should be the attended output,
    # not the score matrix. The tensor below is therefore its mean over
    # axis 1 rather than true attention weights. Confirm if real weights are
    # needed downstream.
    atten_pooled = Lambda(lambda x: K.mean(x, axis=1))(atten)

    # Flatten the (dropped-out) LSTM output
    flat = Flatten()(drop)

    # Merge the pooled attention tensor with the flattened sequence
    merged = concatenate([flat, atten_pooled])

    # Dense head
    dense1 = Dense(10, activation='relu')(merged)
    dense2 = Dense(50, activation='relu')(dense1)
    main_output = Dense(num_classes, activation='softmax', name='main_output')(dense2)

    # The pooled attention tensor is exposed as a second output.
    model = Model(inputs=inputs, outputs=[main_output, atten_pooled])

    # Only 'main_output' is given a loss.
    model.compile(loss={'main_output': 'sparse_categorical_crossentropy'}, optimizer='Adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    plot_model(model, show_shapes=True, to_file=plot_file)

    return model

In [7]:
 # load training dataset
trainLines, trainLabels = load_dataset('../Dataset/train.pkl')

# Pad length: token count of the longest training document.
length = max_length(trainLines)

# Fit the vocabulary on the training texts.
tokenizer = create_tokenizer(trainLines)
# One more than the number of known words
# (presumably to leave index 0 free for padding; confirm).
vocab_size = len(tokenizer.word_index) + 1

print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

# Integer-encode and pad every training document to `length`.
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

Max document length: 3158
Vocabulary size: 11931
(13542, 3158)


In [8]:

# define model
model = define_model(length, vocab_size)

# Stop once the monitored accuracy has not improved for 3 epochs, and roll the
# model back to its best epoch so that the weights kept (and saved later) are
# not the ones from the non-improving epochs.
# NOTE(review): the monitored metric is computed on the training data; no
# validation data is passed to fit(). Consider monitoring a validation metric.
early_stopping = EarlyStopping(monitor='main_output_accuracy', mode='max',
                               patience=3, restore_best_weights=True)

# Train the model with early stopping; keep the History for later inspection.
history = model.fit(trainX, np.array(trainLabels), epochs=15, batch_size=32,
                    callbacks=[early_stopping])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 3158)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 3158, 100)    1193100     ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 3158, 100)    80400       ['embedding[0][0]']              
                                                                                                  
 dropout (Dropout)              (None, 3158, 100)    0           ['lstm[0][0]']                   
                                                                                              

<keras.callbacks.History at 0x1dc3182d190>

In [9]:
import os

# Define the file path
file_path = '../Models/lstm.h5'

# Make sure the target directory exists, so a missing folder does not make
# the save fail after training has already completed.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Remove any previous copy so the file on disk is always a fresh save
if os.path.isfile(file_path):
    os.remove(file_path)

# Save the model
model.save(file_path)