<a href="https://colab.research.google.com/github/elliot-brooks/nlu-coursework/blob/main/src/av_model_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Embedding, Input, concatenate
import numpy as np
import pandas as pd
import nltk
import re

# Load training data

In [17]:
training_corpus = pd.read_csv("train.csv", encoding='utf-8')

# Pre-process training data

In [18]:
'''
Pre-process training data
- Case Folding
- Separate all punctuation with spaces
'''
def preprocess(string):
  output = str(string).lower()
  separated_string = re.sub(r'([^\w\s])', r' \1 ', str(string))
  return output

'''
Tokenise processed data
- OOV token = "UNK"
- Pre-pad each sequence
'''
def tokenise(data, column_1, column_2, max_sequence_length) :
  first_pairs = data[column_1].tolist()
  second_pairs = data[column_2].tolist()

  # init tokeniser
  tk = Tokenizer(oov_token='UNK', lower=True)
  tk.fit_on_texts(first_pairs + second_pairs)

  # tokenise texts
  tokenised_first_pairs = tk.texts_to_sequences(first_pairs)
  tokenised_second_pairs = tk.texts_to_sequences(second_pairs)

  # pad sequences
  tokenised_first_pairs = pad_sequences(tokenised_first_pairs, maxlen=max_sequence_length, padding='pre')
  tokenised_second_pairs = pad_sequences(tokenised_second_pairs, maxlen=max_sequence_length, padding='pre')

  # return vocabulary
  vocab = tk.word_index

  return tokenised_first_pairs, tokenised_second_pairs, vocab

def prepare_data(data, sequence_size) :
  data["text_1"] = data["text_1"].apply(lambda x: preprocess(x))
  data["text_2"] = data["text_2"].apply(lambda x: preprocess(x))
  sequences_1, sequences_2, vocab = tokenise(data, "text_1", "text_2", sequence_size)
  return [sequences_1, sequences_2], vocab

SEQUENCE_SIZE = 150
pairwise_labels = np.array(training_corpus['label'])
input_data, vocab = prepare_data(training_corpus, SEQUENCE_SIZE)

# Define Language Model

In [19]:
vocab_size = len(vocab) + 1

# Define input layers
left_input = Input(shape=(SEQUENCE_SIZE,), dtype='int32')
right_input = Input(shape=(SEQUENCE_SIZE,), dtype='int32')

EMBEDDING_DIM = 100
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, input_length=SEQUENCE_SIZE)
left_encoding = embedding_layer(left_input)
right_encoding = embedding_layer(right_input)

# Create Shared LSTM model
LSTM_UNITS = 128
shared_lstm = LSTM(LSTM_UNITS)
left_output = shared_lstm(left_encoding)
right_output = shared_lstm(right_encoding)

# Concatenate the outputs
concatenated_output = concatenate([left_output, right_output])

# Dense layer for probability distribution
output = Dense(1, activation='sigmoid')(concatenated_output)

# Define the model
model = Model(inputs=[left_input, right_input], outputs=output)

# Summarise Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 150, 100)             1062880   ['input_5[0][0]',             
                                                          0          'input_6[0][0]']             
                                                                                                  
 lstm_2 (LSTM)               (None, 128)                  117248    ['embedding_2[0][0]',   

# Train Model

In [20]:
model.fit(input_data, pairwise_labels, epochs=6, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fdd1c6c1510>

# Save Model

In [21]:
model.save("AV_LSTM_MODEL")

In [22]:
# Zip model file
!zip -r /content/LSTM_MODEL.zip /content/AV_LSTM_MODEL

updating: content/AV_LSTM_MODEL/ (stored 0%)
updating: content/AV_LSTM_MODEL/keras_metadata.pb (deflated 88%)
updating: content/AV_LSTM_MODEL/variables/ (stored 0%)
updating: content/AV_LSTM_MODEL/variables/variables.index (deflated 59%)
updating: content/AV_LSTM_MODEL/variables/variables.data-00000-of-00001 (deflated 10%)
updating: content/AV_LSTM_MODEL/fingerprint.pb (stored 0%)
updating: content/AV_LSTM_MODEL/assets/ (stored 0%)
updating: content/AV_LSTM_MODEL/saved_model.pb (deflated 90%)
