<a href="https://colab.research.google.com/github/elliot-brooks/nlu-coursework/blob/main/src/av_model_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Embedding, Input, concatenate
import numpy as np
import pandas as pd
import nltk
import re

FIRST_PAIR = 0
SECOND_PAIR = 1

# Load training data

In [2]:
training_corpus = pd.read_csv("train.csv", encoding='utf-8')
# pairwise_labels = np.array(training_corpus['label'])
# pairwise_data = np.array(training_corpus[['text_1', 'text_2']])

# Pre-process training data

In [3]:
def preprocess(string):
  output = str(string).lower()
  separated_string = re.sub(r'([^\w\s])', r' \1 ', str(string))
  return output

training_corpus["text_1"] = training_corpus["text_1"].apply(lambda x: preprocess(x))
training_corpus["text_2"] = training_corpus["text_2"].apply(lambda x: preprocess(x))
pairwise_labels = np.array(training_corpus['label'])

In [4]:
def tokenise(data, column_1, column_2, max_sequence_length) :
  first_pairs = training_corpus[column_1].tolist()
  second_pairs = training_corpus[column_2].tolist()

  # init tokeniser
  tk = Tokenizer(oov_token='UNK', lower=True)
  tk.fit_on_texts(first_pairs + second_pairs)

  # tokenise texts
  tokenised_first_pairs = tk.texts_to_sequences(first_pairs)
  tokenised_second_pairs = tk.texts_to_sequences(second_pairs)

  # pad sequences
  tokenised_first_pairs = pad_sequences(tokenised_first_pairs, maxlen=max_sequence_length, padding='pre')
  tokenised_second_pairs = pad_sequences(tokenised_second_pairs, maxlen=max_sequence_length, padding='pre')

  # return vocabulary
  vocab = tk.word_index

  return tokenised_first_pairs, tokenised_second_pairs, vocab

SEQUENCE_SIZE = 150
sequences_1, sequences_2, vocab = tokenise(training_corpus, "text_1", "text_2", SEQUENCE_SIZE)

In [5]:
print(sequences_1[0])
print(sequences_2[0])

[    0     0     0     0     0     0     0     0     0     0     0  1623
   755  2397   385   133     3    30  9015     3   184   665     6    30
  2829   592  1425  7812    67    32  4378    36   216  1218  2366   432
 13494     5    30   670   419  6011 42618    32    95   302   747    18
     4   928   215   216   787   746    11   153    23   187     3    73
    18    97     7  1474    16     8     4  3761   133     8  9281    14
    21   259     6     2  3096    46  3039     5     7    85   542    14
    85    62  2162   670  2421    84    14   182    14    57   266    23
   120     6     4   127     3   123    18    14    95     2    46  1891
    37  2350     7  2549  2506  3267   554    14   220   399   197   315
     7  2506  3267   220    16    46     9  2011     5  1247    63    34
     6   196  1116   199   568     6   849   329    45   985  5654     5
 16969  4728  1524   534     2   751]
[    0     0     0     0     0     0     0     0     0     0     0  1245
  7416  2159 

# Define Language Model

In [6]:
vocab_size = len(vocab) + 1

# Define input layers
left_input = Input(shape=(SEQUENCE_SIZE,), dtype='int32')
right_input = Input(shape=(SEQUENCE_SIZE,), dtype='int32')

EMBEDDING_DIM = 100
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, input_length=SEQUENCE_SIZE)
left_encoding = embedding_layer(left_input)
right_encoding = embedding_layer(right_input)

# Shared LSTM layer
lstm_units = 128
shared_lstm = LSTM(lstm_units)
left_output = shared_lstm(left_encoding)
right_output = shared_lstm(right_encoding)

# Concatenate the outputs
concatenated_output = concatenate([left_output, right_output])

# Dense layer for classification
output = Dense(1, activation='sigmoid')(concatenated_output)

# Define the model
model = Model(inputs=[left_input, right_input], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 150, 100)             1062880   ['input_1[0][0]',             
                                                          0          'input_2[0][0]']             
                                                                                                  
 lstm (LSTM)                 (None, 128)                  117248    ['embedding[0][0]',       

# Train Model

In [7]:
def combine_pairwise_data(sequence_1, sequence_2) :
  tuple_list = []
  for i in range(len(sequence_1)) :
    tuple_list.append((sequence_1[i], sequence_2[i]))
  return np.array(tuple_list)

input_data = combine_pairwise_data(sequences_1, sequences_2)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit([sequences_1, sequences_2], pairwise_labels, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cec56967cd0>

# Save Model

In [8]:
model.save("AV_LSTM_MODEL")

In [9]:
!zip -r /content/LSTM_MODEL.zip /content/AV_LSTM_MODEL

  adding: content/AV_LSTM_MODEL/ (stored 0%)
  adding: content/AV_LSTM_MODEL/keras_metadata.pb (deflated 88%)
  adding: content/AV_LSTM_MODEL/variables/ (stored 0%)
  adding: content/AV_LSTM_MODEL/variables/variables.data-00000-of-00001 (deflated 10%)
  adding: content/AV_LSTM_MODEL/variables/variables.index (deflated 59%)
  adding: content/AV_LSTM_MODEL/fingerprint.pb (stored 0%)
  adding: content/AV_LSTM_MODEL/saved_model.pb (deflated 89%)
  adding: content/AV_LSTM_MODEL/assets/ (stored 0%)
