<a href="https://colab.research.google.com/github/elliot-brooks/nlu-coursework/blob/main/src/AV_LSTM_TRAIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip install -U transformers
!pip install -U accelerate



In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertModel
import nltk
import re

# Load training data

In [3]:
# Read the training pairs from train.csv in the current working directory.
# Later cells read the "text_1", "text_2" and "label" columns of this frame.
training_corpus = pd.read_csv("train.csv", encoding='utf-8')

# Pre-process training data

In [4]:
# Case fold to lower-case
def preprocess(string):
  """Return the input coerced to ``str`` and converted to lower case.

  Non-string values are passed through ``str()`` first, so for example
  ``None`` becomes ``"none"`` and ``42`` becomes ``"42"``.
  """
  return str(string).lower()

# Prepare data for Distilled Bert by concatenating pairs with [SEP] token
def prepare_data(data) :
  """Build one "text_1 [SEP] text_2" string per row of ``data``.

  Both text columns are normalised with ``preprocess`` before being joined.
  The input frame is NOT modified: the normalised columns are kept in local
  variables, so re-running the cell always starts from the same raw frame.

  Args:
    data: DataFrame with "text_1" and "text_2" columns.

  Returns:
    A list of str, one concatenated pair per row, in row order
    (an empty list for an empty frame).
  """
  first_texts = data["text_1"].map(preprocess)
  second_texts = data["text_2"].map(preprocess)
  # Iterate the two columns directly instead of materialising a Series per row
  # with iterrows().
  return [first + " [SEP] " + second for first, second in zip(first_texts, second_texts)]

# One "text_1 [SEP] text_2" string per training row; passed to the embedding step further down.
concat_data = prepare_data(training_corpus)

# Create BERT embeddings

In [5]:
# Load the pretrained 'distilbert-base-uncased' tokeniser and TensorFlow DistilBERT model.
# Both are passed to create_bert_embeddings_batch to turn each text pair into an embedding.
tokeniser = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g.

In [6]:
# Token length every text pair is padded/truncated to by the tokeniser (passed as max_length).
SEQ_LENGTH = 256
# Number of text pairs encoded per DistilBERT call when building embeddings.
BATCH_SIZE = 32
def create_bert_embeddings_batch(texts, tokeniser, model, batch_size, seq_length) :
  """Encode ``texts`` batch by batch and keep the position-0 hidden state of each text.

  Args:
    texts: sliceable sequence of strings (must support ``len`` and ``[a:b]``).
    tokeniser: object exposing ``batch_encode_plus``.
    model: callable taking the input ids plus an ``attention_mask`` keyword and
      returning an object with a ``last_hidden_state`` attribute.
    batch_size: number of texts sent through the model per call.
    seq_length: ``max_length`` every text is padded/truncated to.

  Returns:
    A list with one entry per batch, each being ``last_hidden_state[:, 0, :]``
    for that batch (the first-token vector of every text in the batch).
  """
  def embed_chunk(chunk):
    # Tokenise to fixed-length inputs, requesting TensorFlow tensors.
    encoded = tokeniser.batch_encode_plus(
        chunk,
        add_special_tokens=True,
        max_length=seq_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    hidden_states = model(encoded['input_ids'], attention_mask=encoded['attention_mask']).last_hidden_state
    # Keep only the first position of every sequence.
    return hidden_states[:, 0, :]

  batch_starts = range(0, len(texts), batch_size)
  return [embed_chunk(texts[start:start + batch_size]) for start in batch_starts]

# List of per-batch embedding tensors (position-0 hidden states) covering every training pair.
bert_embeddings = create_bert_embeddings_batch(concat_data, tokeniser, bert_model, BATCH_SIZE, SEQ_LENGTH)

# Define classification model

In [24]:
# Hyper-parameters for the classifier
LSTM_UNITS = 128
DROPOUT_RATE = 0.2
LEARNING_RATE = 5e-5
# Two stacked bidirectional LSTMs followed by dense layers and a single sigmoid output unit.
# The Input shape (768, 1) presents each 768-value embedding as 768 timesteps of one feature.
# NOTE(review): the training cell feeds arrays of shape (N, 768); this appears to rely on
# Keras adding the trailing axis automatically — confirm.
model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(768, 1)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=LSTM_UNITS, activation='tanh', return_sequences=True)),
        tf.keras.layers.Dropout(DROPOUT_RATE),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=LSTM_UNITS//2, return_sequences=False)),
        tf.keras.layers.Dropout(DROPOUT_RATE),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
])


# Compile with Adam and binary cross-entropy (tracking accuracy), then print the layer summary
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_20 (Bidirect  (None, 768, 256)          133120    
 ional)                                                          
                                                                 
 dropout_39 (Dropout)        (None, 768, 256)          0         
                                                                 
 bidirectional_21 (Bidirect  (None, 128)               164352    
 ional)                                                          
                                                                 
 dropout_40 (Dropout)        (None, 128)               0         
                                                                 
 dense_30 (Dense)            (None, 64)                8256      
                                                                 
 dense_31 (Dense)            (None, 32)              

# Train Model

In [26]:
# Stack the per-batch embedding tensors along axis 0 into a single array of examples.
train_inputs = np.concatenate(bert_embeddings, axis=0)
# Labels come from the "label" column of the training frame, in row order.
train_labels = np.array(training_corpus['label'])

# Sanity check: both shapes should share the same leading dimension.
print(train_inputs.shape)
print(train_labels.shape)
# Fit on the full training set; no validation data is passed in this call.
model.fit(train_inputs, train_labels, epochs=10, batch_size=64)

(30000, 768)
(30000,)
Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

# Save Model

In [None]:
# Save the classifier under "AV_LSTM_MODEL", relative to the current working directory.
# NOTE(review): no file extension is given; whether this path form is accepted depends on
# the installed Keras version — confirm.
model.save("AV_LSTM_MODEL")

In [None]:
# Zip model file
# NOTE(review): this uses absolute /content paths while model.save("AV_LSTM_MODEL") uses a
# relative path, so it assumes the working directory is /content — confirm.
!zip -r /content/LSTM_MODEL.zip /content/AV_LSTM_MODEL