## Imports

In [None]:
import math
from src.model_evaluation import load_model, evaluate_model, generate_text
from src.model_training import RNNLanguageModel, LSTMLanguageModel
import numpy as np
import os
import pickle
import random
import torch
import torch.nn as nn

## Setup

In [None]:
# Reproducibility: seed every random number generator this notebook imports
# (Python's `random`, NumPy, and PyTorch on CPU and CUDA) with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN configuration: cuDNN is switched off entirely, and the deterministic /
# benchmark flags are additionally pinned to their reproducible settings.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Device-agnostic execution: use the GPU when one is visible, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Data

In [None]:
def _load_pickle(filename):
    """Unpickle and return the object stored in the file `data/<filename>`.

    Note: unpickling can execute arbitrary code, so only files produced by
    this project should be loaded this way.
    """
    path = os.path.normpath(os.path.join("data", filename))
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Word-level tokenisation: numericalised documents and the training vocabulary
word_tokenised_numericalised_docs = _load_pickle("word_tokenisation_reuters_data.pkl")
word_tokenisation_train_vocab = _load_pickle("word_tokenisation_reuters_train_vocab.pkl")

# Subword-level tokenisation: numericalised documents and the training vocabulary
subword_tokenised_numericalised_docs = _load_pickle("subword_tokenisation_reuters_data.pkl")
subword_tokenisation_train_vocab = _load_pickle("subword_tokenisation_reuters_train_vocab.pkl")

## Set Hyperparameters

In [None]:
# Settings shared by every model built and evaluated in this notebook.
# NOTE(review): presumably these must match the values used when the saved
# checkpoints were trained — confirm against the training notebook.
EMBEDDING_SIZE: int = 128  # passed to the models as embed_size
HIDDEN_SIZE: int = 256  # passed to the models as hidden_size
NUM_LAYERS: int = 2  # passed to the models as num_layers
DROPOUT: float = 0.0  # passed to the models as dropout
BATCH_SIZE: int = 32  # passed to evaluate_model as batch_size

## RNN

### Word Tokenisation, Context Length = 16

In [None]:
# Saved-model file for the RNN / word tokenisation / context length 16 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
rnn_word_tokenised_context_16_filename = ""

# Build a fresh RNN sized for the word-level vocabulary, then move it to the target device
rnn = RNNLanguageModel(
    vocab_size=len(word_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=word_tokenisation_train_vocab["<pad>"],
)
rnn = rnn.to(device)

# Hand the initialised model and the filename to load_model
rnn_word_tokenised_context_16 = load_model(
    model_initialised=rnn,
    filename=rnn_word_tokenised_context_16_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
rnn_word_tokenised_context_16_cross_entropy_loss = evaluate_model(
    model=rnn_word_tokenised_context_16,
    converted_tokenised_docs=word_tokenised_numericalised_docs,
    seq_len=16,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=word_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(word_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {rnn_word_tokenised_context_16_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm.
rnn_word_tokenised_context_16_perplexity = math.exp(rnn_word_tokenised_context_16_cross_entropy_loss)
print(f"Perplexity: {rnn_word_tokenised_context_16_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt
starting_sequence = ["Today"]

rnn_word_tokenised_context_16_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=rnn_word_tokenised_context_16,
    train_vocab=word_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(rnn_word_tokenised_context_16_generated_sequence)

### Word Tokenisation, Context Length = 32

In [None]:
# Saved-model file for the RNN / word tokenisation / context length 32 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
rnn_word_tokenised_context_32_filename = ""

# Build a fresh RNN sized for the word-level vocabulary, then move it to the target device
rnn = RNNLanguageModel(
    vocab_size=len(word_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=word_tokenisation_train_vocab["<pad>"],
)
rnn = rnn.to(device)

# Hand the initialised model and the filename to load_model
rnn_word_tokenised_context_32 = load_model(
    model_initialised=rnn,
    filename=rnn_word_tokenised_context_32_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
rnn_word_tokenised_context_32_cross_entropy_loss = evaluate_model(
    model=rnn_word_tokenised_context_32,
    converted_tokenised_docs=word_tokenised_numericalised_docs,
    seq_len=32,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=word_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(word_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {rnn_word_tokenised_context_32_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm.
rnn_word_tokenised_context_32_perplexity = math.exp(rnn_word_tokenised_context_32_cross_entropy_loss)
print(f"Perplexity: {rnn_word_tokenised_context_32_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt
starting_sequence = ["Today"]

rnn_word_tokenised_context_32_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=rnn_word_tokenised_context_32,
    train_vocab=word_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(rnn_word_tokenised_context_32_generated_sequence)

### Subword Tokenisation, Context Length = 16

In [None]:
# Saved-model file for the RNN / subword tokenisation / context length 16 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
rnn_subword_tokenised_context_16_filename = ""

# Build a fresh RNN sized for the subword-level vocabulary, then move it to the target device
rnn = RNNLanguageModel(
    vocab_size=len(subword_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=subword_tokenisation_train_vocab["<pad>"],
)
rnn = rnn.to(device)

# Hand the initialised model and the filename to load_model
rnn_subword_tokenised_context_16 = load_model(
    model_initialised=rnn,
    filename=rnn_subword_tokenised_context_16_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
rnn_subword_tokenised_context_16_cross_entropy_loss = evaluate_model(
    model=rnn_subword_tokenised_context_16,
    converted_tokenised_docs=subword_tokenised_numericalised_docs,
    seq_len=16,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=subword_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(subword_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {rnn_subword_tokenised_context_16_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm. Being per subword token,
# the value is not directly comparable with the word-level perplexities.
rnn_subword_tokenised_context_16_perplexity = math.exp(rnn_subword_tokenised_context_16_cross_entropy_loss)
print(f"Perplexity: {rnn_subword_tokenised_context_16_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt.
# NOTE(review): the prompt is a whole word while this model uses the subword
# vocabulary — verify how generate_text maps "Today" onto subword tokens.
starting_sequence = ["Today"]

rnn_subword_tokenised_context_16_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=rnn_subword_tokenised_context_16,
    train_vocab=subword_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(rnn_subword_tokenised_context_16_generated_sequence)

### Subword Tokenisation, Context Length = 32

In [None]:
# Saved-model file for the RNN / subword tokenisation / context length 32 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
rnn_subword_tokenised_context_32_filename = ""

# Build a fresh RNN sized for the subword-level vocabulary, then move it to the target device
rnn = RNNLanguageModel(
    vocab_size=len(subword_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=subword_tokenisation_train_vocab["<pad>"],
)
rnn = rnn.to(device)

# Hand the initialised model and the filename to load_model
rnn_subword_tokenised_context_32 = load_model(
    model_initialised=rnn,
    filename=rnn_subword_tokenised_context_32_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
rnn_subword_tokenised_context_32_cross_entropy_loss = evaluate_model(
    model=rnn_subword_tokenised_context_32,
    converted_tokenised_docs=subword_tokenised_numericalised_docs,
    seq_len=32,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=subword_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(subword_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {rnn_subword_tokenised_context_32_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm. Being per subword token,
# the value is not directly comparable with the word-level perplexities.
rnn_subword_tokenised_context_32_perplexity = math.exp(rnn_subword_tokenised_context_32_cross_entropy_loss)
print(f"Perplexity: {rnn_subword_tokenised_context_32_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt.
# NOTE(review): the prompt is a whole word while this model uses the subword
# vocabulary — verify how generate_text maps "Today" onto subword tokens.
starting_sequence = ["Today"]

rnn_subword_tokenised_context_32_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=rnn_subword_tokenised_context_32,
    train_vocab=subword_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(rnn_subword_tokenised_context_32_generated_sequence)

## LSTM

### Word Tokenisation, Context Length = 16

In [None]:
# Saved-model file for the LSTM / word tokenisation / context length 16 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
lstm_word_tokenised_context_16_filename = ""

# Build a fresh LSTM sized for the word-level vocabulary, then move it to the target device
lstm = LSTMLanguageModel(
    vocab_size=len(word_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=word_tokenisation_train_vocab["<pad>"],
)
lstm = lstm.to(device)

# Hand the initialised model and the filename to load_model
lstm_word_tokenised_context_16 = load_model(
    model_initialised=lstm,
    filename=lstm_word_tokenised_context_16_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
lstm_word_tokenised_context_16_cross_entropy_loss = evaluate_model(
    model=lstm_word_tokenised_context_16,
    converted_tokenised_docs=word_tokenised_numericalised_docs,
    seq_len=16,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=word_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(word_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {lstm_word_tokenised_context_16_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm.
lstm_word_tokenised_context_16_perplexity = math.exp(lstm_word_tokenised_context_16_cross_entropy_loss)
print(f"Perplexity: {lstm_word_tokenised_context_16_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt
starting_sequence = ["Today"]

lstm_word_tokenised_context_16_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=lstm_word_tokenised_context_16,
    train_vocab=word_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(lstm_word_tokenised_context_16_generated_sequence)

### Word Tokenisation, Context Length = 32

In [None]:
# Saved-model file for the LSTM / word tokenisation / context length 32 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
lstm_word_tokenised_context_32_filename = ""

# Build a fresh LSTM sized for the word-level vocabulary, then move it to the target device
lstm = LSTMLanguageModel(
    vocab_size=len(word_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=word_tokenisation_train_vocab["<pad>"],
)
lstm = lstm.to(device)

# Hand the initialised model and the filename to load_model
lstm_word_tokenised_context_32 = load_model(
    model_initialised=lstm,
    filename=lstm_word_tokenised_context_32_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
lstm_word_tokenised_context_32_cross_entropy_loss = evaluate_model(
    model=lstm_word_tokenised_context_32,
    converted_tokenised_docs=word_tokenised_numericalised_docs,
    seq_len=32,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=word_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(word_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {lstm_word_tokenised_context_32_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm.
lstm_word_tokenised_context_32_perplexity = math.exp(lstm_word_tokenised_context_32_cross_entropy_loss)
print(f"Perplexity: {lstm_word_tokenised_context_32_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt
starting_sequence = ["Today"]

lstm_word_tokenised_context_32_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=lstm_word_tokenised_context_32,
    train_vocab=word_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(lstm_word_tokenised_context_32_generated_sequence)

### Subword Tokenisation, Context Length = 16

In [None]:
# Saved-model file for the LSTM / subword tokenisation / context length 16 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
lstm_subword_tokenised_context_16_filename = ""

# Build a fresh LSTM sized for the subword-level vocabulary, then move it to the target device
lstm = LSTMLanguageModel(
    vocab_size=len(subword_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=subword_tokenisation_train_vocab["<pad>"],
)
lstm = lstm.to(device)

# Hand the initialised model and the filename to load_model
lstm_subword_tokenised_context_16 = load_model(
    model_initialised=lstm,
    filename=lstm_subword_tokenised_context_16_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
lstm_subword_tokenised_context_16_cross_entropy_loss = evaluate_model(
    model=lstm_subword_tokenised_context_16,
    converted_tokenised_docs=subword_tokenised_numericalised_docs,
    seq_len=16,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=subword_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(subword_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {lstm_subword_tokenised_context_16_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm. Being per subword token,
# the value is not directly comparable with the word-level perplexities.
lstm_subword_tokenised_context_16_perplexity = math.exp(lstm_subword_tokenised_context_16_cross_entropy_loss)
print(f"Perplexity: {lstm_subword_tokenised_context_16_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt.
# NOTE(review): the prompt is a whole word while this model uses the subword
# vocabulary — verify how generate_text maps "Today" onto subword tokens.
starting_sequence = ["Today"]

lstm_subword_tokenised_context_16_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=lstm_subword_tokenised_context_16,
    train_vocab=subword_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(lstm_subword_tokenised_context_16_generated_sequence)

### Subword Tokenisation, Context Length = 32

In [None]:
# Saved-model file for the LSTM / subword tokenisation / context length 32 run.
# NOTE(review): left as an empty string here — presumably it has to be set to
# the saved model's filename before this cell can run; confirm against load_model.
lstm_subword_tokenised_context_32_filename = ""

# Build a fresh LSTM sized for the subword-level vocabulary, then move it to the target device
lstm = LSTMLanguageModel(
    vocab_size=len(subword_tokenisation_train_vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=subword_tokenisation_train_vocab["<pad>"],
)
lstm = lstm.to(device)

# Hand the initialised model and the filename to load_model
lstm_subword_tokenised_context_32 = load_model(
    model_initialised=lstm,
    filename=lstm_subword_tokenised_context_32_filename,
    device=device,
)

In [None]:
# Cross Entropy Loss
# Targets equal to the "<pad>" index are excluded from the loss via ignore_index.
# Fix: the `criterion=...` argument was missing its trailing comma, which made
# this whole cell a SyntaxError.
lstm_subword_tokenised_context_32_cross_entropy_loss = evaluate_model(
    model=lstm_subword_tokenised_context_32,
    converted_tokenised_docs=subword_tokenised_numericalised_docs,
    seq_len=32,
    batch_size=BATCH_SIZE,
    criterion=nn.CrossEntropyLoss(ignore_index=subword_tokenisation_train_vocab["<pad>"]),
    device=device,
    vocab_size=len(subword_tokenisation_train_vocab),
)
print(f"Cross Entropy Loss: {lstm_subword_tokenised_context_32_cross_entropy_loss}")

# Perplexity
# Computed as exp(cross-entropy); assumes evaluate_model returns a mean
# per-token loss as a plain number — TODO confirm. Being per subword token,
# the value is not directly comparable with the word-level perplexities.
lstm_subword_tokenised_context_32_perplexity = math.exp(lstm_subword_tokenised_context_32_cross_entropy_loss)
print(f"Perplexity: {lstm_subword_tokenised_context_32_perplexity}")

In [None]:
# Generate text: sample from the model, starting from a one-token prompt.
# NOTE(review): the prompt is a whole word while this model uses the subword
# vocabulary — verify how generate_text maps "Today" onto subword tokens.
starting_sequence = ["Today"]

lstm_subword_tokenised_context_32_generated_sequence = generate_text(
    start_seq=starting_sequence,
    temperature=1.0,
    model=lstm_subword_tokenised_context_32,
    train_vocab=subword_tokenisation_train_vocab,
)

# Show the prompt followed by whatever generate_text returned
print(f"Generated Sequence from {starting_sequence}:")
print(lstm_subword_tokenised_context_32_generated_sequence)