# **TASK 3: Language Model:**

**Project Setup Guide**

1. Unzipping and Uploading the Project

  - Unzip the `Coursework_chatwipa.zip` file.
  - Upload the extracted folder to your Google Drive, maintaining the following structure:

```
drive
|- Coursework_chatwipa
|  |- TASK 1
|  |- TASK 2
|  |- TASK 3
|  |  |- Model
|  |  |  |- task3_gru_nopunc_model.keras
|  |  |  |- task3_lstm_nopunc_model.keras
|  |  |  |- task3_gru_withpunc_model.keras
|  |  |  |- tokenizer_no_punctuation.pkl
|  |  |  |- tokenizer_with_punctuation.pkl
|  |  |- TASK3.ipynb
|  |
```

2. Uploading Additional Data

    - Place the dataset text files (`61262-0.txt`) into the `TASK 3` folder.
    - The final folder structure should be:

```
drive
|- Coursework_chatwipa
|  |- TASK 1
|  |- TASK 2
|  |- TASK 3
|  |  |- Model
|  |  |  |- task3_gru_nopunc_model.keras
|  |  |  |- task3_lstm_nopunc_model.keras
|  |  |  |- task3_gru_withpunc_model.keras
|  |  |  |- tokenizer_no_punctuation.pkl
|  |  |  |- tokenizer_with_punctuation.pkl
|  |  |- TASK3.ipynb
|  |  |- 61262-0.txt
|  |
```

3. Running the Demonstration

  - Navigate to the **Demonstration** section of the notebook to test the process before proceeding further.

Install library

In [None]:
!pip install -q -U keras-tuner
!pip install rouge-score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m92.2/129.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e1b007f0eac9eb10873acf7cf40038875386cc04edd2fdc7ffbe62c8f8384b94
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, LayerNormalization, BatchNormalization
import keras_tuner as kt
import re
from keras.optimizers import Adam, RMSprop, SGD
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
import spacy
from rouge_score import rouge_scorer

## 1. Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load and preprocess the text
with open("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/61262-0.txt", 'r', encoding='utf-8') as file:
    text = file.read().lower()

In [None]:
text



## 2. Pre-processing


### 2.1 Clean text

#### 2.1.1 Clean all punctuation

In [None]:
# Remove Gutenberg header and footer (if present)
text_no_punctuation = re.sub(r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text, flags=re.DOTALL)
text_no_punctuation = re.sub(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text_no_punctuation, flags=re.DOTALL)

# Remove excessive newlines
text_no_punctuation = re.sub(r"\n{2,}", " ", text_no_punctuation).strip()
# Remove non-alphabet characters:
text_no_punctuation = re.sub(r'[^a-zA-Z\s]', '', text_no_punctuation)
# Remove HTML tags:
text_no_punctuation = re.sub(r'<.*?>', '', text_no_punctuation)
# Remove space:
text_no_punctuation = re.sub(r'\n', ' ', text_no_punctuation)

In [None]:
text_no_punctuation



#### 2.1.2 Clean some punctutation

- Leave full stop and question mark to train the model, aiming to achieve the human-level sentences

In [None]:
# Remove Gutenberg header and footer (if present)
text_with_punctuation = re.sub(r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text, flags=re.DOTALL)
text_with_punctuation = re.sub(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text_with_punctuation, flags=re.DOTALL)

# Remove excessive newlines and multiple space
text_with_punctuation = re.sub(r"\n{2,}", " ", text_with_punctuation).strip()
text_with_punctuation = re.sub(r"\s{2,}", " ", text_with_punctuation)
text_with_punctuation = re.sub(r'\s+', ' ', text_with_punctuation)
# Remove non-alphabet characters except full stop and question mark
text_with_punctuation = re.sub(r"[^a-zA-Z\s.?\']", "", text_with_punctuation)
# Remove HTML tags
text_with_punctuation = re.sub(r"<.*?>", "", text_with_punctuation)
# Add spaces around punctuation to separate them
text_with_punctuation = re.sub(r"([.?])", r" \1", text_with_punctuation)

In [None]:
text_with_punctuation



### 2.2 Word Tokenization

#### 2.2.1 Tokenizer - no punctuation

In [None]:
# Tokenize text
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts([text_no_punctuation])
word_index_no = tokenizer.word_index
vocabulary_size_keras = len(word_index_no) + 1

# Save the tokenizer
with open('/content/drive/MyDrive/Deep learning model/Task3/tokenizer_no_punctuation.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
print("Vobaculary size:", vocabulary_size_keras)

Vobaculary size: 6264


In [None]:
word_index_no

{'the': 1,
 'to': 2,
 'a': 3,
 'i': 4,
 'of': 5,
 'and': 6,
 'in': 7,
 'was': 8,
 'he': 9,
 'you': 10,
 'it': 11,
 'that': 12,
 'is': 13,
 'his': 14,
 'poirot': 15,
 'with': 16,
 'not': 17,
 'had': 18,
 'but': 19,
 'my': 20,
 'at': 21,
 'as': 22,
 'have': 23,
 'on': 24,
 'we': 25,
 'for': 26,
 'me': 27,
 'be': 28,
 'him': 29,
 'her': 30,
 'one': 31,
 'no': 32,
 'from': 33,
 'said': 34,
 'there': 35,
 'what': 36,
 'she': 37,
 'been': 38,
 'all': 39,
 'were': 40,
 'by': 41,
 'do': 42,
 'this': 43,
 'will': 44,
 'mr': 45,
 'an': 46,
 'they': 47,
 'so': 48,
 'then': 49,
 'man': 50,
 'out': 51,
 'are': 52,
 'up': 53,
 'would': 54,
 'your': 55,
 'did': 56,
 'see': 57,
 'about': 58,
 'little': 59,
 'which': 60,
 'if': 61,
 'has': 62,
 'us': 63,
 'who': 64,
 'some': 65,
 'well': 66,
 'when': 67,
 'our': 68,
 'friend': 69,
 'now': 70,
 'must': 71,
 'or': 72,
 'into': 73,
 'know': 74,
 'hastings': 75,
 'time': 76,
 'them': 77,
 'very': 78,
 'mrs': 79,
 'down': 80,
 'door': 81,
 'two': 82,
 'its'

#### 2.2.2 Tokenizer - with punctuation

In [None]:
# Tokenize text
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text_with_punctuation])
word_index_punc = tokenizer.word_index
vocabulary_size_punc = len(word_index_punc) + 1

# Save the tokenizer
with open('/content/drive/MyDrive/Deep learning model/Task3/tokenizer_with_punctuation.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
print("Vobaculary size:", vocabulary_size_punc)

Vobaculary size: 6272


In [None]:
word_index_punc

{'.': 1,
 'the': 2,
 'to': 3,
 'a': 4,
 'i': 5,
 'of': 6,
 'and': 7,
 'in': 8,
 'was': 9,
 'he': 10,
 'you': 11,
 'it': 12,
 'that': 13,
 '?': 14,
 'is': 15,
 'his': 16,
 'poirot': 17,
 'with': 18,
 'not': 19,
 'had': 20,
 'but': 21,
 'my': 22,
 'at': 23,
 'as': 24,
 'have': 25,
 'on': 26,
 'we': 27,
 'for': 28,
 'me': 29,
 'be': 30,
 'him': 31,
 'her': 32,
 'one': 33,
 'no': 34,
 'from': 35,
 'said': 36,
 'there': 37,
 'what': 38,
 'she': 39,
 'been': 40,
 'all': 41,
 'were': 42,
 'by': 43,
 'do': 44,
 'this': 45,
 'will': 46,
 'mr': 47,
 'an': 48,
 'they': 49,
 'so': 50,
 'then': 51,
 'man': 52,
 'out': 53,
 'are': 54,
 'up': 55,
 'would': 56,
 'your': 57,
 'did': 58,
 'see': 59,
 'about': 60,
 'little': 61,
 'which': 62,
 'if': 63,
 'has': 64,
 'us': 65,
 'who': 66,
 'some': 67,
 'well': 68,
 'when': 69,
 'our': 70,
 'friend': 71,
 'now': 72,
 'must': 73,
 'or': 74,
 'into': 75,
 'know': 76,
 'hastings': 77,
 'time': 78,
 'them': 79,
 'very': 80,
 'mrs': 81,
 'down': 82,
 'door': 83

### 2.3 Prepare data for training

#### 2.3.1 Data preparation - No punctuation

In [None]:
# Tokenize the text into sequences
maxlen = 10  # Sequence length
step = 1  # Shift the window by 1 word

# Prepare training sequences
sequences = []

# Split the word into list
words = text_no_punctuation.split()

# Create sequences of maxlen words
for i in range(0, len(words) - maxlen, step):
    sequences.append([word_index_no[word] for word in words[i: i + maxlen]])

# Prepare X (inputs) and y (next word predictions)
x_no = np.array(sequences)
y_no = np.array([word_index_no[words[i + maxlen]] for i in range(0, len(words) - maxlen, step)])

# Print the prepared sequences and next words
print("Sequences (x):")
print(x_no)
print("\nNext words (y):")
print(y_no)

Sequences (x):
[[ 351    5    1 ... 1098   41    1]
 [   5    1 1571 ...   41    1  167]
 [   1 1571 1572 ...    1  167 3070]
 ...
 [1573    5   15 ...  394    5    1]
 [   5   15 1098 ...    5    1 1571]
 [  15 1098   41 ...    1 1571 1572]]

Next words (y):
[ 167 3070    1 ... 1571 1572 1573]


#### 2.3.2 Data preparation - With punctuation

In [None]:
# Tokenize the text into sequences
maxlen = 10  # Sequence length
step = 1  # Shift the window by 1 word

# Prepare training sequences
sequences = []

# Split the word into list
words = text_with_punctuation.split()

# Create sequences of maxlen words
for i in range(0, len(words) - maxlen, step):
    # Use get to avoid KeyError for OOV words
    sequences.append([word_index_punc.get(word, 0) for word in words[i: i + maxlen]])

# Prepare X (inputs) and y (next word predictions)
x_punc = np.array(sequences)
y_punc = np.array([word_index_punc.get(words[i + maxlen], 0) for i in range(0, len(words) - maxlen, step)])

# Print the prepared sequences and next words
print("Sequences (x):")
print(x_punc)
print("\nNext words (y):")
print(y_punc)

Sequences (x):
[[ 353    6    2 ... 1101   43    2]
 [   6    2 1574 ...   43    2  169]
 [   2 1574 1575 ...    2  169 3075]
 ...
 [1576    6   17 ...  396    6    2]
 [   6   17 1101 ...    6    2 1574]
 [  17 1101   43 ...    2 1574 1575]]

Next words (y):
[ 169 3075    2 ... 1574 1575 1576]


## 3. Model trianing - No punctuation

### 3.1 Optimizer selection

#### Adam

In [None]:
def model_builder_optimizer():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 512))
  model.add(LSTM(512, return_sequences=True))
  model.add(LSTM(256))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = Adam(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Dynamic change learning rate if validation loss does not improve
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

In [None]:
# Build the model
model = model_builder_optimizer()

# Train the model
model.fit(x_no, y_no, epochs=20, validation_split=0.2, callbacks=[stop_early, reduce_lr], batch_size=64)

Epoch 1/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 18ms/step - accuracy: 0.0547 - loss: 6.9399 - val_accuracy: 0.0609 - val_loss: 6.6782 - learning_rate: 0.0010
Epoch 2/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - accuracy: 0.0599 - loss: 6.3506 - val_accuracy: 0.0623 - val_loss: 6.6761 - learning_rate: 0.0010
Epoch 3/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.0654 - loss: 6.1168 - val_accuracy: 0.0755 - val_loss: 6.6104 - learning_rate: 0.0010
Epoch 4/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.0783 - loss: 5.8647 - val_accuracy: 0.0795 - val_loss: 6.7123 - learning_rate: 0.0010
Epoch 5/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.0943 - loss: 5.6509 - val_accuracy: 0.0911 - val_loss: 6.7728 - learning_rate: 0.0010
Epoch 6/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x7b91c59309d0>

#### RMSprop

In [None]:
def model_builder_optimizer():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 512))
  model.add(LSTM(512, return_sequences=True))
  model.add(LSTM(256))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = RMSprop(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Dynamic change learning rate if validation loss does not improve
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

In [None]:
# Build the model
model = model_builder_optimizer()

# Train the model
model.fit(x_no, y_no, epochs=20, validation_split=0.2, callbacks=[stop_early, reduce_lr], batch_size=64)

Epoch 1/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.0556 - loss: 6.9920 - val_accuracy: 0.0609 - val_loss: 6.5935 - learning_rate: 0.0010
Epoch 2/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.0610 - loss: 6.4823 - val_accuracy: 0.0722 - val_loss: 6.4760 - learning_rate: 0.0010
Epoch 3/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.0662 - loss: 6.3402 - val_accuracy: 0.0776 - val_loss: 6.3524 - learning_rate: 0.0010
Epoch 4/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.0710 - loss: 6.1466 - val_accuracy: 0.0825 - val_loss: 6.2874 - learning_rate: 0.0010
Epoch 5/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.0835 - loss: 6.0334 - val_accuracy: 0.0858 - val_loss: 6.2987 - learning_rate: 0.0010
Epoch 6/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x7b9115b6aa10>

#### SGD

In [None]:
def model_builder_optimizer():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 512))
  model.add(LSTM(512, return_sequences=True))
  model.add(LSTM(256))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = SGD(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Dynamic change learning rate if validation loss does not improve
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

In [None]:
# Build the model
model = model_builder_optimizer()

# Train the model
model.fit(x_no, y_no, epochs=20, validation_split=0.2, callbacks=[stop_early, reduce_lr], batch_size=64)

Epoch 1/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.0472 - loss: 8.7411 - val_accuracy: 0.0609 - val_loss: 8.7366 - learning_rate: 0.0010
Epoch 2/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.0594 - loss: 8.7352 - val_accuracy: 0.0609 - val_loss: 8.7305 - learning_rate: 0.0010
Epoch 3/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.0587 - loss: 8.7293 - val_accuracy: 0.0609 - val_loss: 8.7243 - learning_rate: 0.0010
Epoch 4/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.0586 - loss: 8.7232 - val_accuracy: 0.0609 - val_loss: 8.7181 - learning_rate: 0.0010
Epoch 5/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.0598 - loss: 8.7169 - val_accuracy: 0.0609 - val_loss: 8.7118 - learning_rate: 0.0010
Epoch 6/20
[1m656/656[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7b90340baa10>

#### **Discussion**

The results indicate that the Adam optimizer leads to overfitting after a few epochs, while SGD struggles to improve accuracy. Therefore, RMSprop is chosen as the optimizer, as it provides a better balance and improves validation accuracy.

### 3.2 LSTM model

#### Hyperparameter tuning

In [None]:
def build_model_tuning(hp):
    model = Sequential()

    # Choose an optimal embedding dimension between 64 and 256
    embed_dim = hp.Int('embedding_dim', min_value=64, max_value=256, step=32)
    model.add(Embedding(input_dim=vocabulary_size_keras, output_dim=embed_dim))

    # Choose an optimal number of LSTM units between 64 and 512
    lstm_units = hp.Int('lstm_units', min_value=64, max_value=512, step=64)
    model.add(LSTM(units=lstm_units, return_sequences=True))

    # Choose an optimal number of LSTM units between 64 and 512
    lstm_units2 = hp.Int('lstm_units2', min_value=64, max_value=512, step=64)
    model.add(LSTM(units=lstm_units2))

    # Dense layers
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(vocabulary_size_keras, activation='softmax'))

    model.compile(optimizer=RMSprop(learning_rate=0.001, clipvalue=5.0),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
tuner = kt.Hyperband(build_model_tuning, objective='val_accuracy',
                     max_epochs=20, factor=3, directory=f'/content/',
                     project_name=f'lstm')

tuner.search(x_no, y_no, epochs=20, validation_split=0.2, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameters for LSTM: {best_hps.values}")

best_model = tuner.get_best_models(num_models=1)[0]
val_loss, val_acc = best_model.evaluate(x_no, y_no, verbose=0)
print(f"Validation accuracy for LSTM: {val_acc:.4f}\n")

Trial 30 Complete [00h 02m 20s]
val_accuracy: 0.1030014306306839

Best val_accuracy So Far: 0.10404954850673676
Total elapsed time: 00h 33m 50s
Best hyperparameters for LSTM: {'embedding_dim': 192, 'lstm_units': 448, 'lstm_units2': 384, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0012'}
Validation accuracy for LSTM: 0.1379



#### Trian model

In [None]:
def model_builder():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 192))
  model.add(LSTM(448, return_sequences=True))
  model.add(LSTM(384))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = RMSprop(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Build the model using best hyperparameters
model = model_builder()

# Train and Save model
checkpoint = ModelCheckpoint("/content/task3_lstm_nopunc_model.keras", save_best_only=True, monitor="loss", mode="min")
history = model.fit(x_no, y_no, epochs=100, validation_split=0.1, callbacks=[checkpoint], batch_size=64)

Epoch 1/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.0588 - loss: 7.0049 - val_accuracy: 0.0589 - val_loss: 6.6050
Epoch 2/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.0613 - loss: 6.4942 - val_accuracy: 0.0705 - val_loss: 6.4375
Epoch 3/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.0707 - loss: 6.3099 - val_accuracy: 0.0760 - val_loss: 6.3326
Epoch 4/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.0738 - loss: 6.1573 - val_accuracy: 0.0789 - val_loss: 6.2779
Epoch 5/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.0835 - loss: 6.0396 - val_accuracy: 0.0861 - val_loss: 6.2621
Epoch 6/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.0921 - loss: 5.9745 - val_accuracy: 0.0979 - val_loss: 6.1917
Epoch 7/100
[1

#### Evaluation

In [None]:
# Load model
model = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_lstm_nopunc_model.keras")
model.summary()

In [None]:
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_no_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
def generate_text(model, tokenizer, input_text, num_gen_words):

    output_text = []
    for i in range(num_gen_words):

        # Predict word
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=10, truncating='pre')
        prob_index = model.predict(pad_encoded, verbose=0)[0]

        # Map highest probability index with word
        highest_prob_index = np.argmax(prob_index)
        pred_word = tokenizer.index_word.get(highest_prob_index, '')

        # Update the input text for the next prediction
        input_text = input_text + ' ' + pred_word

        # Output text
        output_text.append(pred_word)

    return " ".join(output_text)

In [None]:
# Seed text:
'''Original: After a short interval, the American film star was ushered in, and we rose to our feet.'''

seed_text = "After a short interval, the American film star was ushered "
num_gen_words = 10
generated_text = generate_text(model, tokenizer, seed_text, num_gen_words=num_gen_words)
print(generated_text)

in and we rose and our eyes minor on a


### 3.3 GRU model

#### Hyperparameter tuning

In [None]:
def build_model_tuning(hp):
    model = Sequential()

    # Choose an optimal embedding dimension between 64 and 512
    embed_dim = hp.Int('embedding_dim', min_value=64, max_value=512, step=32)
    model.add(Embedding(input_dim=vocabulary_size_keras, output_dim=embed_dim))

    # Choose an optimal number of GRU units between 64 and 512
    gru_units = hp.Int('gru_units', min_value=64, max_value=512, step=64)
    model.add(GRU(units=gru_units, return_sequences=True))

    # Choose an optimal number of GRU units between 128 and 512
    gru_units2 = hp.Int('gru_units2', min_value=64, max_value=512, step=64)
    model.add(GRU(units=gru_units2))

    # Dense layers
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(vocabulary_size_keras, activation='softmax'))

    model.compile(optimizer=RMSprop(learning_rate=0.001, clipvalue=5.0),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
tuner = kt.Hyperband(build_model_tuning, objective='val_accuracy',
                     max_epochs=20, factor=3, directory=f'/content/',
                     project_name=f'gru')

tuner.search(x_no, y_no, epochs=20, validation_split=0.2, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameters for GRU: {best_hps.values}")

best_model = tuner.get_best_models(num_models=1)[0]
val_loss, val_acc = best_model.evaluate(x_no, y_no, verbose=0)
print(f"Validation accuracy for GRU: {val_acc:.4f}\n")

Trial 30 Complete [00h 01m 45s]
val_accuracy: 0.10557408630847931

Best val_accuracy So Far: 0.10805145651102066
Total elapsed time: 00h 30m 20s
Best hyperparameters for GRU: {'embedding_dim': 256, 'gru_units': 128, 'gru_units2': 256, 'tuner/epochs': 7, 'tuner/initial_epoch': 0, 'tuner/bracket': 1, 'tuner/round': 0}
Validation accuracy for GRU: 0.1284



#### Trian model

In [None]:
def model_builder():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 256))
  model.add(GRU(128, return_sequences=True))
  model.add(GRU(256))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = RMSprop(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Build the model using best hyperparameters
model = model_builder()

# Train and Save model
checkpoint = ModelCheckpoint("/content/task3_gru_nopunc_model.keras", save_best_only=True, monitor="loss", mode="min")
history = model.fit(x_no, y_no, epochs=100, validation_split=0.1, callbacks=[checkpoint], batch_size=64)

Epoch 1/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.0565 - loss: 6.9672 - val_accuracy: 0.0758 - val_loss: 6.4481
Epoch 2/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.0746 - loss: 6.2958 - val_accuracy: 0.0932 - val_loss: 6.2500
Epoch 3/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.0958 - loss: 5.9987 - val_accuracy: 0.0985 - val_loss: 6.2034
Epoch 4/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.1114 - loss: 5.8530 - val_accuracy: 0.1010 - val_loss: 6.2595
Epoch 5/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.1213 - loss: 5.7135 - val_accuracy: 0.1056 - val_loss: 6.1868
Epoch 6/100
[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.1349 - loss: 5.5775 - val_accuracy: 0.1088 - val_loss: 6.1817
Epoch 7/100
[1m738/73

#### Evaluation

In [None]:
# Load model
model = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_gru_nopunc_model.keras")
model.summary()

In [None]:
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_no_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
def generate_text(model, tokenizer, input_text, num_gen_words):

    output_text = []
    for i in range(num_gen_words):

        # Predict word
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=10, truncating='pre')
        prob_index = model.predict(pad_encoded, verbose=0)[0]

        # Map highest probability index with word
        highest_prob_index = np.argmax(prob_index)
        pred_word = tokenizer.index_word.get(highest_prob_index, '')

        # Update the input text for the next prediction
        input_text = input_text + ' ' + pred_word

        # Output text
        output_text.append(pred_word)

    return " ".join(output_text)

In [None]:
# Seed text:
'''Original: After a short interval, the American film star was ushered in, and we rose to our feet.'''

seed_text = "After a short interval, the American film star was ushered"
num_gen_words = 10
generated_text = generate_text(model, tokenizer, seed_text, num_gen_words=num_gen_words)
print(generated_text)

coming from limited he channel the city of whispered landlady


## 4. Model trianing - With punctuation

### 4.1 GRU model

#### Hyperparameter tuning

In [None]:
def build_model_tuning(hp):
    model = Sequential()

    # Choose an optimal embedding dimension between 64 and 512
    embed_dim = hp.Int('embedding_dim', min_value=64, max_value=512, step=32)
    model.add(Embedding(input_dim=vocabulary_size_punc, output_dim=embed_dim))

    # Choose an optimal number of GRU units between 64 and 512
    gru_units = hp.Int('gru_units', min_value=64, max_value=512, step=64)
    model.add(GRU(units=gru_units, return_sequences=True))

    # Choose an optimal number of GRU units between 128 and 512
    gru_units2 = hp.Int('gru_units2', min_value=64, max_value=512, step=64)
    model.add(GRU(units=gru_units2))

    # Dense layers
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(vocabulary_size_punc, activation='softmax'))

    model.compile(optimizer=RMSprop(learning_rate=0.001, clipvalue=5.0),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Early stopping callback
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
tuner = kt.Hyperband(build_model_tuning, objective='val_accuracy',
                     max_epochs=20, factor=3, directory=f'/content/',
                     project_name=f'gru_punc')

tuner.search(x_punc, y_punc, epochs=20, validation_split=0.2, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameters for GRU (with punctuation): {best_hps.values}")

best_model = tuner.get_best_models(num_models=1)[0]
val_loss, val_acc = best_model.evaluate(x_punc, y_punc, verbose=0)
print(f"Validation accuracy for GRU (with punctuation): {val_acc:.4f}\n")

Trial 30 Complete [00h 01m 54s]
val_accuracy: 0.14254538714885712

Best val_accuracy So Far: 0.1469099223613739
Total elapsed time: 00h 32m 24s
Best hyperparameters for GRU (with punctuation): {'embedding_dim': 512, 'gru_units': 256, 'gru_units2': 448, 'tuner/epochs': 7, 'tuner/initial_epoch': 3, 'tuner/bracket': 2, 'tuner/round': 1, 'tuner/trial_id': '0000'}
Validation accuracy for GRU (with punctuation): 0.1743



#### Trian model

In [None]:
def model_builder():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_punc, 512))
  model.add(GRU(256, return_sequences=True))
  model.add(GRU(448))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_punc, activation='softmax'))

  # Complie
  model.compile(optimizer = RMSprop(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Build the model using best hyperparameters
model = model_builder()

# Train and Save model
checkpoint = ModelCheckpoint("/content/task3_gru_withpunc_model.keras",
                             save_best_only=True, monitor="loss", mode="min")
history = model.fit(x_punc, y_punc, epochs=100, validation_split=0.1, callbacks=[checkpoint], batch_size=64)

Epoch 1/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.0715 - loss: 6.7084 - val_accuracy: 0.1121 - val_loss: 6.1211
Epoch 2/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1117 - loss: 5.9316 - val_accuracy: 0.1285 - val_loss: 5.9789
Epoch 3/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1378 - loss: 5.6609 - val_accuracy: 0.1273 - val_loss: 5.9147
Epoch 4/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1562 - loss: 5.4825 - val_accuracy: 0.1391 - val_loss: 5.9572
Epoch 5/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1737 - loss: 5.3271 - val_accuracy: 0.1358 - val_loss: 5.9429
Epoch 6/100
[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1814 - loss: 5.2503 - val_accuracy: 0.1397 - val_loss: 5.9788
Epoch 7/100
[1

#### Evaluation

In [None]:
# Load model
model = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_gru_withpunc_model.keras")
model.summary()

In [None]:
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_with_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
def generate_text(model, tokenizer, input_text, num_gen_words):

    output_text = []
    for i in range(num_gen_words):

        # Predict word
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=10, truncating='pre')
        prob_index = model.predict(pad_encoded, verbose=0)[0]

        # Map highest probability index with word
        highest_prob_index = np.argmax(prob_index)
        pred_word = tokenizer.index_word.get(highest_prob_index, '')

        # Update the input text for the next prediction
        input_text = input_text + ' ' + pred_word

        # Output text
        output_text.append(pred_word)

    return " ".join(output_text)

In [None]:
# Seed text:
'''Original: After a short interval, the American film star was ushered in, and we rose to our feet.'''

seed_text = "After a short interval, the American film star was "
num_gen_words = 10
generated_text = generate_text(model, tokenizer, seed_text, num_gen_words=num_gen_words)
print(generated_text)

doubtless six itself . from too in vain we are


## 5. Comparison with ChatGPT



Text generator

In [None]:
def generate_text(model, tokenizer, input_text, num_gen_words):

    output_text = []
    for i in range(num_gen_words):

        # Predict word
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=10, truncating='pre')
        prob_index = model.predict(pad_encoded, verbose=0)[0]

        # Map highest probability index with word
        highest_prob_index = np.argmax(prob_index)
        pred_word = tokenizer.index_word.get(highest_prob_index, '')

        # Update the input text for the next prediction
        input_text = input_text + ' ' + pred_word

        # Output text
        output_text.append(pred_word)

    return " ".join(output_text)

**Original text**

At Bertram’s Hotel by Agatha Christie
- During the war, houses were demolished on the right of it, and a little farther down on the left of it, but Bertram’s itself remained unscathed.
- Naturally it could not escape being, as house agents would say, scratched, bruised and marked, but by the expenditure of only a reasonable amount of money it was restored to its original condition.

#### **5.1 LSTM model**

In [None]:
# Load model
model_lstm = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_lstm_nopunc_model.keras")
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_no_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
# Seed text and references
seed_text = "During the war, houses were demolished on the right of it, and"
reference = "a little farther down on the left of it, but"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_lstm, tokenizer, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  omurphy got down the steps of him on a kind

ROUGE Scores:


{'rouge1': Score(precision=0.5, recall=0.5, fmeasure=0.5),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.3, recall=0.3, fmeasure=0.3)}

In [None]:
# Seed text and references
seed_text = "Naturally it could not escape being, as house agents would say,"
reference = "scratched, bruised and marked, but by the expenditure of only"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_lstm, tokenizer, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  i understand that he had a man having gone at

ROUGE Scores:


{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}

#### **5.2 GRU model - no punctuation**

In [None]:
# Load model
model_gru = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_gru_nopunc_model.keras")
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_no_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
# Seed text and references
seed_text = "During the war, houses were demolished on the right of it, and"
reference = "a little farther down on the left of it, but"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_gru, tokenizer, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  hidden somewhere by poirot shook his head no less one

ROUGE Scores:


{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}

In [None]:
# Seed text and references
seed_text = "Naturally it could not escape being, as house agents would say,"
reference = "scratched, bruised and marked, but by the expenditure of only"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_gru, tokenizer, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  the doctor looked alone oh poirot seemed there said the

ROUGE Scores:


{'rouge1': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

#### **5.3 GRU model - with punctuation**

In [None]:
# Load model
model_gru_punc = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_gru_withpunc_model.keras")
# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_no_punctuation.pkl', 'rb') as f:
    tokenizer_punc = pickle.load(f)

In [None]:
# Seed text and references
seed_text = "During the war, houses were demolished on the right of it, and"
reference = "a little farther down on the left of it, but"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_gru_punc, tokenizer_punc, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  i deal it time is them only nothing left the

ROUGE Scores:


{'rouge1': Score(precision=0.3, recall=0.3, fmeasure=0.3),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

In [None]:
# Seed text and references
seed_text = "Naturally it could not escape being, as house agents would say,"
reference = "scratched, bruised and marked, but by the expenditure of only"

# Generating word
num_gen_words = 10
generated_text = generate_text(model_gru_punc, tokenizer_punc, seed_text, num_gen_words=num_gen_words)
print("Predicted word: ", generated_text)

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores

Predicted word:  doubtless the out poirot star have is traitor big poirot

ROUGE Scores:


{'rouge1': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

#### **5.4 ChatGPT result**

***Prompt: Please predict next 10 words***

1) **Seed text**: During the war, houses were demolished on the right of it, and

Result: new buildings were constructed to replace the damaged structures, while others

2) **Seed text**: Naturally it could not escape being, as house agents would say,

**Result**: a desirable residence with a rich historical background and unique charm.

In [None]:
# Seed text and references
seed_text = "During the war, houses were demolished on the right of it, and"
reference = "a little farther down on the left of it, but"

# Generating word
generate_text = "new buildings were constructed to replace the damaged structures, while others"

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores


ROUGE Scores:


{'rouge1': Score(precision=0.2, recall=0.2, fmeasure=0.20000000000000004),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

In [None]:
# Seed text and references
seed_text = "Naturally it could not escape being, as house agents would say,"
reference = "scratched, bruised and marked, but by the expenditure of only"

# Generating word
generate_text = "a desirable residence with a rich historical background and unique charm."

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores


ROUGE Scores:


{'rouge1': Score(precision=0.15789473684210525, recall=0.3, fmeasure=0.20689655172413793),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.15789473684210525, recall=0.3, fmeasure=0.20689655172413793)}


***Prompt: Please predict next 10 words and write in Agatha Christie way***

1) **Seed text:** During the war, houses were demolished on the right of it, and  

**Result:** the remnants stood solemnly, whispering echoes of lives once lived.  

2) **Seed text:** Naturally it could not escape being, as house agents would say,  

**Result:** “a residence of character, with an air of quiet dignity.”  

In [None]:
# Seed text and references
seed_text = "During the war, houses were demolished on the right of it, and"
reference = "a little farther down on the left of it, but"

# Generating word
generate_text = "the remnants stood solemnly, whispering echoes of lives once lived."

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores


ROUGE Scores:


{'rouge1': Score(precision=0.2, recall=0.2, fmeasure=0.20000000000000004),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

In [None]:
# Seed text and references
seed_text = "Naturally it could not escape being, as house agents would say,"
reference = "scratched, bruised and marked, but by the expenditure of only"

# Generating word
generate_text = "a residence of character, with an air of quiet dignity."

# Create a ROUGE scorer (using 'rouge1', 'rouge2', and 'rougeL')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, generated_text)

# Print the scores
print("\nROUGE Scores:")
scores


ROUGE Scores:


{'rouge1': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.1, recall=0.1, fmeasure=0.10000000000000002)}

#### **Discussion**

- The ROUGE score is used to evaluate the quality of generated text by measuring its similarity to the reference text.
- LSTM and GRU models that train with text that remove punctuation show inconsistency in their ROUGE scores, while ChatGPT demonstrates more stability in its results.
- This inconsistency in LSTM and GRU, which are RNN-based models, comes from their difficulty in capturing long-term dependencies, leading to fluctuating ROUGE scores.
- In contrast, ChatGPT performs more consistently due to the transformer architecture, which handles longer-range context more effectively.
- Additionally, training the model with punctuation provides clearer segmentation, which slightly improves text coherence and ROUGE scores. However,using more complex tokenization might help by allowing the model to better handle rare words, subwords, and punctuation, which could lead to more consistent and accurate text generation.
- The difference prompt to ChatGPT will provide the different generated text result. The initial prompt yielded general information, while the second prompt requesting an Agatha Christie style produced a literary response. This shows that ChatGPT has learned from lots of different kinds of writing.
- Therefore, providing more training data, especially data with different styles and types of writing could help the model better recognize text patterns and successfully generate text that is closer to human-level sentences

## 6.  Report

**Types of Approach**

1) Data Preprocessing
  - Text cleaning
    - This experiment compares two preprocessing approaches including removing all punctuation and retaining some punctuation in the text
    - The results indicate that training with punctuation leads to more consistent ROUGE scores, suggesting that punctuation provides useful structural information for the model
  - Tokenization
    - This report uses only the Keras tokenizer for model training because applying Spacy tokenization resulted in a significant number of out-of-vocabulary (OOV) words, leading to blank or missing predicted words.
    - Therefore, increasing the vocabulary size or using subword tokenization may be considered in the future to improve text generation accuracy.

2) Model selection
  - LSTM (Long Short-Term Memory)
    - The result shows instability in ROUGE scores when trained on text without punctuation
  - GRU (Gated Recurrent Unit)
    - The ROUGE scores are slightly improve when trained on text with punctuation
  - Transformer-based models
    - The result shows that ROUGE scores from ChatGPT is more consistancy compared to RNN-based model
    - Therefore, this type of model could improve the performance of text generator since its ability to handle long-range context

---

**Performance Improvement Techniques (Ordered from Most to Least Impactful)**  

1) **Using Transformer-based Models**  
   - Trasformer-based models like ChatGPT shows better performance in in text generation by effectively handling long-range dependencies

2) **Advance Tokenization**  
   - Using subword tokenization could solve the out-of-vocabulary (OOV) issue, allowing the model to handle rare words more effictively.
   - Advanced tokenization also improves punctuation handling, leading to better sentence structure and coherence.

3) **Training with Punctuation**  
   - The experiment shows that models trained with punctuation exhibit more stable ROUGE scores compared to those trained on text with all punctuation removed
   - Therefore, including punctuation in training data with the better tokenization method could improve the performance of model

4) **Increasing training text**  
   - Expanding the training dataset could help the model's performance, reducing overfitting and improving the quality of generated text


## 7. Demo session

Install library

Load data

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import pandas as pd

# Load and preprocess the text
with open("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/61262-0.txt", 'r', encoding='utf-8') as file:
    text = file.read().lower()

In [16]:
!pip install -q -U keras-tuner
!pip install rouge-score



In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, LayerNormalization, BatchNormalization
import keras_tuner as kt
import re
from keras.optimizers import Adam, RMSprop, SGD
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
import spacy
from rouge_score import rouge_scorer

#### 7.1 Predict text

In [18]:
# Load model
model = load_model("/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/task3_gru_withpunc_model.keras")

# Load the tokenizer using pickle
with open('/content/drive/MyDrive/Coursework_chatwipa/TASK 3/Model/tokenizer_with_punctuation.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Text generator function
def generate_text(model, tokenizer, input_text, num_gen_words):

    output_text = []
    for i in range(num_gen_words):

        # Predict word
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=10, truncating='pre')
        prob_index = model.predict(pad_encoded, verbose=0)[0]

        # Map highest probability index with word
        highest_prob_index = np.argmax(prob_index)
        pred_word = tokenizer.index_word.get(highest_prob_index, '')

        # Update the input text for the next prediction
        input_text = input_text + ' ' + pred_word

        # Output text
        output_text.append(pred_word)

    return " ".join(output_text)

In [19]:
# Text
seed_text = "Once upon a midnight dreary, I was seeing the"

# Number of word to generate
num_gen_words = 20

# Generate text
generated_text = generate_text(model, tokenizer, seed_text, num_gen_words)
print(generated_text)

stared in a ivy and you see the fashionable frenchman that a uncommon point of mr . vavasour who is


#### 7.2 Training one epoch

Clean all punctuation

In [20]:
# Remove Gutenberg header and footer (if present)
text_no_punctuation = re.sub(r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text, flags=re.DOTALL)
text_no_punctuation = re.sub(r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*", "", text_no_punctuation, flags=re.DOTALL)

# Remove excessive newlines
text_no_punctuation = re.sub(r"\n{2,}", " ", text_no_punctuation).strip()
# Remove non-alphabet characters:
text_no_punctuation = re.sub(r'[^a-zA-Z\s]', '', text_no_punctuation)
# Remove HTML tags:
text_no_punctuation = re.sub(r'<.*?>', '', text_no_punctuation)
# Remove space:
text_no_punctuation = re.sub(r'\n', ' ', text_no_punctuation)

In [21]:
text_no_punctuation



Tokenize text

In [22]:
# Tokenize text
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts([text_no_punctuation])
word_index_no = tokenizer.word_index
vocabulary_size_keras = len(word_index_no) + 1

In [23]:
print("Vobaculary size:", vocabulary_size_keras)

Vobaculary size: 6264


Data preparation - No punctuation

In [24]:
# Tokenize the text into sequences
maxlen = 10  # Sequence length
step = 1  # Shift the window by 1 word

# Prepare training sequences
sequences = []

# Split the word into list
words = text_no_punctuation.split()

# Create sequences of maxlen words
for i in range(0, len(words) - maxlen, step):
    sequences.append([word_index_no[word] for word in words[i: i + maxlen]])

# Prepare X (inputs) and y (next word predictions)
x_no = np.array(sequences)
y_no = np.array([word_index_no[words[i + maxlen]] for i in range(0, len(words) - maxlen, step)])

# Print the prepared sequences and next words
print("Sequences (x):")
print(x_no)
print("\nNext words (y):")
print(y_no)

Sequences (x):
[[ 351    5    1 ... 1098   41    1]
 [   5    1 1571 ...   41    1  167]
 [   1 1571 1572 ...    1  167 3070]
 ...
 [1573    5   15 ...  394    5    1]
 [   5   15 1098 ...    5    1 1571]
 [  15 1098   41 ...    1 1571 1572]]

Next words (y):
[ 167 3070    1 ... 1571 1572 1573]


In [25]:
def model_builder():

  # Model
  model = Sequential()
  model.add(Embedding(vocabulary_size_keras, 256))
  model.add(GRU(128, return_sequences=True))
  model.add(GRU(256))
  model.add(Dense(512, activation='relu'))
  model.add(Dense(256, activation='relu'))
  model.add(Dense(vocabulary_size_keras, activation='softmax'))

  # Complie
  model.compile(optimizer = RMSprop(learning_rate = 0.001, clipvalue=5.0),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [26]:
# Build the model using best hyperparameters
model = model_builder()
model.fit(x_no, y_no, epochs=1, validation_split=0.1, batch_size=64)

[1m738/738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 47ms/step - accuracy: 0.0585 - loss: 6.9652 - val_accuracy: 0.0705 - val_loss: 6.4221


<keras.src.callbacks.history.History at 0x7831146fc2d0>