<a href="https://colab.research.google.com/github/engmrgh/msc-degree/blob/nlp%2Fproject/2nd_semester/nlp/project/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend
from datetime import datetime
from datasets import Dataset, load_dataset
from transformers import GPT2Config, AutoTokenizer, TFGPT2LMHeadModel

In [None]:
# Try to run on TPU if available
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
except ValueError:
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [None]:
PATH_BASE = "/content/"

# Dataset

In [None]:
!git clone https://github.com/pengbaolin/SC-GPT.git

fatal: destination path 'SC-GPT' already exists and is not an empty directory.


In [None]:
train_dfs = []
test_dfs = []

for domain in ['attraction', 'hotel', 'laptop', 'restaurant', 'taxi', 'train', 'tv']:
    train_dfs.append(pd.read_json(f'/content/SC-GPT/data/{domain}/train.json'))
    test_dfs.append(pd.read_json(f'/content/SC-GPT/data/{domain}/test.json'))

train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)

train_df.columns = ['dialogue act', 'response', 'response-repeated']
train_df.drop(labels=['response-repeated'], inplace=True, axis=1)
test_df.columns = ['dialogue act', 'response', 'response-repeated']
test_df.drop(labels=['response-repeated'], inplace=True, axis=1)

In [None]:
train_ds = Dataset.from_pandas(train_df)
train_ds

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__'],
    num_rows: 340
})

In [None]:
test_ds = Dataset.from_pandas(test_df)
test_ds

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__'],
    num_rows: 3310
})

In [None]:
MAX_TOKENS = 128
BACT_TOKEN = "<|dialogue_act|>"
EACT_TOKEN = "<|end_dialogue_act|>"
BRESP_TOKEN = "<|dialogue_resp|>"
ERESP_TOKEN = "<|end_dialogue_resp|>"
BOS_TOKENS = BRESP_TOKEN
EOS_TOKEN = ERESP_TOKEN
PAD_TOKEN = "<|pad|>"

# this will download and initialize the pre trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    bos_token=BOS_TOKENS,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    max_length=MAX_TOKENS,
    is_split_into_words=True,
)
tokenizer.add_tokens([BACT_TOKEN, EACT_TOKEN], special_tokens=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2

In [None]:
output = {}
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples, tokenizer=tokenizer):
    # Add start and end token to each comment
    processed_examples = list()
    for act, response in zip(examples['dialogue act'], examples['response']):
        processed_examples.append(BACT_TOKEN + act + EACT_TOKEN + BRESP_TOKEN + response + ERESP_TOKEN)
    # tokenizer created input_ids and attention_mask as output
    output = tokenizer(
        processed_examples,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
    )
    # shift labels for next token prediction
    # set padding token labels to -100 which is ignored in loss computation
    bresp_token_id = tokenizer.convert_tokens_to_ids(BRESP_TOKEN)
    pad_token_id = tokenizer.pad_token_id

    output["labels"] = [x[x.index(bresp_token_id):] for x in output["input_ids"]]
    output["labels"] = [x + [pad_token_id] * (MAX_TOKENS - len(x) - 1) for x in output["labels"]]
    output["labels"] = [
        [-100 if x == pad_token_id else x for x in y] for y in output["labels"]
    ]
    # truncate input ids and attention mask to account for label shift
    output["input_ids"] = [x[:-1] for x in output["input_ids"]]
    output["attention_mask"] = [x[:-1] for x in output["attention_mask"]]
    return output

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 340
})


In [None]:
train_data = train_ds.map(
    tokenize_function,
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    load_from_cache_file=True,
)
print(train_data)

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 340
})


In [None]:
test_data = test_ds.map(
    tokenize_function,
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    load_from_cache_file=True,
)
print(test_data)

  0%|          | 0/4 [00:00<?, ?ba/s]

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3310
})


In [None]:
train_tensor_inputs = tf.convert_to_tensor(train_data["input_ids"])
train_tensor_labels = tf.convert_to_tensor(train_data["labels"])
train_tensor_mask = tf.convert_to_tensor(train_data["attention_mask"])
train = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask},
        train_tensor_labels,
    )
)

test_tensor_inputs = tf.convert_to_tensor(test_data["input_ids"])
test_tensor_labels = tf.convert_to_tensor(test_data["labels"])
test_tensor_mask = tf.convert_to_tensor(test_data["attention_mask"])
test = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": test_tensor_inputs, "attention_mask": test_tensor_mask},
        test_tensor_labels,
    )
)

# Model

In [None]:
# Model params
BATCH_SIZE_PER_REPLICA = 28
EPOCHS = 6
INITAL_LEARNING_RATE = 0.001
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError as e:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
BUFFER_SIZE = len(train)

# prepare data for consumption
train_ds = (
    train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
)
test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
def customized_sparse_categorical_cross_entropy(y_true, y_pred, from_logits=False, axis=-1):
    cond = (y_true != -100)
    y_pred = tf.convert_to_tensor(y_pred)
    return backend.sparse_categorical_crossentropy(
      y_true[cond], y_pred[cond], from_logits=from_logits, axis=axis)

In [None]:
# Drecreasing learning rate scheduler
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# initialize model, use_cache=False important! else wrong shape at loss calc
with strategy.scope():
    model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2",
        use_cache=False,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss=customized_sparse_categorical_cross_entropy)
    model.summary()

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124443648 
 r)                                                              
                                                                 
Total params: 124,443,648
Trainable params: 124,443,648
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop training when validation acc starts dropping
# Save checkpoint of model after each period
now = datetime.now().strftime("%Y-%m-%d_%H%M")
# Create callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", verbose=1, patience=2, restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        PATH_BASE + "/data/models/" + now + "_GPT2-Model_{epoch:02d}_{val_loss:.4f}.h5",
        monitor="val_loss",
        save_format='tf',
        save_best_only=True,
        verbose=1,
        save_weights_only=True
    ),
]

In [None]:
# Train Model
steps_per_epoch = int(BUFFER_SIZE // BATCH_SIZE)
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
)
hist = model.fit(
    train_ds,
    validation_data=train_ds,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

Model Params:
batch_size: 28
Epochs: 6
Step p. Epoch: 12
Initial Learning rate: 0.001
Epoch 1/6

# SOLOIST

In [1]:
!git clone https://engmrgh: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@github.com/engmrgh/msc-degree.git

Cloning into 'msc-degree'...
remote: Enumerating objects: 3895, done.[K
remote: Counting objects: 100% (1060/1060), done.[K
remote: Compressing objects: 100% (857/857), done.[K
remote: Total 3895 (delta 275), reused 870 (delta 187), pack-reused 2835[K
Receiving objects: 100% (3895/3895), 288.31 MiB | 15.06 MiB/s, done.
Resolving deltas: 100% (333/333), done.
Checking out files: 100% (3131/3131), done.


In [2]:
%cd msc-degree

/content/msc-degree


In [3]:
!git checkout nlp/project

Branch 'nlp/project' set up to track remote branch 'nlp/project' from 'origin'.
Switched to a new branch 'nlp/project'


In [4]:
%cd 2nd_semester/nlp/project

/content/msc-degree/2nd_semester/nlp/project


In [None]:
!apt-get install python3.8

!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2

#check python version
!python --version
#3.9.6

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libpython3.8-minimal libpython3.8-stdlib python3.8-minimal
Suggested packages:
  python3.8-venv binfmt-support
The following NEW packages will be installed:
  libpython3.8-minimal libpython3.8-stdlib python3.8 python3.8-minimal
0 upgraded, 4 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,691 kB of archives.
After this operation, 18.5 MB of additional disk space will be used.
Get:1 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 libpython3.8-minimal amd64 3.8.13-1+bionic2 [762 kB]
Get:2 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 python3.8-minimal amd64 3.8.13-1+bionic2 [1,837 kB]
Get:3 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 l

In [None]:
!apt-get install python3-pip
!python -m pip install --upgrade pip

In [None]:
!pip3 install torch torchdatasets transformers==2.5.1 pytorch-ignite tensorboardX

In [None]:
!python3 train.py