<a href="https://colab.research.google.com/github/csci544projectGroup18/DebaterAI/blob/main/colab/SequenceEncoderBlock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## How to run the code
Go to `Runtime > Change runtime type > Hardware accelerator`, select `GPU`, and click `Save`. Then run the cells from top to bottom.


In [1]:
import os

PROJECT_ROOT_DIR = os.getcwd()
PRETRAINED_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR, "models", "pretrained")
#assert os.path.isdir(PRETRAINED_MODEL_DIR)

#   Path to the directory where the pre-trained model will be saved.
os.environ["HUGGINGFACE_HUB_CACHE"] = PRETRAINED_MODEL_DIR
os.environ["TRANSFORMERS_CACHE"] = PRETRAINED_MODEL_DIR

%pip install transformers
%pip install adapter-transformers 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
#   Output locations, both placed directly under the project root.
#   Neither directory is created nor checked for existence here.
RESULTS_DIR, LOG_DIR = (
    os.path.join(PROJECT_ROOT_DIR, subdir) for subdir in ("results", "logs")
)

In [3]:
import torch
import torch.nn as nn

from transformers import GPT2Model, GPT2Tokenizer
from transformers import MAMConfig

In [4]:
#   GPT-2 tokenizer; padding tokens are appended on the right-hand side of a sequence.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    padding_side="right",
)

#   Stand-alone copy of the pre-trained GPT-2 model.
baseGPT2 = GPT2Model.from_pretrained(
    "gpt2",
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [5]:
#   Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print(DEVICE)

cuda


In [6]:
#   Sequence encoder hyper-parameters
MAX_SEQUENCE_LENGTH = 128          # length every tokenized sequence is padded / truncated to
SEQUENCE_EMEBDDING_SIZE = 1024     # dimension of the final sequence embedding (CNN output channels)
CNN_WINDOW_SIZE = 9                # kernel size of the 1D convolution

#   Adapter used to fine-tune the frozen GPT-2 encoder.
#   The name is only an identifier; its spelling is kept as-is because the same
#   string is used when the adapter is added and activated.
ADAPTER_CONFIG = MAMConfig()
ADAPTER_NAME = "mam_adpater"

In [7]:
class SequenceEncoderBlock(nn.Module):
    '''Sequence encoder block

    Encodes a batch of token sequences into one fixed-size vector per sequence:
    frozen pre-trained GPT-2 with an adapter -> batch normalization of the
    embedding-layer output and of the last hidden states -> concatenation ->
    zeroing of padded positions -> 1D convolution -> max pooling over the
    sequence axis -> batch normalization.

    params: 
        max_sequence_length: Maximum sequence length. Stored on the module for reference;
            pooling runs over whatever sequence length is passed to `forward`, so
            inputs shorter or longer than this value are encoded as well.
        adapter_name: Adapter used for fine-tuning pre-trained encoder
        adapter_config: Adapter config
        cnn_output_channels: Number of output channels of the CNN(=dimension of the sequence embedding)
        cnn_window_size: Window size of the CNN
    '''
    def __init__(
            self, 
            max_sequence_length,
            adapter_name,
            adapter_config,
            cnn_output_channels,
            cnn_window_size
        ):
        super().__init__()

        self.max_sequence_length = max_sequence_length

        #   Pre-trained GPT-2 model
        self.gpt2 = GPT2Model.from_pretrained("gpt2")

        #   Freeze GPT-2 pre-trained parameters
        for param in self.gpt2.parameters():
            param.requires_grad = False

        #   Add adapter to GPT-2
        #   This happens after the freeze, so the loop above does not touch the adapter parameters.
        self.gpt2.add_adapter(adapter_name, config=adapter_config)
        self.gpt2.set_active_adapters(adapter_name)

        hidden_size = self.gpt2.config.hidden_size

        #   CNN layer
        #   Its input is the concatenation of two H-dimensional representations, hence H * 2.
        #   With padding = window // 2 the output keeps the sequence length for odd
        #   window sizes and is one position longer for even ones.
        self.cnn = nn.Conv1d(
            in_channels=hidden_size * 2,
            out_channels=cnn_output_channels,
            kernel_size=cnn_window_size,
            padding=cnn_window_size // 2
        )

        #   Max pooling layer
        #   Adaptive pooling with a single output position takes the maximum over the
        #   whole sequence axis, independent of the actual sequence length.
        self.max_pooling = nn.AdaptiveMaxPool1d(output_size=1)

        #   Batch normalization layers
        self.word_embedding_bn = nn.BatchNorm1d(num_features=hidden_size)
        self.encoder_bn = nn.BatchNorm1d(num_features=hidden_size)
        self.pooling_bn = nn.BatchNorm1d(cnn_output_channels)

    def forward(self, input_ids, attention_mask):
        '''Forward propagation

        params:
            input_ids: Tensor of shape (B, L) containing the input token IDs
            attention_mask: Tensor of shape (B, L) containing the attention mask

        returns:
            Tensor of shape (B, C) containing one sequence embedding per input sequence
        '''
        #   Dimension notations:
        #   B: batch size
        #   L: sequence length
        #   H: hidden size
        #   C: number of output channels of the CNN (also the dimension of the sequence embedding)

        #   Get word embeddings and last hidden states from GPT-2
        outputs = self.gpt2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        word_embeddings = outputs.hidden_states[0]
        #   Dimension: (B, L, H)
        encoder_hidden_states = outputs.last_hidden_state
        #   Dimension: (B, L, H)

        #   Batch normalization
        #   BatchNorm1d expects the feature axis second, hence the permute to (B, H, L) and back.
        #   Normalization runs before masking, so padded positions are part of its input.
        bn_word_embeddings = self.word_embedding_bn(
            word_embeddings.permute(0, 2, 1)
        ).permute(0, 2, 1)
        bn_encoder_hidden_states = self.encoder_bn(
            encoder_hidden_states.permute(0, 2, 1)
        ).permute(0, 2, 1)

        #   Concatenate word embeddings and encoder hidden states
        concat_embeddings = torch.cat((bn_word_embeddings, bn_encoder_hidden_states), dim=-1)
        #   Dimension: (B, L, H * 2)

        #   Apply attention mask to the concatenated sequence representation
        #   The attention mask is reshaped to (B, L, 1) and broadcast over the feature axis,
        #   so the element-wise product zeroes out the padded positions.
        position_mask = attention_mask.unsqueeze(-1).to(concat_embeddings.dtype)
        masked_concat_embeddings = concat_embeddings * position_mask

        #   Apply CNN layer
        cnn_out = self.cnn(masked_concat_embeddings.permute(0, 2, 1))
        #   Dimension: (B, C, L) for odd window sizes, (B, C, L + 1) for even ones

        #   Apply max pooling layer
        #   Only the first L positions are pooled, so an even window size gives the same
        #   result as pooling with a fixed kernel of size L.
        sequence_length = input_ids.size(-1)
        pooled_output = self.max_pooling(cnn_out[..., :sequence_length])
        #   Dimension: (B, C, 1)

        #   Apply batch normalization
        #   This is the final sequence embedding
        sequence_embedding = self.pooling_bn(pooled_output.squeeze(-1))
        #   Dimension: (B, C)

        return sequence_embedding

In [8]:
#   Build the encoder from the notebook-level configuration constants.
#   Arguments are passed in the order of the constructor signature.
MyEncoderModel = SequenceEncoderBlock(
    MAX_SEQUENCE_LENGTH,        # max_sequence_length
    ADAPTER_NAME,               # adapter_name
    ADAPTER_CONFIG,             # adapter_config
    SEQUENCE_EMEBDDING_SIZE,    # cnn_output_channels
    CNN_WINDOW_SIZE,            # cnn_window_size
)

In [9]:
#   Short two-sentence input used to smoke-test the encoder.
sample_sequence = (
    "This is a sample sequence for testing the sequence encoder block. "
    "It contains multiple sentences."
)

In [10]:
#   There is no default padding token in the GPT-2 tokenizer, 
#   so the end-of-sequence token is reused as the padding token.
#   This must run before the tokenizer is called with padding enabled.
tokenizer.pad_token = tokenizer.eos_token

In [11]:
#   Tokenize the sample: pad / truncate to MAX_SEQUENCE_LENGTH and return PyTorch tensors.
tokenized_sequence = tokenizer(
    sample_sequence,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=MAX_SEQUENCE_LENGTH,
)

#   Move the token IDs and the attention mask to the device the model runs on.
sample_input_ids, sample_attention_mask = (
    tokenized_sequence[field].to(DEVICE)
    for field in ("input_ids", "attention_mask")
)

In [12]:
#   Move the model to the target device and switch it to evaluation mode.
MyEncoderModel.to(DEVICE).eval()

#   Encode the sample without tracking gradients.
with torch.no_grad():
    encoded_result = MyEncoderModel(
        input_ids=sample_input_ids,
        attention_mask=sample_attention_mask,
    )

In [13]:
#   Show the sequence embedding produced for the sample.
#   NOTE(review): `[:10]` slices the first (batch) axis of the (B, C) result, so with the
#   single sample above the whole row is printed. If the first ten features were
#   intended, index the last axis instead — confirm.
print(encoded_result[:10])

tensor([[1.6552, 3.4482, 5.0152,  ..., 4.9798, 1.7661, 1.3977]],
       device='cuda:0')
