In [1]:
import os

PROJECT_PATH = os.getcwd()
MODEL_PATH = os.path.join(PROJECT_PATH, "models", "pretrained")
os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_PATH
os.environ["TRANSFORMERS_CACHE"] = MODEL_PATH

In [2]:
import numpy as np

In [3]:
import torch
import torch.nn as nn

from transformers import GPT2Model, GPT2Tokenizer
from transformers import MAMConfig

from transformers.modeling_outputs import SequenceClassifierOutput

In [4]:
from transformers.utils import logging

logging.set_verbosity_info()
logger = logging.get_logger("transformers")
logger.info("INFO")
logger.warning("WARNING")

INFO


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2", 
    padding_side="right"
)
baseGPT2 = GPT2Model.from_pretrained("gpt2")

loading file vocab.json from cache at h:\cs544\project\models\pretrained\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\vocab.json
loading file merges.txt from cache at h:\cs544\project\models\pretrained\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at h:\cs544\project\models\pretrained\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": nul

In [6]:
RESULTS_PATH = os.path.join(PROJECT_PATH, "results")
LOG_PATH = os.path.join(PROJECT_PATH, "logs")

In [7]:
ADAPTER_NAME = "mam_adpater"
ADAPTER_CONFIG = MAMConfig()

BATCH_SIZE = 5
SEQUENCE_LENGTH = 128
SEQUENCE_EMEBDDING_SIZE = 512
CNN_WINDOW_SIZE = 5
FF_HIDDEN_SIZE = 2048
NUM_CLASSES = 3

In [8]:
class CNNGPT2(nn.Module):
    def __init__(
        self, 
        sequence_length,        #   Length of input sequence
        adapter_name,           #   Name of the adapter
        adapter_config,         #   Adapter config
        num_classes,            #   Number of classes
        cnn_output_channels,    #   Dimension of target sequence embedding
        cnn_kernel_size,        #   Window size of 1d CNN
        ff_hidden_size,         #   Hidden size of FeedForward layer
        loss_fn=None,           #   Loss function
    ):
        super(CNNGPT2, self).__init__()

        #   Pre-trained GPT-2 encoder
        self.gpt2 = GPT2Model.from_pretrained("gpt2")

        #   Freeze GPT-2 pre-trained parameters
        for param in self.gpt2.parameters():
            param.requires_grad = False

        #   Adapter on GPT-2
        self.gpt2.add_adapter(adapter_name, config=adapter_config)
        self.gpt2.set_active_adapters(adapter_name)

        #   CNN layer
        self.cnn = nn.Conv1d(
            in_channels=self.gpt2.config.hidden_size * 2,
            out_channels=cnn_output_channels,
            kernel_size=cnn_kernel_size,
            padding=np.floor(cnn_kernel_size / 2).astype(int)
        )

        #   Pooling layer (Max Pooling)
        self.max_pool = nn.MaxPool1d(kernel_size=sequence_length)

        #   FeedForward layers
        self.ff = nn.Sequential(
            nn.Linear(cnn_output_channels, ff_hidden_size),
            nn.ReLU(),
            nn.Linear(ff_hidden_size, num_classes)
        )

        #   Batch Normalization Layers
        self.word_embedding_bn = nn.BatchNorm1d(self.gpt2.config.hidden_size)
        self.encode_bn = nn.BatchNorm1d(self.gpt2.config.hidden_size)
        self.pooling_bn = nn.BatchNorm1d(cnn_output_channels)
        
        self.loss_fn = loss_fn
        if self.loss_fn is None:
            self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, batched_sequence, attention_mask, labels=None):
        #   Denote B := batch size, L := sequence length, H := hidden size, H' := cnn_output_channels

        #   Dimension of batched_sequence: (B, L)
        #   Dimension of attention_mask: (B, L)

        #   Use GPT-2 to create basic representations of sequences
        outputs = self.gpt2(
            batched_sequence, 
            attention_mask=attention_mask, 
            output_hidden_states=True
        )
        #   The outputs contains the hidden states from each layer of the GPT-2 encoder
        #   0th is the word embeddings, and the last is the last hidden state

        #   Extract word embeddings and last hidden states
        word_embeddings = outputs.hidden_states[0]
        #   Dimension of word_embeddings: (B, L, H)
        encoder_hidden_state = outputs.last_hidden_state
        #   Dimension of encoder_hidden_state: (B, L, H)

        #   Batch Normalization
        word_embeddings = self.word_embedding_bn(word_embeddings.permute(0, 2, 1)).permute(0, 2, 1)
        encoder_hidden_state = self.encode_bn(encoder_hidden_state.permute(0, 2, 1)).permute(0, 2, 1)

        #   Concatenate word embeddings and last hidden states
        concatenated_embeddings = torch.cat((word_embeddings, encoder_hidden_state), dim=-1)
        #   Dimension of concatenated_embeddings: (B, L, H*2)

        #   Apply attention mask to the concatenated sequence representation
        #   The attention mask is expanded to dimension (B, L, H*2), 
        #   and then multiplied element-wise with the concatenated sequence representation. 
        #   This is zeros out the values at the padded positions.
        concatenated_embeddings = concatenated_embeddings * \
            attention_mask.unsqueeze(-1).expand(concatenated_embeddings.shape)

        #   Apply CNN
        cnn_out = self.cnn(concatenated_embeddings.permute(0, 2, 1))
        #   Dimension of cnn_out: (B, H', L)

        # Pooling
        pooled = self.max_pool(cnn_out)
        #   Dimension of pooled: (B, H', 1)

        #   Batch Normalization on pooled sequence representation
        #   This is the final sequence embedding
        pooled = self.pooling_bn(pooled.squeeze(-1))
        #   Dimension of pooled: (B, H')

        #   Apply FeedForward
        logits = self.ff(pooled)
        #   Dimension of logits: (B, num_classes)

        #   Compute loss
        loss = None
        if labels is not None:
            #   The loss function is CrossEntropyLoss or otherwise specified at initialization
            assert self.loss_fn is not None

            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)


In [9]:
from transformers import Trainer, TrainingArguments, PreTrainedTokenizer

In [10]:
class CustomDataCollator:
    def __init__(self, tokenizer: PreTrainedTokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        #   Denote B := batch size, L := sequence length

        #   Extract sequences from batch
        sequences = [item["seq"] for item in batch]
        #   Dimension of sequences: (B, L)

        #   Extract labels from batch
        labels = [item["label"] for item in batch]
        #   Dimension of labels: (B,)

        #   Tokenize sequences
        tokenized_sequences = self.tokenizer(
            sequences, 
            padding="max_length", 
            max_length=SEQUENCE_LENGTH,
            truncation=True, 
            return_tensors="pt"
        )
        #   Dimension of tokenized_sequences["input_ids"]: (B, L)
        #   Dimension of tokenized_sequences["attention_mask"]: (B, L)

        #   Convert labels to tensor
        labels = torch.tensor(labels)

        return {"batched_sequence": tokenized_sequences, "labels": labels}

In [11]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        #   Denote B := batch size, L := sequence length

        #   Extract batched sequence and attention mask
        batched_sequence = inputs["batched_sequence"]["input_ids"]
        attention_mask = inputs["batched_sequence"]["attention_mask"]
        #   Dimension of batched_sequence: (B, L)
        #   Dimension of attention_mask: (B, L)

        #   Extract labels
        labels = inputs["labels"]
        #   Dimension of labels: (B,)

        #   Apply model to batch of inputs
        outputs = model(batched_sequence, attention_mask)
        logits = outputs.logits
        #   Dimension of logits: (B, num_classes)

        #   Compute loss with Cross Entropy Loss
        loss = nn.CrossEntropyLoss()(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [12]:
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
    def __init__(self, items: list):
        self.items = items

    def __getitem__(self, index):
        return {
            "seq": self.items[index]["seq"], 
            "label": self.items[index]["label"]
        }

    def __len__(self):
        return len(self.items)

In [13]:
train_samples = [
    {"seq": "Hello world!", "label": 0},
    {"seq": "This is a random sequence", "label": 1},
    {"seq": "Just another example", "label": 2},
    {"seq": "Sample number four", "label": 0},
    {"seq": "Trying different things", "label": 1},
    {"seq": "Have a great day!", "label": 2},
    {"seq": "Keep up the good work", "label": 0},
    {"seq": "Today is a sunny day", "label": 1},
    {"seq": "Learning new things", "label": 2},
    {"seq": "Stay positive and motivated", "label": 0},
]

eval_samples = [
    {"seq": "Evaluate this sample", "label": 1},
    {"seq": "Checking for errors", "label": 2},
    {"seq": "A simple test case", "label": 0},
    {"seq": "This is interesting", "label": 1},
    {"seq": "Randomly selected label", "label": 2},
    {"seq": "Keep moving forward", "label": 0},
    {"seq": "One more example", "label": 1},
    {"seq": "Happy to help", "label": 2},
    {"seq": "Let's see how it works", "label": 0},
    {"seq": "The final evaluation sample", "label": 1},
]

TrainSet = CustomDataset(train_samples)
EvalSet = CustomDataset(eval_samples)

In [14]:
MyCollator = CustomDataCollator(tokenizer)

In [15]:
MyCollator.tokenizer.pad_token = tokenizer.eos_token

In [16]:
custom_model = CNNGPT2(
    sequence_length=SEQUENCE_LENGTH,
    adapter_name=ADAPTER_NAME,
    adapter_config=ADAPTER_CONFIG,
    num_classes=NUM_CLASSES,
    cnn_output_channels=SEQUENCE_EMEBDDING_SIZE,
    cnn_kernel_size=CNN_WINDOW_SIZE,
    ff_hidden_size=FF_HIDDEN_SIZE
)

training_args = TrainingArguments(
    output_dir=RESULTS_PATH,
    num_train_epochs=5,
    logging_dir=LOG_PATH,
    logging_first_step=True,
    logging_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    remove_unused_columns=False
)

trainer = CustomTrainer(
    model=custom_model,
    args=training_args,
    data_collator=MyCollator,
    train_dataset=TrainSet,
    eval_dataset=EvalSet
)

loading configuration file config.json from cache at h:\cs544\project\models\pretrained\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.26.1",
  "us

In [17]:
train_loader = trainer.get_train_dataloader()
eval_loader = trainer.get_eval_dataloader()

In [18]:
train_result = trainer.train()
eval_result = trainer.evaluate()

***** Running training *****
  Num examples = 10
  Num Epochs = 5
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 27487523


  0%|          | 0/10 [00:00<?, ?it/s]

{'loss': 1.2098, 'learning_rate': 4.5e-05, 'epoch': 0.5}
{'loss': 1.1961, 'learning_rate': 4e-05, 'epoch': 1.0}
{'loss': 0.8524, 'learning_rate': 3e-05, 'epoch': 2.0}
{'loss': 0.6093, 'learning_rate': 2e-05, 'epoch': 3.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 10
  Batch size = 5


{'loss': 0.519, 'learning_rate': 1e-05, 'epoch': 4.0}
{'loss': 0.4656, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 2.8687, 'train_samples_per_second': 17.43, 'train_steps_per_second': 3.486, 'train_loss': 0.7298534274101257, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
eval_result

{'eval_loss': 1.093057632446289,
 'eval_runtime': 0.078,
 'eval_samples_per_second': 128.178,
 'eval_steps_per_second': 25.636,
 'epoch': 5.0}

In [20]:
inference_sample = "This is a sample for inference"

inference_sample_tokenized = tokenizer(
    inference_sample,
    padding="max_length",
    max_length=SEQUENCE_LENGTH,
    truncation=True,
    return_tensors="pt"
)

inference_result = custom_model(
    inference_sample_tokenized["input_ids"].to("cuda:0"), 
    inference_sample_tokenized["attention_mask"].to("cuda:0")
)

In [21]:
print(inference_result.logits)
print(inference_result.logits.cpu().argmax())

tensor([[ 0.0383,  0.0343, -0.1080]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor(0)
