<a href="https://colab.research.google.com/github/csci544projectGroup18/DebaterAI/blob/main/colab/StannceCls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## How to run the code
In Colab, go to `Runtime > Change runtime type > Hardware accelerator`, select `GPU`, and click `Save`.

In [1]:
import os

#   All project paths are resolved relative to the directory the notebook runs in.
PROJECT_ROOT_DIR = os.getcwd()
PRETRAINED_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR, "models", "pretrained")
#assert os.path.isdir(PRETRAINED_MODEL_DIR)

#   Point both Hugging Face cache variables at the same directory so that
#   downloaded pre-trained weights are stored inside the project tree.
os.environ.update({
    "HUGGINGFACE_HUB_CACHE": PRETRAINED_MODEL_DIR,
    "TRANSFORMERS_CACHE": PRETRAINED_MODEL_DIR,
})

# On Colab, uncomment the lines below to install the dependencies.
# %pip install transformers
# %pip install adapter-transformers


In [2]:
#   Output locations for the training run, both directly under the project root:
#   "results" for the run output and "logs" for the training logs.
RESULTS_DIR, LOG_DIR = (
    os.path.join(PROJECT_ROOT_DIR, leaf) for leaf in ("results", "logs")
)

#assert os.path.isdir(RESULTS_DIR) and os.path.isdir(LOG_DIR)

In [3]:
import torch
import torch.nn as nn

from transformers import GPT2Model, GPT2Tokenizer
from transformers import MAMConfig

In [4]:
#   Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(DEVICE)

cuda


In [5]:
#   Initialize the GPT-2 tokenizer from the "gpt2" checkpoint.
#   padding_side="right" asks for padding tokens to be appended after the text.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side="right")

#   There is no default padding token in the GPT-2 tokenizer,
#   so the end-of-sequence token is reused as the padding token.
#   Padded positions therefore share the EOS token and can only be told apart
#   from a real EOS through the attention mask.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
#   Hyperparameters for SequenceEncoder block.
#   Name under which the adapter is registered on each GPT-2 encoder
#   (the spelling "adpater" is part of the runtime identifier).
ADAPTER_NAME = "mam_adpater"
#   Adapter configuration, created with the library's default settings.
ADAPTER_CONFIG = MAMConfig()

#   Number of tokens every input sequence is padded/truncated to.
MAX_SEQUENCE_LENGTH = 128
#   Dimension of the sequence embedding (= number of CNN output channels).
SEQUENCE_EMEBDDING_SIZE = 1024
#   Kernel size of the 1-D convolution applied along the sequence axis.
CNN_WINDOW_SIZE = 9

In [7]:
class SequenceEncoderBlock(nn.Module):
    '''Sequence encoder block

    Encodes a batch of token sequences into one fixed-size vector per sequence:
    frozen GPT-2 (plus a trainable adapter) -> batch normalization of the first
    and the last hidden states -> concatenation -> attention masking ->
    1-D convolution along the sequence axis -> global max pooling -> batch
    normalization.

    params:
        max_sequence_length: Maximum sequence length. Kept for interface
            compatibility and stored on the instance; pooling is global over
            the sequence axis and no longer requires inputs of exactly this length.
        adapter_name: Adapter used for fine-tuning pre-trained encoder
        adapter_config: Adapter config
        cnn_output_channels: Number of output channels of the CNN(=dimension of the sequence embedding)
        cnn_window_size: Window size of the CNN
    '''
    def __init__(
            self,
            max_sequence_length,
            adapter_name,
            adapter_config,
            cnn_output_channels,
            cnn_window_size
        ):
        super().__init__()

        self.max_sequence_length = max_sequence_length

        #   Pre-trained GPT-2 model
        self.gpt2 = GPT2Model.from_pretrained("gpt2")

        #   Freeze GPT-2 pre-trained parameters.
        #   This must happen before the adapter is added so that the adapter
        #   weights created below are not frozen by this loop.
        for param in self.gpt2.parameters():
            param.requires_grad = False

        #   Add adapter to GPT-2 and make it the active one
        self.gpt2.add_adapter(adapter_name, config=adapter_config)
        self.gpt2.set_active_adapters(adapter_name)

        hidden_size = self.gpt2.config.hidden_size

        #   CNN layer. The input has 2 * H channels because the first and the
        #   last hidden states are concatenated along the feature axis.
        #   With an odd window size this padding keeps the sequence length unchanged.
        self.cnn = nn.Conv1d(
            in_channels=hidden_size * 2,
            out_channels=cnn_output_channels,
            kernel_size=cnn_window_size,
            padding=cnn_window_size // 2
        )

        #   Global max pooling over the sequence axis.
        #   A fixed MaxPool1d(kernel_size=max_sequence_length) only produces a
        #   single position when the input length equals max_sequence_length;
        #   adaptive pooling yields (B, C, 1) for every input length.
        self.max_pooling = nn.AdaptiveMaxPool1d(output_size=1)

        #   Batch normalization layers
        self.word_embedding_bn = nn.BatchNorm1d(num_features=hidden_size)
        self.encoder_bn = nn.BatchNorm1d(num_features=hidden_size)
        self.pooling_bn = nn.BatchNorm1d(cnn_output_channels)

    def forward(self, input_ids, attention_mask):
        '''Forward propagation

        params:
            input_ids: Tensor of shape (B, L) containing the input token IDs
            attention_mask: Tensor of shape (B, L) containing the attention mask

        returns:
            Tensor of shape (B, C) containing one sequence embedding per input
        '''
        #   Dimension notations:
        #   B: batch size
        #   L: sequence length
        #   H: hidden size
        #   C: number of output channels of the CNN (also the dimension of the sequence embedding)

        #   Get all hidden states from GPT-2
        outputs = self.gpt2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        #   hidden_states[0] is used as the word-embedding representation
        word_embeddings = outputs.hidden_states[0]
        #   Dimension: (B, L, H)
        encoder_hidden_states = outputs.last_hidden_state
        #   Dimension: (B, L, H)

        #   Batch normalization. BatchNorm1d normalizes over the channel axis
        #   at position 1, hence the permute to (B, H, L) and back.
        bn_word_embeddings = self.word_embedding_bn(
            word_embeddings.permute(0, 2, 1)
        ).permute(0, 2, 1)
        bn_encoder_hidden_states = self.encoder_bn(
            encoder_hidden_states.permute(0, 2, 1)
        ).permute(0, 2, 1)

        #   Concatenate word embeddings and encoder hidden states
        concat_embeddings = torch.cat((bn_word_embeddings, bn_encoder_hidden_states), dim=-1)
        #   Dimension: (B, L, H * 2)

        #   Zero out the padded positions. The mask of shape (B, L, 1)
        #   broadcasts over the feature axis, so no explicit expand is needed.
        masked_concat_embeddings = concat_embeddings * attention_mask.unsqueeze(-1)

        #   Apply CNN layer (Conv1d expects channels first)
        cnn_out = self.cnn(masked_concat_embeddings.permute(0, 2, 1))
        #   Dimension: (B, C, L)

        #   Apply global max pooling over the sequence axis
        pooled_output = self.max_pooling(cnn_out)
        #   Dimension: (B, C, 1)

        #   Apply batch normalization
        #   This is the final sequence embedding
        sequence_embedding = self.pooling_bn(pooled_output.squeeze(-1))
        #   Dimension: (B, C)

        return sequence_embedding

In [8]:
#   Hyperparameters for the combined classifier.
#   Width of the first feed-forward layer: four times the sequence embedding size.
FF_HIDDEN_SIZE = 4 * SEQUENCE_EMEBDDING_SIZE
#   Number of stance labels predicted by the classifier.
NUM_CLASSES = 3                                 #   0: Neutral, 1: Agree, 2: Disagree

In [9]:
from transformers.modeling_outputs import SequenceClassifierOutput

In [10]:
class StanceClassifier(nn.Module):
    '''Stance classifier

    Runs three independent sequence encoders (parent comment, child comment,
    context), mixes the context embedding into both comment embeddings and
    classifies the concatenated result with a feed-forward head.

    params:
        parent_encoder: Sequence encoder block for parent comments
        child_encoder: Sequence encoder block for child comments
        context_encoder: Sequence encoder block for comment context
        loss_fn: Callable applied to (logits, labels) when labels are given
        sequence_embedding_size: Dimension of the sequence embedding
        ff_hidden_size: Hidden size of the feed-forward layer
        num_classes: Number of classes
    '''
    def __init__(
            self,
            parent_encoder: SequenceEncoderBlock,
            child_encoder: SequenceEncoderBlock,
            context_encoder: SequenceEncoderBlock,
            loss_fn,
            sequence_embedding_size,
            ff_hidden_size,
            num_classes
        ):
        super().__init__()

        #   One encoder per input stream
        self.parent_encoder = parent_encoder
        self.child_encoder = child_encoder
        self.context_encoder = context_encoder

        self.loss_fn = loss_fn

        #   Feed-forward head: (S * 2) -> ff_hidden_size -> S -> num_classes
        head_layers = [
            nn.Linear(sequence_embedding_size * 2, ff_hidden_size),
            nn.ReLU(),
            nn.Linear(ff_hidden_size, sequence_embedding_size),
            nn.ReLU(),
            nn.Linear(sequence_embedding_size, num_classes),
        ]
        self.ff = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_masks, labels=None):
        '''Forward propagation

        params:
            input_ids: list tensors of shape (B, L) containing the input token IDs,
                ordered as [parent, child, context]
            attention_masks: list tensors of shape (B, L) containing the attention masks,
                in the same order as input_ids
            labels: Tensor of shape (B,) containing the labels

        returns:
            SequenceClassifierOutput holding the logits and, when labels are
            given, the loss (None otherwise)
        '''
        #   Dimension notations:
        #   B: batch size
        #   S: dimension of the sequence embedding
        #   C: number of classes

        #   Encode the three streams in the fixed order parent, child, context
        stream_encoders = (self.parent_encoder, self.child_encoder, self.context_encoder)
        parent_vec, child_vec, context_vec = [
            encoder(input_ids=input_ids[slot], attention_mask=attention_masks[slot])
            for slot, encoder in enumerate(stream_encoders)
        ]
        #   Dimension: 3 * (B, S)

        #   Add the context to each comment embedding, then place the two
        #   context-aware embeddings side by side
        combined = torch.cat(
            [parent_vec + context_vec, child_vec + context_vec],
            dim=-1
        )
        #   Dimension: (B, S * 2)

        #   Feed-forward head
        logits = self.ff(combined)

        #   The loss is only computed when labels are supplied
        loss = self.loss_fn(logits, labels) if labels is not None else None

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [11]:
#   Collate function for the combined classifier
class CustomDataCollator:
    '''Collate raw dataset items into model inputs for the combined classifier.

    params:
        tokenizer: Tokenizer applied to the parent comments, child comments and contexts
        max_length: Length every sequence is padded/truncated to.
            None (default) uses the notebook-level MAX_SEQUENCE_LENGTH,
            looked up at call time.
    '''
    #   Item keys of the three text streams, in the order the classifier expects them.
    TEXT_FIELDS = ('parent_comment', 'child_comment', 'context')

    def __init__(self, tokenizer: GPT2Tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _tokenize(self, texts):
        '''Tokenize a list of strings into fixed-length "pt" tensors.'''
        max_length = MAX_SEQUENCE_LENGTH if self.max_length is None else self.max_length
        return self.tokenizer(
            texts,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt"
        )

    def __call__(self, batch):
        '''Build one batch.

        params:
            batch: list of dicts with the keys 'parent_comment', 'child_comment',
                'context' and 'label'

        returns:
            dict with
            - "input_ids": list of 3 tensors of shape (B, L), ordered parent, child, context
            - "attention_masks": list of 3 tensors of shape (B, L), same order
            - "labels": tensor of shape (B,) with dtype torch.long
        '''
        #   Dimension notations:
        #   B: batch size
        #   L: sequence length

        #   Tokenize each text stream with identical settings
        tokenized = [
            self._tokenize([item[field] for item in batch])
            for field in self.TEXT_FIELDS
        ]

        #   Class-index targets must be integer tensors for the cross-entropy loss;
        #   forcing the dtype also covers labels that were parsed as floats.
        labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)

        input_ids = [encoded['input_ids'] for encoded in tokenized]
        attention_masks = [encoded['attention_mask'] for encoded in tokenized]

        return {"input_ids": input_ids, "attention_masks": attention_masks, "labels": labels}

In [12]:
from transformers import Trainer, TrainingArguments, EvalPrediction
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

import numpy as np
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

In [13]:
def custom_compute_metrics(eval_pred: EvalPrediction) -> dict:
    '''Compute metrics for the combined classifier

    params:
        eval_pred: EvalPrediction object whose predictions are the logits of
            shape (N, C) (or a tuple whose first element is) and whose
            label_ids are the gold labels of shape (N,)

    returns:
        dict with the weighted "precision", "recall" and "f1" and the overall
        "accuracy", each a single scalar
    '''
    #   Dimension notations:
    #   N: number of evaluated examples
    #   C: number of classes

    logits = eval_pred.predictions
    #   Some prediction loops hand over a tuple of outputs; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    #   Dimension of prediction logits: (N, C)

    #   Convert logits to predictions
    preds = np.argmax(logits, axis=1)
    #   Dimension: (N,)

    labels = eval_pred.label_ids

    #   Compute precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )

    #   Compute confusion matrix
    cm = confusion_matrix(labels, preds)

    #   Overall accuracy = correctly classified examples (the diagonal) / all examples.
    #   The diagonal must be summed first; dividing it element-wise would yield
    #   one value per class instead of a single scalar.
    accuracy = float(np.trace(cm) / cm.sum())

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    }

In [14]:
class CustomTrainer(Trainer):
    '''Trainer that feeds the three-stream batches to the stance classifier.'''

    def compute_loss(self, model: nn.Module, inputs: dict, return_outputs=False,
                     num_items_in_batch=None):
        '''Run the model on one batch and return its loss.

        params:
            model: The stance classifier
            inputs: Batch from the data collator with the keys "input_ids",
                "attention_masks" and "labels"
            return_outputs: Whether to return the model outputs along with the loss
            num_items_in_batch: Accepted for compatibility with Trainer versions
                that pass it; unused here

        returns:
            The loss, or (loss, outputs) when return_outputs is True, where
            outputs is the full model output object
        '''
        #   Dimension notations:
        #   B: batch size
        #   L: sequence length
        #   C: number of classes

        input_ids = inputs["input_ids"]
        #   Dimension: 3 * (B, L)
        attention_masks = inputs["attention_masks"]
        #   Dimension: 3 * (B, L)
        labels = inputs["labels"]
        #   Dimension: (B,)

        outputs = model(input_ids, attention_masks, labels=labels)
        loss = outputs.loss

        #   Return the whole output object, not the bare logits tensor: during
        #   evaluation the Trainer extracts the logits itself, and a plain tensor
        #   would be sliced with outputs[1:], dropping the first example of every batch.
        return (loss, outputs) if return_outputs else loss

In [48]:
from torch.utils.data.dataset import Dataset
import pandas as pd
#   Dataset class for the combined classifier

class DebaterDataset(Dataset):
    '''Parent/child comment pairs with their submission text and stance label.

    The CSV at `path` is read once and split by row order: the leading
    `train_fraction` of the rows is the training split, the rest is the
    test split. Rows are not shuffled.

    params:
        path: Path to a CSV file with the columns 'label', 'body_parent',
            'body_child' and 'submission_text'
        is_test: False selects the leading (training) rows, True the remaining rows
        train_fraction: Fraction of rows assigned to the training split (default 0.8)
    '''
    def __init__(self, path, is_test, train_fraction=0.8):
        if not 0.0 <= train_fraction <= 1.0:
            raise ValueError("train_fraction must be between 0 and 1")

        data = pd.read_csv(path)
        labels = data.loc[:, 'label'].tolist()

        #   Compute the split point once and reuse it for every column
        split_index = int(len(labels) * train_fraction)
        rows = slice(split_index, None) if is_test else slice(None, split_index)

        self.label_list = labels[rows]
        self.body_parent_list = self._text_column(data, 'body_parent')[rows]
        self.body_child_list = self._text_column(data, 'body_child')[rows]
        self.submission_text_list = self._text_column(data, 'submission_text')[rows]

    @staticmethod
    def _text_column(data, column):
        '''Return a column as a list of strings, mapping missing cells to "".'''
        #   Blank CSV cells are loaded as NaN (a float), which is not a valid
        #   text input; they are replaced by the empty string.
        return ["" if pd.isna(value) else str(value) for value in data.loc[:, column].tolist()]

    def __len__(self):
        return len(self.label_list)

    def __getitem__(self, index):
        '''
          The inidividual item returned by __getitem__ is a dictionary
          containing the following keys:
          - parent_comment: string
          - child_comment: string
          - context: string
          - label: int (0: Neutral, 1: Agree, 2: Disagree)
        '''
        return {
            'parent_comment': self.body_parent_list[index],
            'child_comment': self.body_child_list[index],
            'context': self.submission_text_list[index],
            'label': self.label_list[index],
        }

In [36]:
#   Training arguments (need to be changed based on actual performance)
#   Number of passes over the training split.
TRAINING_EPOCHS = 10
#   Per-device batch size used for both training and evaluation.
#   NOTE(review): the recorded run below ends in a CUDA out-of-memory error
#   with this per-device batch size; a smaller value may be needed.
BACTH_SIZE = 64
#   Initial learning rate handed to the AdamW optimizer.
LEARNING_RATE = 1e-5

In [37]:
def _build_sequence_encoder():
    '''Create one SequenceEncoderBlock from the notebook-level hyperparameters.'''
    return SequenceEncoderBlock(
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        adapter_name=ADAPTER_NAME,
        adapter_config=ADAPTER_CONFIG,
        cnn_output_channels=SEQUENCE_EMEBDDING_SIZE,
        cnn_window_size=CNN_WINDOW_SIZE
    )


#   Three separate encoders with identical hyperparameters, one per input
#   stream (parent comment, child comment, context).
SeqEncoder1 = _build_sequence_encoder()
SeqEncoder2 = _build_sequence_encoder()
SeqEncoder3 = _build_sequence_encoder()

#   Full stance classifier, trained with a cross-entropy loss over NUM_CLASSES labels
CLSModel = StanceClassifier(
    parent_encoder=SeqEncoder1,
    child_encoder=SeqEncoder2,
    context_encoder=SeqEncoder3,
    loss_fn=nn.CrossEntropyLoss(),
    sequence_embedding_size=SEQUENCE_EMEBDDING_SIZE,
    ff_hidden_size=FF_HIDDEN_SIZE,
    num_classes=NUM_CLASSES
)

#   Optimizer and LR scheduler may need to be changed based on actual performance.
#   The scheduler multiplies the base learning rate by 1 / (k + 1), where k is
#   the number of scheduler steps taken so far.
#   NOTE(review): if the Trainer steps the scheduler once per optimization step
#   rather than once per epoch, the learning rate decays very quickly -- confirm.
optimizer = AdamW(CLSModel.parameters(), lr=LEARNING_RATE)
lr_scheduler = LambdaLR(optimizer, lr_lambda=lambda k: 1 / (k + 1))

loading configuration file config.json from cache at /lab/xingrui/DebaterAI/colab/models/pretrained/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.

In [49]:
#   Both splits are cut from the same labelled CSV file; DebaterDataset picks
#   the training or the held-out rows depending on is_test.
#   NOTE(review): this is an absolute, machine-specific path.
_LABELED_DATA_CSV = '/lab/xingrui/DebaterAI/data/labeled_data.csv'
train_dataset = DebaterDataset(_LABELED_DATA_CSV, is_test=False)
eval_dataset = DebaterDataset(_LABELED_DATA_CSV, is_test=True)

#   Turns raw text items into padded token tensors for the three encoders
MyCollator = CustomDataCollator(tokenizer)

#   remove_unused_columns=False is passed because the batches are assembled
#   by the custom collator from raw dataset items.
training_args = TrainingArguments(
    remove_unused_columns=False,
    per_device_eval_batch_size=BACTH_SIZE,
    per_device_train_batch_size=BACTH_SIZE,
    num_train_epochs=TRAINING_EPOCHS,
    logging_dir=LOG_DIR,
    output_dir=RESULTS_DIR
)

#   Wire model, data, collator, optimizer/scheduler pair and metrics together
trainer = CustomTrainer(
    model=CLSModel,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=MyCollator,
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=custom_compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [50]:
#   Start fine-tuning with the trainer configured above.
#   NOTE(review): the recorded output of this cell ends in a CUDA
#   out-of-memory error, so the saved run did not complete.
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running training *****
  Num examples = 34315
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 2690
  Number of trainable parameters = 122558819


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_3178603/2431735171.py", line 56, in forward
    child_embeddings = self.child_encoder(
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_3178603/2814622836.py", line 62, in forward
    outputs = self.gpt2(
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/transformers/adapters/context.py", line 108, in wrapper_func
    results = f(self, *args, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 920, in forward
    outputs = block(
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 454, in forward
    feed_forward_hidden_states = self.mlp(hidden_states)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 381, in forward
    hidden_states = self.act(hidden_states)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lab/xingrui/anaconda3/envs/544/lib/python3.9/site-packages/transformers/activations.py", line 35, in forward
    return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 23.70 GiB total capacity; 18.37 GiB already allocated; 44.50 MiB free; 18.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
