In [None]:
pip install transformers datasets sentencepiece

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs

In [None]:
import os
# Set TOKENIZERS_PARALLELISM to "false" in the process environment before any
# tokenizer is created further down in the notebook.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, DatasetDict
import torch

## Load the data

In [None]:
# List the current working directory (presumably to check that the CSV files
# loaded in the next cell are present - confirm).
ls

In [None]:
# Local CSV files for each split, read with the generic `csv` loader so that
# `dataset` exposes a 'train' and a 'test' split.
csv_splits = {
    'train': "./review_references_title_abstracts_sample_train.csv",
    'test': "./review_references_title_abstracts_sample_test.csv",
}
dataset = load_dataset('csv', data_files=csv_splits)

## Defining the model

In [None]:
# Tokenizer for the DeBERTa-v3 xsmall checkpoint; it is used by the
# preprocessing function, the padding collator and the Trainer below.
MODEL_CHECKPOINT = "microsoft/deberta-v3-xsmall"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Show the loaded dataset object as the cell output (splits and columns).
dataset

In [None]:
def preprocess_function(examples):
    """Tokenize a batch and attach labels plus a normalised numeric context vector.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Column-wise batch (mapping from column name to a list of
            values). Reads ``review``, ``title_abstract`` and the 17 numeric
            reference/citation columns listed in ``context_columns``.

    Returns:
        dict: The tokenizer output for ``title_abstract`` (truncated to 512
        tokens) extended with:

        * ``labels``  - the values of the ``review`` column, unchanged;
        * ``context`` - float32 tensor of shape ``(batch, 17)`` in which every
          row is scaled to unit L2 norm.

    Notes:
        Each row is normalised on its own. Dividing by the norm of the whole
        batch matrix (as this function previously did) makes an example's
        features depend on which other examples share its ``map`` batch, so
        train, test and single-example inference end up on different scales.
    """
    context_columns = (
        'references_count',
        'references_contradicted_total',
        'references_contradicted_avg',
        'references_mentioned_total',
        'references_mentioned_total_avg',
        'references_supported_total',
        'references_supported_avg',
        'in_text_citations_to_references_total',
        'in_text_citations_to_references_total_avg',
        'citations_to_references_total',
        'citations_to_references_total_avg',
        'contradiction_percentage_avg',
        'mentioning_percentage_avg',
        'supporting_percentage_avg',
        'contradiction_to_supporting_ratio_avg',
        'contradiction_to_supporting_contradiction_ratio_avg',
        'supporting_to_supporting_contradiction_ratio_avg',
    )

    # One row per example, one column per context feature.
    context = np.column_stack(
        [examples[column] for column in context_columns]
    ).astype(np.float32)

    # Per-example L2 norm; all-zero rows keep a divisor of 1 so they stay zero
    # instead of turning into NaN.
    row_norms = np.linalg.norm(context, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    context = context / row_norms

    encoded = tokenizer(examples["title_abstract"], truncation=True, max_length=512)

    # Return the extra columns explicitly rather than relying on in-place
    # mutation of `examples` being carried over by `Dataset.map`.
    features = dict(encoded)
    features['labels'] = examples['review']
    features['context'] = torch.as_tensor(context)
    return features

# Apply the preprocessing to every split in column-wise batches; the raw CSV
# columns (taken from the train split) are removed from the mapped result.
original_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=original_columns,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the notebook's tokenizer; it is handed to the Trainer
# below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from typing import Optional, Tuple, Union

import torch
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2Model,
    ContextPooler,
    StableDropout,
    DebertaV2PreTrainedModel
)

from typing import Optional, Tuple, Union

import torch
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2Model,
    ContextPooler,
    StableDropout,
    DebertaV2PreTrainedModel
)

class DebertaV2ForSequenceClassificationWithContext(DebertaV2PreTrainedModel):
    """DeBERTa-v2/v3 sequence classifier that also consumes a numeric context vector.

    The pooled encoder output is concatenated with a per-example ``context``
    vector and fed to an MLP head (Linear -> BatchNorm1d -> ReLU, twice,
    followed by a final Linear) that produces ``num_labels`` logits.

    Config attributes read via ``getattr`` with a fallback:
        num_labels: number of output classes (fallback ``2``).
        context_size: width of the context vector (fallback ``17``).
        hidden_size: width of the MLP hidden layers (fallback: twice the
            pooler output width).
        cls_dropout: dropout probability applied to the pooled output
            (fallback ``config.hidden_dropout_prob``).
    """

    def __init__(self, config):
        """Build the encoder, the pooler, the MLP head and the dropout layer."""
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)

        self.num_labels = num_labels

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim
        context_size = getattr(config, "context_size", 17)
        # NOTE(review): DeBERTa configs presumably always define `hidden_size`
        # (the encoder width), so this fallback is likely never used and the
        # head ends up as wide as the encoder - confirm that is intended.
        hidden_size = getattr(config, "hidden_size", self.pooler.output_dim * 2)

        # NOTE(review): BatchNorm1d in training mode needs more than one
        # example per batch - watch out for a trailing batch of size 1.
        self.classifier = nn.Sequential(
            nn.Linear(output_dim + context_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, self.num_labels)
        )

        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the encoder's input embedding module."""
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Replace the encoder's input embedding module."""
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        context: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        """Encode the text, append ``context`` and classify.

        Args:
            context: Numeric context features, concatenated with the pooled
                encoder output along ``dim=1``; it must therefore be 2-D with
                one row per example. Required despite the ``None`` default,
                which only keeps the signature keyword-friendly.
            input_ids, attention_mask, token_type_ids, position_ids,
            inputs_embeds, output_attentions, output_hidden_states:
                Forwarded unchanged to the DeBERTa encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                over the logits is computed.
            return_dict: Return a ``SequenceClassifierOutput`` instead of a
                tuple; falls back to ``config.use_return_dict`` when ``None``.

        Returns:
            ``SequenceClassifierOutput`` (loss, logits, hidden_states,
            attentions), or the tuple ``(loss?, logits, *encoder_extras)``
            when ``return_dict`` is false.

        Raises:
            ValueError: If ``context`` is ``None``.
        """
        if context is None:
            # Fail fast with a clear message instead of an opaque torch.cat error.
            raise ValueError(
                "`context` is required: the classification head concatenates it "
                "with the pooled encoder output."
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)

        # Align device and dtype with the pooled features so that a float64 or
        # CPU-resident context does not break the Linear layers of the head.
        context = context.to(device=pooled_output.device, dtype=pooled_output.dtype)
        x = torch.cat([pooled_output, context], dim=1)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Instantiate the custom classifier from the pretrained checkpoint with two
# output labels.
# NOTE(review): presumably only the encoder weights exist in the checkpoint and
# the pooler / MLP head start from fresh initialisation - confirm from the
# loading warnings.
model = DebertaV2ForSequenceClassificationWithContext.from_pretrained(
    "microsoft/deberta-v3-xsmall",
    num_labels=2)

In [None]:
# Show the model object as the cell output to inspect its layers.
model

In [None]:
from datasets import load_metric

def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE "mrpc" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair.

    Returns:
        Whatever the metric's ``compute`` returns for the arg-max class
        predictions against the reference labels.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    scores, gold = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    return glue_mrpc.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold,
    )

In [None]:
# Fine-tuning hyper-parameters. The first positional argument is the output
# directory name.
# NOTE(review): `push_to_hub=True` presumably requires being logged in to the
# Hugging Face Hub, and no login cell is visible in this notebook - confirm.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    "deberta-v3-xsmall-with-biblio-context-finetuned-review_classifier",
    learning_rate=4.5e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=1000,
    evaluation_strategy="epoch",
    save_total_limit=2,
    push_to_hub=True,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so this cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Wire the model, arguments, data splits, tokenizer, collator and metric
# function into a Trainer; the 'test' split serves as the evaluation set.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Upload the trained model to the Hub with the given commit message.
# NOTE(review): `tags` is presumably forwarded to the generated model card - confirm.
trainer.push_to_hub(commit_message="Training complete", tags="text-classification")