In [1]:
pip install transformers datasets sentencepiece protobuf==3.19.0 scipy sklearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, DatasetDict
import torch

Constants

In [5]:
ls

'Getting Started.ipynb'
 [0m[01;34mSPECTER-with-biblio-context-finetuned-review_classifier[0m/
 [01;34mdeberta-v3-xsmall-with-biblio-context-finetuned-review_classifier[0m/
 [01;34mdeberta-v3-xsmall-with-biblio-context-frozenlm-finetuned-review_classifier[0m/
 finetune_DeBERTa_with_context.ipynb
 finetune_DeBERTa_with_context_frozen.ipynb
 finetune_SPECTER_with_context.ipynb
 [01;34mimages[0m/
 review_references_title_abstracts_sample_test.csv
 review_references_title_abstracts_sample_train.csv


In [6]:
dataset = load_dataset(
    'csv', 
    data_files={
        'train': "./review_references_title_abstracts_sample_train.csv",
        'test': "./review_references_title_abstracts_sample_test.csv"
    },
)

Using custom data configuration default-7e78073ed70db084
Reusing dataset csv (/home/studio-lab-user/.cache/huggingface/datasets/csv/default-7e78073ed70db084/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/2 [00:00<?, ?it/s]

## Defining the model

In [7]:
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract', 'doi', 'references_count', 'references_contradicted_total', 'references_contradicted_avg', 'references_mentioned_total', 'references_mentioned_total_avg', 'references_supported_total', 'references_supported_avg', 'in_text_citations_to_references_total', 'in_text_citations_to_references_total_avg', 'citations_to_references_total', 'citations_to_references_total_avg', 'contradiction_percentage_avg', 'mentioning_percentage_avg', 'supporting_percentage_avg', 'contradiction_to_supporting_ratio_avg', 'contradiction_to_supporting_contradiction_ratio_avg', 'supporting_to_supporting_contradiction_ratio_avg', 'title_abstract', 'review_in_title', 'review_in_abstract', 'review'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['title', 'abstract', 'doi', 'references_count', 'references_contradicted_total', 'references_contradicted_avg', 'references_mentioned_total', 'references_mentioned_total_avg', 'r

In [9]:
def preprocess_function(examples):
    examples['labels'] = examples['review']
    examples['context'] = np.column_stack((
        examples['references_count'],
        examples['references_contradicted_total'],
        examples['references_contradicted_avg'],
        examples['references_mentioned_total'],
        examples['references_mentioned_total_avg'],
        examples['references_supported_total'],
        examples['references_supported_avg'],
        examples['in_text_citations_to_references_total'],
        examples['in_text_citations_to_references_total_avg'],
        examples['citations_to_references_total'],
        examples['citations_to_references_total_avg'],
        examples['contradiction_percentage_avg'],
        examples['mentioning_percentage_avg'],
        examples['supporting_percentage_avg'],
        examples['contradiction_to_supporting_ratio_avg'],
        examples['contradiction_to_supporting_contradiction_ratio_avg'],
        examples['supporting_to_supporting_contradiction_ratio_avg']
    ))
    norm = np.linalg.norm(examples['context'])
    examples['context'] = torch.Tensor(examples['context']/norm)
    return tokenizer(examples["title_abstract"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    preprocess_function, 
    batched=True,
    remove_columns=dataset['train'].column_names
  )



  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/80 [00:00<?, ?ba/s]

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
from typing import Optional, Tuple, Union

import torch
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import (
    BertModel,
    BertPreTrainedModel
)

class BertForSequenceClassificationWithContextEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        
        self.num_labels = num_labels

        self.bert = BertModel(config)
        classifier_dropout = (
                config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        context_size = getattr(config, "context_size", 17)
        hidden_size = getattr(config, "hidden_size", config.hidden_size)

        self.encoder = nn.Sequential(
            nn.Linear(context_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size)
        )
        self.classifier = nn.Linear(hidden_size * 2, self.num_labels)
        
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        context: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        ctx_emb = self.encoder(context)
        x = torch.cat([pooled_output, ctx_emb], dim=1)
        logits = self.classifier(x)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = BertForSequenceClassificationWithContextEncoder.from_pretrained(
    "allenai/specter",
    num_labels=2)

Some weights of BertForSequenceClassificationWithContextEncoder were not initialized from the model checkpoint at allenai/specter and are newly initialized: ['classifier.weight', 'encoder.6.bias', 'encoder.0.weight', 'encoder.0.bias', 'encoder.4.running_mean', 'encoder.4.bias', 'encoder.4.weight', 'encoder.1.bias', 'encoder.1.running_mean', 'encoder.6.weight', 'encoder.3.weight', 'encoder.1.num_batches_tracked', 'encoder.1.weight', 'encoder.4.num_batches_tracked', 'encoder.4.running_var', 'encoder.1.running_var', 'encoder.3.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# model.deberta.requires_grad_(False) # freeze deberta
# model.deberta.eval()

In [13]:
from datasets import load_metric

def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    recall = load_metric("recall")
    precision = load_metric("precision")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return {
        **metric.compute(predictions=predictions, references=labels),
        **recall.compute(predictions=predictions, references=labels),
        **precision.compute(predictions=predictions, references=labels)
    }

In [14]:
training_args = TrainingArguments(
    "SPECTER-with-biblio-context-finetuned-review_classifier",
    learning_rate=4.5e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=1000,
    evaluation_strategy="epoch",
    save_total_limit=2,
    push_to_hub=True,
    fp16=True # switch off if not using GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/home/studio-lab-user/sagemaker-studiolab-notebooks/SPECTER-with-biblio-context-finetuned-review_classifier is already a clone of https://huggingface.co/domenicrosati/SPECTER-with-biblio-context-finetuned-review_classifier. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 80000
  Num Epochs = 2
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 13334


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1,0.1956,0.180473,0.9514,0.725656,0.685966,0.770222


Saving model checkpoint to SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-500
Configuration saved in SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-500/config.json
Model weights saved in SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-500/pytorch_model.bin
tokenizer config file saved in SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-500/tokenizer_config.json
Special tokens file saved in SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-500/special_tokens_map.json
tokenizer config file saved in SPECTER-with-biblio-context-finetuned-review_classifier/tokenizer_config.json
Special tokens file saved in SPECTER-with-biblio-context-finetuned-review_classifier/special_tokens_map.json
Deleting older checkpoint [SPECTER-with-biblio-context-finetuned-review_classifier/checkpoint-12500] due to args.save_total_limit
Saving model checkpoint to SPECTER-with-biblio-context-finetuned-review_classifier/check

In [None]:
trainer.push_to_hub(commit_message="Training complete", tags="text-classification")