<a href="https://colab.research.google.com/github/danielsaggau/IR_LDC/blob/main/ecthr_b_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# classification scotus 
!pip install datasets transformers 
from datasets import load_dataset
dataset = load_dataset("lex_glue", "ecthr_b")

In [None]:
from huggingface_hub import notebook_login
notebook_login()
#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

In [3]:
train_dataset = dataset['train']

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
    Trainer
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/sbert_longformer_ecthr_b_simcse/train_simcse-_root_.local_share_jupyter_runtime_kernel-702ca39d-5545-4b00-b884-05176c135763.json-2022-10-21_08-44-17/1500",use_auth_token=True, num_labels=10)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/sbert_longformer_ecthr_b_simcse/train_simcse-_root_.local_share_jupyter_runtime_kernel-702ca39d-5545-4b00-b884-05176c135763.json-2022-10-21_08-44-17/1500", use_auth_token=True)

In [8]:
import numpy as np
import torch as nn

In [9]:
    def preprocess_function(examples):
        # Tokenize the texts
        cases = []
        padding = "max_length"
        max_seq_length=4096
        for case in examples['text']:
            cases.append(f' {tokenizer.sep_token} '.join([fact for fact in case]))
        batch = tokenizer(cases, padding=padding, max_length=4096, truncation=True)
        # use global attention on CLS token
        global_attention_mask = np.zeros((len(cases),max_seq_length), dtype=np.int32)
        global_attention_mask[:, 0] = 1
        batch['global_attention_mask'] = list(global_attention_mask)
        
        batch["labels"] = [[1 if label in labels else 0 for label in label_list] for labels in examples["labels"]]

        return batch

In [10]:
  label_list = list(range(10))
  num_labels = len(label_list)

In [None]:
train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on train dataset")

In [None]:
train_dataset[1:10]

In [13]:
test_dataset = dataset['test']

In [None]:
test_dataset = test_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on train dataset",
                remove_columns=['text'])

In [15]:
eval_dataset = dataset['validation']

In [None]:
eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on train dataset",
                remove_columns=['text'])

In [17]:
  def compute_metrics(p: EvalPrediction):
        # Fix gold labels
        y_true = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_true[:, :-1] = p.label_ids
        y_true[:, -1] = (np.sum(p.label_ids, axis=1) == 0).astype('int32')
        # Fix predictions
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = (expit(logits) > 0.5).astype('int32')
        y_pred = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_pred[:, :-1] = preds
        y_pred[:, -1] = (np.sum(preds, axis=1) == 0).astype('int32')
        # Compute scores
        macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

In [22]:
def compute_metrics(eval_pred):
    metric1 = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    micro1 = metric1.compute(predictions=predictions, references=labels, average="micro")["f1"]
    macro1 = metric1.compute(predictions=predictions, references=labels, average="macro")["f1"]
    return { "f1-micro": micro1, "f1-macro": macro1}

In [18]:
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

In [19]:
training_args = TrainingArguments(
    output_dir="/slbert_ecthr_b_classsification",
    learning_rate=3e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    #push_to_hub=True,
    metric_for_best_model="f1-micro",
    greater_is_better=True,
    load_best_model_at_end = True
)

In [24]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [26]:
from torch import cuda
cuda.empty_cache()

In [37]:
cuda.max_memory_allocated()

14679349248

In [22]:
%env 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'

env: 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'


In [27]:
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    eval_dataset=eval_dataset,
    train_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)])
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 167


RuntimeError: ignored

In [34]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1000
  Batch size = 6


ValueError: ignored

In [35]:
labels

NameError: ignored

In [21]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [27]:
cases = []
for case in dataset['train']['text']:
    cases.append(f' {tokenizer.sep_token} '.join([fact for fact in case]))

In [29]:
cases[10]

'7.  On 11 February 1980, the applicant pleaded guilty to manslaughter on the ground of diminished responsibility in respect of the death of his 62‑year-old landlady whom he had battered with an axe. He was sentenced to life imprisonment. The court acted on medical evidence that the applicant had a gross personality disorder to such a degree that he was amoral. The consultant psychiatrist said in his report at the trial that:\n“... although [the applicant’s] instability might get less over the years as he matured, should he be sent to prison, his eventual release should be approached with great caution.” [SEP] 8.  The applicant was initially sent to a Category A prison due to concerns about his dangerousness and risk of escape. He was involved in work to address his offending and other problems. He successfully completed an Anger Management and Skills Courts in the Hull Special Unit in or about 1989 and his conduct was noted as significantly improving from that point. In 1993, he was t