<a href="https://colab.research.google.com/github/danielsaggau/IR_LDC/blob/main/hyperparameter_tuning_classification_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook runs hyperparameter tuning for a sequence-classification model on the LexGLUE SCOTUS dataset, using the Hugging Face `Trainer`.

# Set up Dataset

In [None]:
#load packages
!pip install transformers

In [None]:
#load data 
!pip install datasets
from datasets import load_dataset
dataset=load_dataset("lex_glue","scotus")
train_dataset=dataset['train']
train_dataset = train_dataset.shard(index=1, num_shards=5)

In [4]:
# Keep only shard 1 of 4 of the training split (roughly a quarter of the examples).
train_dataset = dataset["train"].shard(num_shards=4, index=1)

In [22]:
# The validation split is used unsharded for evaluation.
eval_dataset = dataset["validation"]

In [6]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# The tokenizer checkpoint used below is read from this mount.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Padding strategy handed to the tokenizer by `preprocess_function`.
padding = "max_length"

# Fast tokenizer restored from a checkpoint directory on the mounted Drive.
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/bregman_scotus_k10_ep10",
    use_fast=True,
)

def preprocess_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: mapping with a ``"text"`` entry holding the raw documents.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation enabled and the module-level ``padding`` strategy.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding=padding, truncation=True)
    return encoded

# Tokenize the training shard, processing the examples in batches.
tokenized_data = train_dataset.map(preprocess_function, batched=True, desc="tokenizing the entire dataset")

tokenizing the entire dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [27]:
# Tokenize the validation split with the same preprocessing as the training data.
tokenized_data_eval = eval_dataset.map(preprocess_function, batched=True, desc="tokenizing the entire dataset")

tokenizing the entire dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

# Set up Trainer 

load model


Compute Metric Function


In [8]:
def compute_metrics(eval_pred):
  """Compute micro- and macro-averaged F1 for one evaluation pass.

  Args:
    eval_pred: a pair ``(logits, labels)``. The predicted class of each row is
      the arg-max of ``logits`` over the last axis.

  Returns:
    dict with the keys ``"f1-micro"`` and ``"f1-macro"``.
  """
  # Load the F1 metric on the first call only and reuse it afterwards: this
  # function runs at every evaluation, so reloading it each time is wasted work.
  metric1 = getattr(compute_metrics, "_f1_metric", None)
  if metric1 is None:
    metric1 = load_metric("f1")
    compute_metrics._f1_metric = metric1
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  # Same metric object, two averaging modes; key order is micro, then macro.
  return {
      f"f1-{average}": metric1.compute(predictions=predictions, references=labels, average=average)["f1"]
      for average in ("micro", "macro")
  }

In [31]:
from datasets import load_metric

Training Arguments

In [11]:
from transformers import TrainingArguments

# Baseline training configuration. `hp_space` samples learning_rate,
# weight_decay, per_device_train_batch_size and num_train_epochs during the
# search, so the values given here for those are the defaults outside it.
training_args = TrainingArguments(
    output_dir='scotus_max_linear',
    # Optimisation settings.
    learning_rate=1e-3,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    num_train_epochs=10,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    fp16=True,
    # Evaluate and checkpoint once per epoch, tracking the best micro-F1 checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1-micro",
    greater_is_better=True,
    # Optional integrations, currently disabled:
    #   push_to_hub=True,
    #   report_to="wandb",
    #   run_name="max",
)

In [12]:
from transformers import DataCollatorWithPadding

# Batches are padded to a multiple of 8 tokens (chosen for fp16 training).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

In [14]:
def model_init():
    """Build a fresh 14-label classifier with its ``longformer.*`` weights frozen.

    Returns:
        The model loaded from ``danielsaggau/bregman_1.5``. Every parameter
        whose name starts with ``"longformer."`` has ``requires_grad`` set to
        ``False``; all other parameters are left untouched.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        'danielsaggau/bregman_1.5',
        num_labels=14,
    )
    # Freeze by name prefix: only parameters outside "longformer." stay trainable.
    frozen = (
        weights
        for param_name, weights in classifier.named_parameters()
        if param_name.startswith("longformer.")
    )
    for weights in frozen:
        weights.requires_grad = False
    return classifier

In [None]:
from transformers import EarlyStoppingCallback, Trainer

# A `model_init` callable is passed rather than a model instance, so the
# hyperparameter search can build a new model for each trial.
# Early stopping is configured with a patience of 5.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_data,
    eval_dataset=tokenized_data_eval,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

# Define hyperparameter space 

In [17]:
def hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: object exposing ``suggest_float`` and ``suggest_categorical``
            (the Optuna trial API).

    Returns:
        dict with the sampled ``learning_rate`` (1e-5 to 1e-3, ``log=True``),
        ``weight_decay`` (0.01 to 0.05), ``per_device_train_batch_size`` and
        ``num_train_epochs`` (both picked from fixed candidate lists).
    """
    sampled = {}
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    sampled["weight_decay"] = trial.suggest_float("weight_decay", 0.01, 0.05)
    sampled["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [2, 3, 4, 6, 8]
    )
    sampled["num_train_epochs"] = trial.suggest_categorical(
        "num_train_epochs", [15, 20, 30, 40]
    )
    return sampled

In [None]:
#hide_output
#!pip install optuna
import numpy as np
best_run = trainer.hyperparameter_search(
    n_trials=10, direction="maximize", hp_space=hp_space)

[32m[I 2022-12-15 13:17:36,054][0m A new study created in memory with name: no-name-80faa769-27c8-4abb-bfa8-cd2f9fb9c7fe[0m
Trial: {'learning_rate': 1.870284979831095e-05, 'weight_decay': 0.027584572880697003, 'per_device_train_batch_size': 8, 'num_train_epochs': 40}
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--danielsaggau--bregman_1.5/snapshots/363ae19253237bb845fc9861c93ac6033414e92d/config.json
Model config LongformerConfig {
  "_name_or_path": "danielsaggau/bregman_1.5",
  "architectures": [
    "LongformerModel"
  ],
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    128,
    128,
    128,
    128,
    128,
    128
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "cls_token_id": 101,
  "eos_token_id": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
   

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro
1,No log,2.103071,0.407143,0.11002
2,No log,1.927267,0.445,0.138437
3,No log,1.826636,0.498571,0.16985
4,1.979700,1.745524,0.516429,0.185264
5,1.979700,1.678888,0.531429,0.212027
6,1.979700,1.621415,0.548571,0.234717
7,1.570400,1.575529,0.555,0.24471
8,1.570400,1.536002,0.567857,0.261417
9,1.570400,1.504657,0.572857,0.270107
10,1.364900,1.474862,0.574286,0.282363


Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on C

Epoch,Training Loss,Validation Loss


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...