### HuggingFace dependencies

In [1]:
from datasets import *
import datasets

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np 

### Load dataset

In [2]:
dataset = load_dataset('community-datasets/gnad10')

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9245
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1028
    })
})

### Load distilbert model

In [3]:
model_checkpoint = "allenai/longformer-base-4096"
model_output = "longformer-base-4096-gnad10"

# define label maps
id2label = {0: "Web", 1: "Panorama", 2: "International", 3: "Wirtschaft", 4: "Sport", 5: "Inland", 6: "Etat", 7: "Wissenschaft", 8: "Kultur"}
label2id = {"Web": 0, "Panorama": 1, "International": 2, "Wirtschaft": 3, "Sport": 4, "Inland": 5, "Etat": 6, "Wissenschaft": 7, "Kultur": 8}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=9, id2label=id2label, label2id=label2id, trust_remote_code=True)

# display architecture
model

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

### Preprocess data

In [4]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [5]:
# create tokenize function
def tokenize(batch):

    #tokenize and truncate text
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    print(batch['label'])
    return tokens


# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset


Map:   0%|          | 0/1028 [00:00<?, ? examples/s]

[3, 4, 0, 4, 2, 0, 4, 1, 2, 0, 0, 0, 0, 7, 2, 5, 1, 7, 0, 0, 3, 2, 3, 8, 3, 8, 0, 6, 5, 1, 2, 6, 3, 6, 0, 1, 0, 2, 3, 3, 3, 1, 3, 8, 0, 7, 4, 1, 0, 1, 5, 2, 7, 0, 1, 7, 0, 0, 1, 6, 2, 2, 0, 2, 7, 0, 3, 8, 1, 4, 5, 6, 3, 3, 5, 4, 0, 3, 0, 1, 2, 1, 1, 1, 0, 7, 4, 2, 0, 2, 7, 1, 3, 8, 1, 1, 2, 5, 7, 3, 0, 6, 6, 5, 4, 5, 5, 7, 4, 4, 4, 2, 7, 5, 5, 8, 3, 5, 3, 3, 6, 1, 8, 0, 1, 5, 5, 1, 3, 2, 7, 3, 7, 0, 1, 3, 3, 5, 1, 4, 6, 1, 7, 7, 1, 1, 8, 3, 8, 1, 1, 6, 1, 7, 1, 6, 4, 3, 0, 5, 2, 1, 6, 0, 2, 0, 2, 2, 0, 2, 1, 1, 3, 2, 3, 2, 5, 2, 0, 2, 2, 3, 2, 2, 3, 6, 4, 2, 3, 2, 1, 6, 0, 4, 5, 8, 4, 2, 0, 3, 5, 5, 1, 2, 7, 4, 2, 2, 3, 2, 3, 0, 1, 2, 3, 4, 4, 8, 0, 1, 7, 5, 1, 2, 0, 8, 1, 8, 1, 0, 8, 1, 5, 5, 5, 2, 6, 0, 1, 7, 0, 4, 8, 1, 4, 2, 6, 1, 8, 4, 5, 4, 3, 3, 8, 0, 1, 3, 0, 2, 2, 4, 5, 6, 3, 3, 1, 2, 0, 1, 4, 6, 3, 7, 2, 8, 2, 8, 4, 0, 0, 5, 5, 3, 0, 1, 3, 5, 6, 2, 8, 2, 8, 1, 0, 2, 3, 1, 0, 5, 0, 1, 6, 0, 1, 2, 2, 5, 0, 0, 4, 1, 1, 7, 2, 3, 3, 6, 1, 1, 8, 7, 3, 2, 1, 0, 6, 8, 3, 4, 0, 7, 2, 

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 9245
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1028
    })
})

In [6]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Apply untrained model to text

In [7]:
# define list of examples
text_list = ["Ex-Chef der Bundespolizei solle insbesondere seine Kontakte in den USA nutzen. Der Volltext dieses auf Agenturmeldungen basierenden Artikels steht aus rechtlichen Gründen nicht mehr zur Verfügung."]
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(id2label[predictions.tolist()])

Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Untrained model predictions:
----------------------------
Panorama


### Train model

In [8]:

peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=16,
                        lora_alpha=32,
                        lora_dropout=0.05,
                        target_modules=["query", "key", "value"])

In [9]:
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules={'query', 'key', 'value'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [10]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,482,249 || all params: 150,148,626 || trainable%: 0.9872


In [11]:
# hyperparameters
lr = 1e-3
batch_size = 1
num_epochs = 5

In [12]:
training_args = TrainingArguments(
    fp16=True,
    gradient_checkpointing=True,
    output_dir= model_output,
    learning_rate=lr,
    auto_find_batch_size=True,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [13]:
# function to show metrics for f1 and accuracy
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {
    'f1': f1,
    'accuracy': accuracy
    }

In [14]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

In [15]:
# train model
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,1.3506,1.255347,0.5525,0.565175
2,1.2343,0.964171,0.671482,0.672179
3,1.1224,1.031939,0.636683,0.642023
4,1.0623,0.862144,0.699764,0.70428
5,1.0311,0.850689,0.704506,0.706226




TrainOutput(global_step=5780, training_loss=1.190283007770261, metrics={'train_runtime': 3891.3316, 'train_samples_per_second': 11.879, 'train_steps_per_second': 1.485, 'total_flos': 1.53927313786368e+16, 'train_loss': 1.190283007770261, 'epoch': 5.0})

### Generate prediction with trained model

In [16]:
model.to('cpu') # moving to cpu for Windows (can alternatively do 'mps' for Mac)

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
International


### Save lora adapters to HuggingFace

In [17]:
from huggingface_hub import notebook_login

notebook_login() # ensure token gives write access
hf_name = 'cyrp' # your hf username or org name
model_id = hf_name + "/" + model_output # name the model whatever you want

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [18]:
model.push_to_hub(model_id) # save model

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/5.94M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cyrp/longformer-base-4096-gnad10/commit/fdc04eeff5b87640bcd281be13f523d7f5029091', commit_message='Upload model', commit_description='', oid='fdc04eeff5b87640bcd281be13f523d7f5029091', pr_url=None, repo_url=RepoUrl('https://huggingface.co/cyrp/longformer-base-4096-gnad10', endpoint='https://huggingface.co', repo_type='model', repo_id='cyrp/longformer-base-4096-gnad10'), pr_revision=None, pr_num=None)

In [19]:
trainer.push_to_hub(model_output) # save trainer

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cyrp/longformer-base-4096-gnad10/commit/485b969ef0063f21a29fd6f592c970d3944e9f07', commit_message='longformer-base-4096-gnad10', commit_description='', oid='485b969ef0063f21a29fd6f592c970d3944e9f07', pr_url=None, repo_url=RepoUrl('https://huggingface.co/cyrp/longformer-base-4096-gnad10', endpoint='https://huggingface.co', repo_type='model', repo_id='cyrp/longformer-base-4096-gnad10'), pr_revision=None, pr_num=None)