# Install Modules

In [1]:
!pip install transformers==4.28.0
!pip install datasets==2.14.6
!pip install evaluate==0.4.2
!pip install accelerate -U
!pip install keras

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed tokenizers-0.13.3 transformers-4.28.0
Collecting datasets
  Download

# Connect with Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Modules

In [2]:
import numpy as np
import pandas as pd

import torch
from datasets import ClassLabel, Dataset, Features, load_dataset, Value
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Init Global Vars

In [3]:
DATA_PATH = "drive/MyDrive/Colab Notebooks/data/"

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

# Apply Train Test Split

In [6]:
df = pd.read_csv(DATA_PATH + "dfd-motives.tsv", sep="\t")
df

Unnamed: 0,text,label
0,Abbenzeller,Herkunft
1,Abramovski,Herkunft
2,Abramowski,Herkunft
3,Abramowsky,Herkunft
4,Abramski,Herkunft
...,...,...
2795,Thümmel,Rufname
2796,Thümmes,Rufname
2797,Thün,Rufname
2798,Thünchen,Rufname


In [7]:
labels = set(df["label"])
labels

{'Beruf',
 'Herkunft',
 'Kompositionelles Motiv',
 'Rufname',
 'Rufnamenmuster',
 'Wohnstätte',
 'Übername'}

In [11]:
dataset = Dataset.from_pandas(df, features=Features({"text": Value("string"), "label": ClassLabel(names=list(labels))}))
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 2800
})

In [12]:
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2240
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 560
    })
})

In [14]:
train_df = dataset["train"].to_pandas()
train_df["label"] = train_df["label"].apply(lambda x: {i: label for i, label in enumerate(labels)}[x]) # fix labels
train_df

Unnamed: 0,text,label
0,Akman,Rufnamenmuster
1,Kutbay,Kompositionelles Motiv
2,Haager,Herkunft
3,Gurol,Herkunft
4,Sarikaya,Kompositionelles Motiv
...,...,...
2235,Yigitsoy,Kompositionelles Motiv
2236,Adenau,Herkunft
2237,Kopfnagel,Beruf
2238,Gürbüzer,Kompositionelles Motiv


In [15]:
train_df.to_csv(DATA_PATH + "dfd-motives_train.tsv", sep="\t", index=False)

In [16]:
test_df = dataset["test"].to_pandas()
test_df["label"] = test_df["label"].apply(lambda x: {i: label for i, label in enumerate(labels)}[x]) # fix labels
test_df

Unnamed: 0,text,label
0,Scheinemann,Beruf
1,Baytekin,Kompositionelles Motiv
2,Akkus,Rufnamenmuster
3,Olejarczyk,Beruf
4,Ayaydin,Kompositionelles Motiv
...,...,...
555,Celikkol,Kompositionelles Motiv
556,Kubitscheck,Rufname
557,Metag,Rufname
558,Pühlhofer,Herkunft


In [17]:
test_df.to_csv(DATA_PATH + "dfd-motives_test.tsv", sep="\t", index=False)

# Load Dataset

In [19]:
data_files = {"train": DATA_PATH + "dfd-motives_train.tsv", "test": DATA_PATH + "dfd-motives_test.tsv"}

label_names = ["Beruf", "Herkunft", "Kompositionelles Motiv", "Rufname", "Rufnamenmuster", "Wohnstätte", "Übername"]
features = Features({"text": Value("string"), "label": ClassLabel(names=label_names)})
dataset = load_dataset("csv", data_files=data_files, features=features, delimiter="\t")

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2240
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 560
    })
})

# Init Tokenizer from Base Model

In [20]:
base_model = "deepset/gbert-base"

tokenizer = AutoTokenizer.from_pretrained(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

# Fine-tune Model

In [21]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

labels = tokenized_datasets["train"].features["label"].names
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=len(labels), id2label=id2label, label2id=label2id)

Map:   0%|          | 0/2240 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly

In [22]:
model_path = "drive/MyDrive/Colab Notebooks/models/gbert-base-dfd-motives"

In [23]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir=model_path + "_trainer",
    num_train_epochs=5,
    evaluation_strategy="epoch",
    seed=0
)

trainer = Trainer(
    model=model, tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.961028,0.657143
2,1.136500,0.639135,0.805357
3,1.136500,0.706688,0.832143
4,0.366800,0.69158,0.858929


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.961028,0.657143
2,1.136500,0.639135,0.805357
3,1.136500,0.706688,0.832143
4,0.366800,0.69158,0.858929
5,0.366800,0.728584,0.864286


TrainOutput(global_step=1400, training_loss=0.5918342753819057, metrics={'train_runtime': 1148.068, 'train_samples_per_second': 9.756, 'train_steps_per_second': 1.219, 'total_flos': 2946976112640000.0, 'train_loss': 0.5918342753819057, 'epoch': 5.0})

In [25]:
trainer.save_model(model_path)

# Inspect Fine-tuned Model

In [26]:
print(model.config)
model

BertConfig {
  "_name_or_path": "deepset/gbert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Beruf",
    "1": "Herkunft",
    "2": "Kompositionelles Motiv",
    "3": "Rufname",
    "4": "Rufnamenmuster",
    "5": "Wohnst\u00e4tte",
    "6": "\u00dcbername"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Beruf": 0,
    "Herkunft": 1,
    "Kompositionelles Motiv": 2,
    "Rufname": 3,
    "Rufnamenmuster": 4,
    "Wohnst\u00e4tte": 5,
    "\u00dcbername": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version":

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,