# 00 - Setup

In [1]:
# Google Colab variant of the installation, kept for reference only (not executed):
#%%capture
#!pip install datasets transformers seqeval[gpu]
#!pip install wandb -q

# Outside Colab: run pip through the interpreter of the running kernel
# (sys.executable) so the packages are installed into the kernel's environment.
# NOTE(review): no versions are pinned here -- consider pinning for reproducibility.

import sys
!{sys.executable} -m pip install datasets transformers seqeval wandb -q

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import datasets
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer, BertTokenizerFast, BertConfig, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
import os
import wandb

In [None]:
# Log the Weights & Biases CLI in; it prompts for an API key and stores it in the netrc file.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Label vocabulary. The position of a tag in the list is its numeric id:
# id 0 is the outside tag 'O'; afterwards every entity type contributes its
# B- tag (odd id) immediately followed by its I- tag (even id).
id_to_label = dict(enumerate([
    'O',
    'B-EP_POL', 'I-EP_POL',
    'B-EP_WIRT', 'I-EP_WIRT',
    'B-EP_FINANZ', 'I-EP_FINANZ',
    'B-EP_MEDIA', 'I-EP_MEDIA',
    'B-EP_SCI', 'I-EP_SCI',
    'B-EP_REL', 'I-EP_REL',
    'B-EP_KULT', 'I-EP_KULT',
    'B-EP_MIL', 'I-EP_MIL',
    'B-EP_NGO', 'I-EP_NGO',
    'B-EP_MOV', 'I-EP_MOV',
    'B-EP_OWN', 'I-EP_OWN',
    'B-EO_POL', 'I-EO_POL',
    'B-EO_WIRT', 'I-EO_WIRT',
    'B-EO_FINANZ', 'I-EO_FINANZ',
    'B-EO_MEDIA', 'I-EO_MEDIA',
    'B-EO_SCI', 'I-EO_SCI',
    'B-EO_REL', 'I-EO_REL',
    'B-EO_KULT', 'I-EO_KULT',
    'B-EO_MIL', 'I-EO_MIL',
    'B-EO_NGO', 'I-EO_NGO',
    'B-EO_MOV', 'I-EO_MOV',
    'B-P_NAT', 'I-P_NAT',
    'B-P_ETH', 'I-P_ETH',
    'B-P_FUNC', 'I-P_FUNC',
    'B-P_AGE', 'I-P_AGE',
    'B-P_SOZ', 'I-P_SOZ',
    'B-P_GEN', 'I-P_GEN',
    'B-GPE', 'I-GPE',
]))

# Reverse lookup: tag string -> numeric id.
label_to_id = {tag: idx for idx, tag in id_to_label.items()}

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds (may be fractional); it is rounded to
            the nearest whole second before formatting.

    Returns:
        The string form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# 00 - Define Paths

In [4]:
# Automatically labeled training sentences and the matching per-label entity counts.
train_path = 'data/training/training_data/training_data_sentences.csv'
label_count_path = 'data/training/training_data/label_count_training_data.csv'

# Gold data (minimal annotations) and its label counts.
gold_data_path = 'data/gold/processed/min_annotations_gold_num.csv'
label_count_path_gold = 'data/gold/processed/count_labels_gold_data.csv'

# Lexicon file and its directory (not referenced by the cells visible in this section).
lex_data_path = 'data/lexicon/lex_data.csv'
lex_directory = 'data/lexicon/'

# 01 - Load Data, compute Class Distribution
Load:
- Automatically labeled data
- Gold data (with minimal annotations)
- Entity count from automatically labeled data
- Label count from gold data


In [None]:
# Load the automatically labeled training data.
train_data = pd.read_csv(train_path)

In [None]:
# Load the gold data (minimal annotations).
train_data_gold = pd.read_csv(gold_data_path)

In [None]:
# Number of non-null entries per column of the automatically labeled data.
train_data.count()

sentence_id    167661
token          167661
label          167661
dtype: int64

In [None]:
# Number of non-null entries per column of the gold data.
train_data_gold.count()

sentence_id    4796
token          4796
label          4796
dtype: int64

In [None]:
# Entity counts of the automatically labeled data: rename the unnamed '0'
# column to 'count', total the counts per label and order ascending by count.
label_count = (
    pd.read_csv(label_count_path)
    .rename(columns={'0': 'count'})
    .groupby(['label'])
    .sum()
    .sort_values(by=['count'])
)
# Mirror the label (now the index) into a regular column 'l'.
label_count['l'] = label_count.index
label_count

In [None]:
# Label counts of the gold data: total the counts per label and order
# ascending by count.
label_count_gold = (
    pd.read_csv(label_count_path_gold)
    .groupby(['label'])
    .sum()
    .sort_values(by=['count'])
)
# Mirror the label (now the index) into a regular column 'l'.
label_count_gold['l'] = label_count_gold.index
label_count_gold

Unnamed: 0_level_0,count,l
label,Unnamed: 1_level_1,Unnamed: 2_level_1
35,0,35
11,1,11
33,3,33
17,5,17
7,6,7
15,6,15
5,9,5
41,12,41
3,14,3
19,15,19


# 02 - Create Representative Labels for each Sample
- Assign each sample a representative label (for both automatically generated training data and gold data)

In [None]:
import ast
import random

def label_sent(row, priority_labels=(19, 35, 11, 13, 3, 27), seed=42):
    """Pick one representative label id for a sentence.

    Args:
        row: the sentence's label ids, either as the string representation of
            a Python list (as stored in the CSV, e.g. ``"[0, 23, 24]"``) or as
            an already parsed sequence of ints.
        priority_labels: label ids that win over everything else, checked in
            the given order (presumably the rarest classes, so that they stay
            visible when resampling by representative label -- confirm).
        seed: seed of the private random generator used for the fallback pick.

    Returns:
        ``0`` when the sentence contains no label other than 0; otherwise the
        first id of ``priority_labels`` present in the sentence; otherwise one
        of the sentence's distinct non-zero labels chosen with a freshly
        seeded generator.  Because the generator is re-seeded on every call,
        the chosen position depends only on how many distinct labels the
        sentence has.
    """
    labels = ast.literal_eval(row) if isinstance(row, str) else row

    # Distinct non-zero labels in order of first appearance.
    present = []
    for label in labels:
        if label != 0 and label not in present:
            present.append(label)

    if not present:
        return 0

    for candidate in priority_labels:
        if candidate in present:
            return candidate

    # A private generator yields the same pick as seeding the global one, but
    # leaves the global `random` state untouched for the rest of the notebook.
    rng = random.Random(seed)
    return present[rng.randint(0, len(present) - 1)]

# Derive the representative label of every sentence from its 'label' column,
# for both the automatically labeled data and the gold data.
train_data['repr_label'] = train_data['label'].apply(label_sent)
train_data_gold['repr_label'] = train_data_gold['label'].apply(label_sent)


In [None]:
# Number of sentences per representative label (automatically labeled data).
count_repr = train_data['repr_label'].value_counts()
count_repr

0     117144
55     17608
53     15312
1       4951
49      3415
23      3371
43      1079
29       957
41       857
45       565
47       471
51       317
17       259
24       151
7        134
25       127
2        126
37       119
31       113
39       105
5         81
15        79
9         77
33        31
18        30
30        23
6         19
8         15
27        14
3         14
40        14
42        12
10        11
56        11
13        10
11         8
35         7
16         6
32         5
26         5
19         3
50         2
38         2
34         1
Name: repr_label, dtype: int64

In [None]:
# Number of sentences per representative label (gold data).
count_repr_gold = train_data_gold['repr_label'].value_counts()
count_repr_gold

0     1335
23     748
21     662
1      628
47     391
55     316
49     160
51     107
53     101
43      79
27      69
45      53
37      37
25      27
9       15
39      14
13      13
3       12
19      10
7        5
29       5
31       4
15       2
41       2
11       1
Name: repr_label, dtype: int64

# 03 - Resample - Automatically Labeled Data
- Remove samples with no entities
- Downsample majority classes

In [None]:
# Keep only sentences whose representative label is not 0 (i.e. with entities).
train_data = train_data[train_data.repr_label != 0]

# Majority classes 55 and 53: keep a reproducible 20% sample of each.
df_55 = train_data[train_data.repr_label == 55].sample(frac = 0.2, random_state = 42)
df_53 = train_data[train_data.repr_label == 53].sample(frac = 0.2, random_state = 42)

# All other classes stay untouched; add the samples back and shuffle the result.
train_data = train_data[~train_data.repr_label.isin([55, 53])]
train_data = pd.concat([train_data, df_55, df_53])
train_data = train_data.sample(frac = 1, random_state = 42).reset_index(drop=True)
print(train_data)

       sentence_id                                              token  \
0            21032  ['Wissen', 'Sie', ':', 'Von', 'Linken', 'und',...   
1            97695  ['Meine', 'Damen', 'und', 'Herren', ',', 'ich'...   
2           152460  ['Ihre', 'Missinterpretation', 'dieses', 'ganz...   
3           156972  ['Das', 'Wort', 'hat', 'der', 'Kollege', 'Oliv...   
4           115508  ['Natürlich', 'ist', 'es', 'sinnvoll', ',', 'd...   
...            ...                                                ...   
24176       126393  ['Verbraucherrechte', 'allein', 'aber', 'nütze...   
24177        52016  ['Es', 'hat', 'hier', 'ja', 'sehr', 'vielversp...   
24178         9122  ['Dagegen', 'Alexander', 'Dobrindt', '–', 'ich...   
24179       150051  ['Ekin', 'Deligöz', 'hat', 'gerade', 'gefragt'...   
24180         9222  ['Sie', 'sagen', ':', 'Bis', 'zum', '31', '.',...   

                                                   label  repr_label  
0      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55

In [None]:
# Sentence counts after resampling, restricted to odd representative labels
# (in id_to_label the odd ids are the B- tags).
count_repr = train_data[train_data['repr_label'] % 2 != 0]['repr_label'].value_counts()
count_repr

1     4951
55    3522
49    3415
23    3371
53    3062
43    1079
29     957
41     857
45     565
47     471
51     317
17     259
7      134
25     127
37     119
31     113
39     105
5       81
15      79
9       77
33      31
3       14
27      14
13      10
11       8
35       7
19       3
Name: repr_label, dtype: int64

# 04 - Train/Dev/Test Split - Automatically Labeled Data

In [None]:
# 60/20/20 split (train / validation / test) of a reproducibly shuffled copy.
# Positional slicing is used instead of np.split, which is deprecated for
# DataFrames in recent pandas versions; the resulting frames are the same.
shuffled_data = train_data.sample(frac = 1, random_state = 42)
train_end = int(.6*len(train_data))
validate_end = int(.8*len(train_data))

train_split = shuffled_data.iloc[:train_end]
validate_split = shuffled_data.iloc[train_end:validate_end]
test_split = shuffled_data.iloc[validate_end:]

print(len(train_data))
print(len(train_split))
print(len(validate_split))
print(len(test_split))

train_split = train_split.reset_index(drop=True)
validate_split = validate_split.reset_index(drop=True)
# test split not used, as model will be tested with gold data
train_split

24181
14508
4836
4837


Unnamed: 0,sentence_id,token,label,repr_label
0,136851,"['Die', 'nächste', 'Rednerin', 'ist', 'die', '...","[0, 0, 0, 0, 0, 0, 0, 23, 24, 24, 0, 1, 2, 0]",23
1,132640,"['Herr', 'Präsident', '!', 'Meine', 'Damen', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",29
2,94605,"['Außerdem', 'ändert', 'sich', 'im', 'Hinblick...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55
3,131179,"['Danke', 'sehr', '.', '–', 'Kai', 'Gehring', ...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0]",1
4,105554,"['Herr', 'Präsident', '!', 'Meine', 'sehr', 'v...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
...,...,...,...,...
14503,92510,"['Ich', 'will', 'auch', 'sagen', ':', 'Der', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6
14504,103723,"['Frau', 'Kollegin', 'Konrad', ',', 'herzliche...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",23
14505,152670,"['Für', 'Bündnis', '90/Die', 'Grünen', 'hat', ...","[0, 23, 24, 24, 0, 0, 0, 0, 0, 1, 2, 0]",23
14506,87553,"['Zu', 'den', 'Kosten', 'der', 'Unterkunft', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",51


# 05 - Resample - Gold Data
- Remove samples with no entities

In [None]:
# Discard gold sentences whose representative label is 0 (no entities).
has_entity_gold = train_data_gold.repr_label != 0
train_data_gold = train_data_gold[has_entity_gold]
train_data_gold

Unnamed: 0,sentence_id,token,label,repr_label
0,0,"['Sehr', 'geehrter', 'Herr', 'Präsident', '!',...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 0,...",1
1,1,"['Es', 'gibt', 'nach', 'der', 'derzeitigen', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55,...",55
2,2,"['Obwohl', 'Schengen', 'und', 'Dublin', 'von',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9
4,4,"['Der', 'Sachverständige', 'Gerald', 'Knaus', ...","[0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0]",9
6,6,"['Nun', 'wird', 'laut', 'Masterplan', 'fest', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 0, 23, 0...",47
...,...,...,...,...
4788,4788,"['Es', 'wäre', 'sehr', 'schön', ',', 'wenn', '...","[0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0,...",51
4789,4789,"['Wir', 'profitieren', 'von', 'Arbeitnehmerinn...","[0, 0, 0, 47, 0, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0...",47
4790,4790,"['Diese', 'Menschen', 'arbeiten', 'in', 'der',...","[0, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",47
4793,4793,"['Und', 'Sie', 'von', 'der', 'AfD', 'wollen', ...","[0, 23, 0, 0, 0, 0, 0, 47, 0, 47, 0, 0, 0, 0, ...",23


# 06 - Train/Dev/Test Split - Gold Data

In [None]:
# first, create test split - identical with test split for baseline model
test_split_gold = train_data_gold.sample(frac = 0.2, random_state = 42)
print(test_split_gold)
# Remove exactly the held-out rows by their index labels.  (Detecting them via
# concat + drop_duplicates(keep=False) would also delete any rows that merely
# happen to be repeated in the data.)
train_data_gold = train_data_gold.drop(test_split_gold.index)

# create train/dev splits: first 75% of the remaining rows (in their current
# order) for training, the rest for validation
train_end_gold = int(.75*len(train_data_gold))
train_split_gold = train_data_gold.iloc[:train_end_gold]
validate_split_gold = train_data_gold.iloc[train_end_gold:]
print(len(train_split_gold))
print(len(validate_split_gold))
print(len(test_split_gold))

train_split_gold = train_split_gold.reset_index(drop=True)
validate_split_gold = validate_split_gold.reset_index(drop=True)
test_split_gold = test_split_gold.reset_index(drop=True)
train_split_gold

      sentence_id                                              token  \
3405         3405  ['Getrennt', 'davon', 'ist', 'das', 'Thema', '...   
559           559  ['Kein', 'Tarifvertrag', ',', 'meine', 'Damen'...   
4073         4073     ['Sehr', 'geehrter', 'Herr', 'Präsident', '!']   
2417         2417  ['Ich', 'möchte', 'Ihnen', ',', 'liebe', 'Frau...   
4375         4375  ['Das', 'ist', 'besonders', 'für', 'die', 'Kin...   
...           ...                                                ...   
999           999  ['Ja', ',', 'auch', 'wir', 'als', 'Union', 'tr...   
1565         1565  ['Ich', 'weiß', ',', 'dass', 'Sie', 'zumindest...   
1190         1190  ['Ich', 'kann', 'es', 'nur', 'wiederholen', '–...   
4661         4661  ['Nehmen', 'Sie', 'doch', 'einmal', 'folgendes...   
805           805  ['Wir', 'sind', 'schwer', 'getroffen', '.', 'K...   

                                                  label  repr_label  
3405  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...          

Unnamed: 0,sentence_id,token,label,repr_label
0,1,"['Es', 'gibt', 'nach', 'der', 'derzeitigen', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55,...",55
1,2,"['Obwohl', 'Schengen', 'und', 'Dublin', 'von',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9
2,4,"['Der', 'Sachverständige', 'Gerald', 'Knaus', ...","[0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0]",9
3,6,"['Nun', 'wird', 'laut', 'Masterplan', 'fest', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 0, 23, 0...",47
4,8,"['Nun', 'sollen', 'dank', 'der', 'künftigen', ...","[0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 0, 0, 0, 0, 0,...",47
...,...,...,...,...
2071,3395,"['Denn', 'was', 'hilft', 'uns', 'eigentlich', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 0,...",51
2072,3397,"['Frau', 'Kollegin', 'Bauer', ',', 'gestatten'...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1
2073,3401,"['Sehr', 'geehrter', 'Herr', 'Präsident', '!',...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 21,...",47
2074,3402,"['–', 'Ja', ',', 'ich', 'glaube', ',', 'da', '...","[0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21...",55


# 07 - Define Parameters, Load Tokenizer

In [None]:
# Sequence length every sentence is padded/truncated to by the tokenizer.
MAX_LEN = 340
# Batch sizes of the training, validation and test DataLoaders.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
# Number of epochs for the first training stage (reassigned before the gold-data stage).
EPOCHS = 2
# Learning rate handed to the optimizer.
LEARNING_RATE = 2e-05
# Maximum gradient norm used for gradient clipping in the training loop.
MAX_GRAD_NORM = 10

# Fast tokenizer of the German uncased BERT checkpoint that is fine-tuned below.
tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-german-uncased")

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

# 08 - Define Dataset Class, Create Train-/Dev-Set and DataLoaders for Automatically Labeled Data
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) and [this Notebook by Niels Rogge](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=DWgnNJrYW2GP)

In [None]:
import ast
class dataset(Dataset):
    """Token-classification dataset over a DataFrame of pre-split sentences.

    The DataFrame must provide a ``token`` column holding the string
    representation of a list of words and a ``label`` column holding the
    string representation of a list of integer label ids, one id per word.

    Args:
        dataframe: the sentences to serve.
        tokenizer: callable tokenizer; it is invoked with
            ``is_split_into_words=True`` and ``return_offsets_mapping=True``
            and must return a mapping that contains ``"offset_mapping"``.
        max_len: length every encoded sentence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Encode the sentence at position ``index``.

        Returns:
            dict of tensors: every entry produced by the tokenizer plus
            ``labels``, which carries the word's label id at the first word
            piece of each word and ``-100`` everywhere else.

        Raises:
            ValueError: if the sentence has a different number of words and
                labels.
        """
        # step 1: get the sentence and word labels.  Rows are addressed by
        # position so that DataFrames whose index was not reset work as well.
        raw_tokens = self.data['token'].iloc[index]
        try:
            # Proper parsing keeps tokens containing quotes or commas intact.
            sentence = [str(word) for word in ast.literal_eval(raw_tokens)]
        except (ValueError, SyntaxError):
            # Fall back to the plain string split for rows that are not a
            # valid Python list literal.
            sentence = raw_tokens.strip('][').replace('\'', '').split(', ')
        labels = ast.literal_eval(self.data['label'].iloc[index])

        if len(sentence) != len(labels):
            raise ValueError('ERROR -> different lengths: ' + str(sentence))

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # start from an array of -100 with one entry per encoded token
        encoded_labels = np.full(len(encoding["offset_mapping"]), -100, dtype=int)

        # A token whose offset starts at 0 and is non-empty is the first piece
        # of a word; such tokens receive the labels one after another.
        word_pos = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[word_pos]
                word_pos += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of sentences in the dataset."""
        return self.len

In [None]:
# Training dataset over the automatically labeled train split.
training_set = dataset(train_split, tokenizer, MAX_LEN)

In [None]:
# Validation dataset over the automatically labeled validation split.
validation_set = dataset(validate_split, tokenizer, MAX_LEN)

In [None]:
# Keyword arguments for the DataLoader of each split: the batch size comes
# from the matching constant, every loader shuffles and loads its batches in
# the main process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=TEST_BATCH_SIZE, shuffle=True, num_workers=0)

# Loaders for the first training stage (automatically labeled data).
training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)

# 09 - Load BERT Model

In [None]:
# Load the pretrained German BERT checkpoint with a token-classification head
# that has one output per entry of label_to_id, then move it to the selected device.
model = BertForTokenClassification.from_pretrained("dbmdz/bert-base-german-uncased", num_labels = len(label_to_id))
model.to(device)

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

# 10 - Define Optimizer and Scheduler
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)

In [None]:
# AdamW over all model parameters.
# NOTE(review): this is the AdamW imported from `transformers`; newer releases
# deprecate it in favour of torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = 1e-8 
)



In [None]:
# One scheduler step is taken per training batch, so the schedule spans
# (number of batches per epoch) * (number of epochs) steps.
total_steps = len(training_loader) * EPOCHS

# Linear schedule without warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

# 11 - Define Training/Validation Loop
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) and [this Notebook by Niels Rogge](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=DWgnNJrYW2GP)

In [None]:
import random
def train(epochs):
    """Fine-tune the notebook's model and validate it after every epoch.

    The function works on notebook-level state rather than on arguments:
    ``model``, ``optimizer``, ``scheduler``, ``training_loader``,
    ``validation_loader``, ``device``, ``MAX_GRAD_NORM`` and ``id_to_label``
    must be defined, and a wandb run must be active because running losses
    and the learning rate are logged on every step.

    Args:
        epochs: number of passes over ``training_loader``.

    Returns:
        tuple ``(df_stats, v_labels, v_predictions)``:
        ``df_stats`` is a DataFrame indexed by epoch with the training loss,
        validation loss, validation accuracy and both timings;
        ``v_labels`` / ``v_predictions`` are the gold and predicted tag
        strings of all active (label != -100) validation tokens of the LAST
        epoch, as flat lists.
    """
    # Seed every random generator involved so that runs are repeatable.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []
    # Defined up front so the function also returns cleanly for epochs == 0.
    va_labels, va_preds = [], []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    for epoch_i in range(epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        tr_epoch_loss = 0
        nb_tr_steps = 0

        # Measure how long the training epoch takes.
        t0 = time.time()

        # put model in training mode
        model.train()

        for idx, batch in enumerate(training_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            # Clear gradients left over from the previous step.
            model.zero_grad()
            optimizer.zero_grad()

            # With labels supplied, the first output is the loss.
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]

            tr_epoch_loss += loss.item()
            nb_tr_steps += 1

            # Running mean of the loss over the batches seen so far this epoch.
            loss_step = tr_epoch_loss/nb_tr_steps
            wandb.log({"Training Loss / Step": loss_step,
                       "Learning Rate / Step": scheduler.get_last_lr()[0]})

            if idx % 100==0 and idx != 0:
                print(f"Training loss per 100 training batches: {loss_step}")
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(training_loader), elapsed))

            # backward pass
            loss.backward()

            # Gradient clipping has to happen after the backward pass (when
            # the gradients exist) and before the optimizer consumes them.
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(), max_norm=MAX_GRAD_NORM
            )

            optimizer.step()
            scheduler.step()

        tr_epoch_loss = tr_epoch_loss / nb_tr_steps
        print(f"Training loss epoch: {tr_epoch_loss}")
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables (the label/prediction lists restart every epoch)
        va_loss, va_accuracy = 0, 0
        nb_va_steps = 0
        va_labels, va_preds = [], []

        for idx, batch in enumerate(validation_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            # No gradients are needed for evaluation.
            with torch.no_grad():
                outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            va_logits = outputs[1]
            va_loss += loss.item()

            nb_va_steps += 1
            loss_step = va_loss/nb_va_steps
            wandb.log({"Validation Loss / Step": loss_step})

            if idx % 100==0 and idx != 0:
                print(f"Validation loss per 100 validation batches: {loss_step}")
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(validation_loader), elapsed))

            # compute validation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = va_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

            # only score positions that carry a real label (-100 marks
            # padding, special tokens and non-first word pieces)
            active_positions = flattened_targets != -100

            batch_labels = torch.masked_select(flattened_targets, active_positions)
            batch_predictions = torch.masked_select(flattened_predictions, active_positions)

            # Store plain ints instead of one tensor per token.
            va_labels.extend(batch_labels.tolist())
            va_preds.extend(batch_predictions.tolist())

            tmp_va_accuracy = accuracy_score(batch_labels.cpu().numpy(), batch_predictions.cpu().numpy())
            va_accuracy += tmp_va_accuracy

        # Both figures are means over the validation batches.
        va_loss = va_loss / nb_va_steps
        va_accuracy = va_accuracy / nb_va_steps
        print(f"Validation loss epoch: {va_loss}")
        print(f"Validation accuracy epoch: {va_accuracy}")
        # Measure how long this epoch took.
        validation_time = format_time(time.time() - t0)
        print("Validation epoch took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': tr_epoch_loss,
                'Valid. Loss': va_loss,
                'Valid. Acc.': va_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            })

    # Translate the numeric ids of the last epoch's validation pass into tags.
    v_labels = [id_to_label[label_id] for label_id in va_labels]
    v_predictions = [id_to_label[label_id] for label_id in va_preds]

    # Display floats with four decimal places ('display.precision' is the
    # full option key; the bare 'precision' is rejected by newer pandas).
    pd.set_option('display.precision', 4)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index (there is no such column if no epoch ran).
    if not df_stats.empty:
        df_stats = df_stats.set_index('epoch')

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return df_stats, v_labels, v_predictions

# 12 - Train Model on Automatically Labeled Data

In [None]:
# Close a wandb run that may still be open, start a new one and let wandb
# track the model, then run the first training stage on the automatically
# labeled data.
wandb.finish()
wandb.init(settings=wandb.Settings(start_method="thread"))
wandb.watch(model, log="all")
df_stats, v_labels, v_predictions = train(EPOCHS)

[34m[1mwandb[0m: Currently logged in as: [33mdonatomonti[0m (use `wandb login --relogin` to force relogin)



Training...
Training loss per 100 training batches: 0.473595063137536
 Batch   100  of    907.    Elapsed: 0:02:55.
Training loss per 100 training batches: 0.291160709803822
 Batch   200  of    907.    Elapsed: 0:05:49.
Training loss per 100 training batches: 0.21492839696201374
 Batch   300  of    907.    Elapsed: 0:08:44.
Training loss per 100 training batches: 0.17389955374173022
 Batch   400  of    907.    Elapsed: 0:11:38.
Training loss per 100 training batches: 0.1473106249605497
 Batch   500  of    907.    Elapsed: 0:14:33.
Training loss per 100 training batches: 0.12844947787054417
 Batch   600  of    907.    Elapsed: 0:17:27.
Training loss per 100 training batches: 0.1146302574377948
 Batch   700  of    907.    Elapsed: 0:20:22.
Training loss per 100 training batches: 0.10410655980299549
 Batch   800  of    907.    Elapsed: 0:23:16.
Training loss per 100 training batches: 0.09542948739410902
 Batch   900  of    907.    Elapsed: 0:26:10.
Training loss epoch: 0.0949899467784406

In [None]:
# Wrap the flat tag lists in an outer list, i.e. hand all validation tokens
# over as one single sequence.
l = [v_labels]
p = [v_predictions]

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
# Entity-level report on the validation predictions (strict matching, IOB2 tagging scheme).
print(classification_report(l, p, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

   EO_FINANZ       0.00      0.00      0.00         6
     EO_KULT       0.00      0.00      0.00         2
    EO_MEDIA       0.91      0.87      0.89       296
      EO_MIL       1.00      0.59      0.74        27
      EO_MOV       0.95      0.95      0.95       221
      EO_NGO       0.75      0.07      0.13        43
      EO_POL       0.96      0.97      0.97      1024
      EO_REL       0.00      0.00      0.00         8
      EO_SCI       0.78      0.40      0.53        35
     EO_WIRT       0.72      0.60      0.65        30
   EP_FINANZ       0.94      0.50      0.65        34
     EP_KULT       0.00      0.00      0.00         2
    EP_MEDIA       1.00      0.75      0.86        36
      EP_MIL       0.00      0.00      0.00        18
      EP_MOV       0.00      0.00      0.00         1
      EP_NGO       0.92      0.54      0.68        65
      EP_POL       0.82      0.98      0.90      1143
      EP_REL       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


# 13 - Create DataLoaders, Parameters and Optimizer/Scheduler for Gold Data Training

In [None]:
# Second stage: rebind the dataset names to the gold splits.  This overwrites
# training_set / validation_set of the first stage.
training_set = dataset(train_split_gold, tokenizer, MAX_LEN)
validation_set = dataset(validate_split_gold, tokenizer, MAX_LEN)
test_set = dataset(test_split_gold, tokenizer, MAX_LEN)
# Epoch count for the gold-data stage (replaces the earlier value of EPOCHS).
EPOCHS = 4

In [None]:
# Loaders over the gold splits; train() reads training_loader and
# validation_loader from the notebook namespace, so rebinding them here
# switches the data it trains on.
training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(test_set, **test_params)

In [None]:
# Fresh optimizer for the gold-data stage; the model itself is the one already
# trained in the first stage.
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
)

# One scheduler step per training batch over all gold-stage epochs.
total_steps = len(training_loader) * EPOCHS

# Linear schedule without warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



# 14 - Train Model on Gold Data

In [None]:
# Second training stage: continue training the same model on the gold data.
df_stats_2, v_labels_2, v_predictions_2 = train(EPOCHS)


Training...
Training loss per 100 training batches: 0.14584506801006816
 Batch   100  of    130.    Elapsed: 0:02:55.
Training loss epoch: 0.13195032778267676
  Training epoch took: 0:03:47

Running Validation...
Validation loss epoch: 0.07864970074627887
Validation accuracy epoch: 0.9759774788602376
Validation epoch took: 0:00:29

Training...
Training loss per 100 training batches: 0.06288897622339797
 Batch   100  of    130.    Elapsed: 0:02:56.
Training loss epoch: 0.06292687686016926
  Training epoch took: 0:03:48

Running Validation...
Validation loss epoch: 0.06363530440086668
Validation accuracy epoch: 0.9812204237427481
Validation epoch took: 0:00:29

Training...
Training loss per 100 training batches: 0.050689334123589024
 Batch   100  of    130.    Elapsed: 0:02:57.
Training loss epoch: 0.04980058502405882
  Training epoch took: 0:03:49

Running Validation...
Validation loss epoch: 0.06065453936091878
Validation accuracy epoch: 0.9832656058146593
Validation epoch took: 0:00:

In [None]:
# Wrap the flat tag lists of the gold validation pass in an outer list, i.e.
# hand all tokens over as one single sequence.
l = [v_labels_2]
p = [v_predictions_2]

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

# Entity-level report on the validation data, evaluated in strict mode under the IOB2 scheme.
print(
    classification_report(
        y_true=l,
        y_pred=p,
        mode='strict',
        scheme=IOB2,
    )
)

              precision    recall  f1-score   support

   EO_FINANZ       0.25      1.00      0.40         1
    EO_MEDIA       0.50      0.33      0.40         3
      EO_MIL       0.33      1.00      0.50         1
      EO_NGO       0.00      0.00      0.00         7
      EO_POL       0.86      0.91      0.88       340
      EO_SCI       1.00      0.50      0.67         2
     EO_WIRT       1.00      1.00      1.00         2
   EP_FINANZ       0.00      0.00      0.00         2
     EP_KULT       0.00      0.00      0.00         2
    EP_MEDIA       0.00      0.00      0.00         1
      EP_MIL       0.00      0.00      0.00         1
      EP_MOV       0.00      0.00      0.00         1
      EP_NGO       0.00      0.00      0.00         1
      EP_OWN       0.98      1.00      0.99       309
      EP_POL       0.86      0.86      0.86       232
      EP_SCI       0.00      0.00      0.00         1
         GPE       0.93      0.93      0.93       206
       P_AGE       0.81    

  _warn_prf(average, modifier, msg_start, len(result))


# 15 - Evaluate Model on Test Set

In [None]:
def test(model, testing_loader):
    """Evaluate `model` on `testing_loader` and collect gold and predicted tags.

    The model is switched to evaluation mode and run without gradient tracking.
    For every batch the loss is accumulated and the token-level accuracy is
    computed over the positions whose label is not -100. The mean loss and the
    mean of the per-batch accuracies are printed at the end.

    Args:
        model: token-classification model; called with `input_ids`,
            `attention_mask` and `labels`, and expected to return the loss at
            index 0 and the logits at index 1. Must expose `num_labels`.
        testing_loader: iterable of batches, each a mapping with the keys
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        (labels, predictions): two flat lists of tag names (looked up in
        `id_to_label`), covering all non-ignored positions of all batches.

    Raises:
        ValueError: if `testing_loader` yields no batches.
    """
    # put model in evaluation mode
    model.eval()

    test_loss, test_accuracy = 0, 0
    nb_test_steps = 0
    test_preds, test_labels = [], []

    t0 = time.time()

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            test_logits = outputs[1]
            test_loss += loss.item()

            nb_test_steps += 1

            # progress report every 100 batches (skipping the very first one)
            if idx % 100==0 and idx != 0:
                loss_step = test_loss/nb_test_steps
                print(f"Test loss per 100 evaluation steps: {loss_step}")
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(testing_loader), elapsed))

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # flattened to one dimension
            active_logits = test_logits.view(-1, model.num_labels) # one row of num_labels scores per position
            flattened_predictions = torch.argmax(active_logits, dim=1) # best label id per position

            # only score positions with a real label (-100 marks ignored positions)
            active_mask = flattened_targets != -100 # flattened boolean mask

            # Move each batch to the host once, instead of keeping 0-d tensors and
            # calling .item() on every single element afterwards.
            batch_labels = torch.masked_select(flattened_targets, active_mask).cpu().numpy()
            batch_predictions = torch.masked_select(flattened_predictions, active_mask).cpu().numpy()

            test_labels.extend(batch_labels.tolist())
            test_preds.extend(batch_predictions.tolist())

            test_accuracy += accuracy_score(batch_labels, batch_predictions)

    if nb_test_steps == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError("testing_loader yielded no batches; nothing to evaluate")

    # translate label ids back to tag names
    labels = [id_to_label[label_id] for label_id in test_labels]
    predictions = [id_to_label[pred_id] for pred_id in test_preds]

    test_loss = test_loss / nb_test_steps
    test_accuracy = test_accuracy / nb_test_steps
    print(f"Test Loss: {test_loss}")
    print(f"Test Accuracy: {test_accuracy}")

    return labels, predictions

In [None]:
# Run the fine-tuned model over the test DataLoader. test() prints the test loss and
# accuracy and returns the gold and predicted tag names as two flat lists.
labels, predictions = test(model, testing_loader)

Test Loss: 0.06911808108402924
Test Accuracy: 0.9816091296424356


In [None]:
# Wrap the test-set labels and predictions in an outer list each, so that each is
# passed to the report below as one single sequence.
l, p = [labels], [predictions]

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

# Entity-level report on the test data, evaluated in strict mode under the IOB2 scheme.
print(
    classification_report(
        y_true=l,
        y_pred=p,
        mode='strict',
        scheme=IOB2,
    )
)

              precision    recall  f1-score   support

   EO_FINANZ       0.67      0.73      0.70        30
    EO_MEDIA       0.75      0.60      0.67         5
      EO_MIL       0.79      0.75      0.77        36
      EO_MOV       0.00      0.00      0.00         4
      EO_NGO       0.00      0.00      0.00        10
      EO_POL       0.83      0.88      0.85       499
      EO_REL       0.00      0.00      0.00         2
      EO_SCI       1.00      1.00      1.00         1
     EO_WIRT       0.68      0.72      0.70        18
   EP_FINANZ       0.00      0.00      0.00         2
     EP_KULT       0.00      0.00      0.00        10
      EP_MIL       0.00      0.00      0.00         1
      EP_MOV       0.00      0.00      0.00         5
      EP_NGO       0.00      0.00      0.00         1
      EP_OWN       0.98      1.00      0.99       350
      EP_POL       0.78      0.77      0.78       329
      EP_SCI       0.00      0.00      0.00         9
     EP_WIRT       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


# 16 - Save Fine-Tuned Model

In [None]:
import os

# Relative path, resolved against the current working directory. The previous
# value '/trained-bert-combined' pointed at the filesystem root, which is often
# not writable and does not match the relative 'trained-bert-combined' path the
# model is reloaded from afterwards.
directory = 'trained-bert-combined'

# exist_ok=True makes the cell safely re-runnable without a separate existence check
os.makedirs(directory, exist_ok=True)

# save the tokenizer: vocabulary plus its configuration files, so that reloading
# restores the same tokenizer settings and not just the vocabulary
tokenizer.save_pretrained(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')


All files saved


In [None]:
# Reload the fine-tuned model and tokenizer from disk.
# NOTE(review): 'trained-bert-combined' is a relative path; make sure it resolves
# to the directory the files were actually saved to.
model = BertForTokenClassification.from_pretrained(
    'trained-bert-combined',
    num_labels=len(label_to_id),
)
# move the reloaded weights to the selected device
model.to(device)

tokenizer = BertTokenizerFast.from_pretrained('trained-bert-combined')
