# 00 - Setup

In [None]:
#%%capture
#!pip install datasets transformers seqeval[gpu]
#!pip install wandb -q

# previous code used within google colab, following code for usage in different environment

import sys
!{sys.executable} -m pip install datasets transformers seqeval wandb -q

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
import datasets
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import  BertTokenizerFast,  BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
import os
import wandb

In [None]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# check whether GPU is available
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [None]:
# map id to label and vice versa
id_to_label = {0: 'O',
 1: 'B-EP_POL',
 2: 'I-EP_POL',
 3: 'B-EP_WIRT',
 4: 'I-EP_WIRT',
 5: 'B-EP_FINANZ',
 6: 'I-EP_FINANZ',
 7: 'B-EP_MEDIA',
 8: 'I-EP_MEDIA',
 9: 'B-EP_SCI',
 10: 'I-EP_SCI',
 11: 'B-EP_REL',
 12: 'I-EP_REL',
 13: 'B-EP_KULT',
 14: 'I-EP_KULT',
 15: 'B-EP_MIL',
 16: 'I-EP_MIL',
 17: 'B-EP_NGO',
 18: 'I-EP_NGO',
 19: 'B-EP_MOV',
 20: 'I-EP_MOV',
 21: 'B-EP_OWN',
 22: 'I-EP_OWN',
 23: 'B-EO_POL',
 24: 'I-EO_POL',
 25: 'B-EO_WIRT',
 26: 'I-EO_WIRT',
 27: 'B-EO_FINANZ',
 28: 'I-EO_FINANZ',
 29: 'B-EO_MEDIA',
 30: 'I-EO_MEDIA',
 31: 'B-EO_SCI',
 32: 'I-EO_SCI',
 33: 'B-EO_REL',
 34: 'I-EO_REL',
 35: 'B-EO_KULT',
 36: 'I-EO_KULT',
 37: 'B-EO_MIL',
 38: 'I-EO_MIL',
 39: 'B-EO_NGO',
 40: 'I-EO_NGO',
 41: 'B-EO_MOV',
 42: 'I-EO_MOV',
 43: 'B-P_NAT',
 44: 'I-P_NAT',
 45: 'B-P_ETH',
 46: 'I-P_ETH',
 47: 'B-P_FUNC',
 48: 'I-P_FUNC',
 49: 'B-P_AGE',
 50: 'I-P_AGE',
 51: 'B-P_SOZ',
 52: 'I-P_SOZ',
 53: 'B-P_GEN',
 54: 'I-P_GEN',
 55: 'B-GPE',
 56: 'I-GPE'}

label_to_id = {v: k for k, v in id_to_label.items()}

In [None]:
import time
import datetime

def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# 00 - Define Paths

In [17]:
# path to training data
train_path = 'data/training/training_data/training_data_sentences.csv'

# path to file with entity counts
label_count_path = 'data/training/training_data/label_count_training_data.csv'

# path to lexicon data file
lex_data_path = 'data/lexicon/lex_data.csv'

# directory for lexicon data
lex_directory = 'data/lexicon/'

# directory containing gold data
path_gold_data = 'data/gold/'

# 01 - Load Training Data and Entity Count, compute Class Distribution (based only on B-labels)

In [None]:
train_data = pd.read_csv(train_path)

In [None]:
train_data.count()

sentence_id    167661
token          167661
label          167661
dtype: int64

In [None]:
label_count = pd.read_csv(label_count_path)

# group entities together per class
label_count = label_count.rename(columns={'0': 'count'})
label_count = label_count.groupby(['label']).sum()
label_count = label_count.sort_values(by=['count'])
label_count['l'] = label_count.index

# only B-labels are relevant
label_count = label_count[label_count['l'] % 2 != 0]
label_count

Unnamed: 0_level_0,count,l
label,Unnamed: 1_level_1,Unnamed: 2_level_1
19.0,4,19.0
35.0,8,35.0
11.0,8,11.0
13.0,11,13.0
3.0,15,3.0
27.0,17,27.0
33.0,55,33.0
15.0,111,15.0
9.0,112,9.0
5.0,141,5.0


In [None]:
label_count.sum()

count    81281.0
l          763.0
dtype: float64

# 02 - Create Representative Label for each Sample

In [None]:
import ast
import random

def label_sent(row):
  labels = ast.literal_eval(row)
  l = []
  for label in labels:
    if label != 0:
      if label not in l:
        l.append(label)

  if len(l) == 0:
    return 0
  elif 19 in l:
    return 19
  elif 35 in l:
    return 35
  elif 11 in l:
    return 11
  elif 13 in l:
    return 13
  elif 3 in l:
    return 3
  elif 27 in l:
    return 27
  else:
    random.seed(42)
    return l[random.randint(0, len(l)-1)]

train_data['repr_label'] = train_data['label'].apply(label_sent)
train_data


Unnamed: 0,sentence_id,token,label,repr_label
0,0,"['Guten', 'Morgen', ',', 'liebe', 'Kolleginnen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
1,1,"['Meine', 'sehr', 'verehrten', 'Damen', 'und',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,2,"['§', '1', 'Absatz', '2', 'der', 'Geschäftsord...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,3,"['Die', 'Fraktion', 'der', 'AfD', 'widersprich...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,4,"['Enthaltungen', '?', '–', 'Der', 'Antrag', 'i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
...,...,...,...,...
167656,167656,"['Aber', 'Ihre', 'Vorschläge', 'sind', 'weder'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
167657,167657,"['Bevor', 'ich', 'zu', 'den', 'Herausforderung...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
167658,167658,"['Auch', 'wenn', 'bei', 'diesem', 'Fall', 'vie...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
167659,167659,"['Und', ':', 'Er', 'ist', 'in', 'Haft', ',', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [None]:
count_repr = train_data['repr_label'].value_counts()

count_repr

0     117144
55     17608
53     15312
1       4951
49      3415
23      3371
43      1079
29       957
41       857
45       565
47       471
51       317
17       259
24       151
7        134
25       127
2        126
37       119
31       113
39       105
5         81
15        79
9         77
33        31
18        30
30        23
6         19
8         15
27        14
3         14
40        14
42        12
10        11
56        11
13        10
11         8
35         7
16         6
32         5
26         5
19         3
50         2
38         2
34         1
Name: repr_label, dtype: int64

# 03 - Resample

In [None]:
# remove samples not containing any entities
train_data = train_data.drop(train_data[train_data.repr_label == 0].index)

# downsample majority classes
df_55 = train_data[train_data.repr_label == 55]
train_data = train_data.drop(train_data[train_data.repr_label == 55].index)
df_53 = train_data[train_data.repr_label == 53]
train_data = train_data.drop(train_data[train_data.repr_label == 53].index)
df_55 = df_55.sample(frac = 0.2, random_state = 42)
df_53 = df_53.sample(frac = 0.2, random_state = 42)

train_data = pd.concat([train_data, df_55, df_53]).sample(frac = 1, random_state = 42).reset_index(drop=True)
print(train_data)

       sentence_id                                              token  \
0            21032  ['Wissen', 'Sie', ':', 'Von', 'Linken', 'und',...   
1            97695  ['Meine', 'Damen', 'und', 'Herren', ',', 'ich'...   
2           152460  ['Ihre', 'Missinterpretation', 'dieses', 'ganz...   
3           156972  ['Das', 'Wort', 'hat', 'der', 'Kollege', 'Oliv...   
4           115508  ['Natürlich', 'ist', 'es', 'sinnvoll', ',', 'd...   
...            ...                                                ...   
24176       126393  ['Verbraucherrechte', 'allein', 'aber', 'nütze...   
24177        52016  ['Es', 'hat', 'hier', 'ja', 'sehr', 'vielversp...   
24178         9122  ['Dagegen', 'Alexander', 'Dobrindt', '–', 'ich...   
24179       150051  ['Ekin', 'Deligöz', 'hat', 'gerade', 'gefragt'...   
24180         9222  ['Sie', 'sagen', ':', 'Bis', 'zum', '31', '.',...   

                                                   label  repr_label  
0      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55

In [None]:
# count representative labels after downsampling
count_repr = train_data[train_data['repr_label'] % 2 != 0]['repr_label'].value_counts()
count_repr

1     4951
55    3522
49    3415
23    3371
53    3062
43    1079
29     957
41     857
45     565
47     471
51     317
17     259
7      134
25     127
37     119
31     113
39     105
5       81
15      79
9       77
33      31
3       14
27      14
13      10
11       8
35       7
19       3
Name: repr_label, dtype: int64

In [None]:
import ast
import random

count_ls = {0: 0,
 1: 0,
 2: 0,
 3: 0,
 4: 0,
 5: 0,
 6: 0,
 7: 0,
 8: 0,
 9: 0,
 10: 0,
 11: 0,
 12: 0,
 13: 0,
 14: 0,
 15: 0,
 16: 0,
 17: 0,
 18: 0,
 19: 0,
 20: 0,
 21: 0,
 22: 0,
 23: 0,
 24: 0,
 25: 0,
 26: 0,
 27: 0,
 28: 0,
 29: 0,
 30: 0,
 31: 0,
 32: 0,
 33: 0,
 34: 0,
 35: 0,
 36: 0,
 37: 0,
 38: 0,
 39: 0,
 40: 0,
 41: 0,
 42: 0,
 43: 0,
 44: 0,
 45: 0,
 46: 0,
 47: 0,
 48: 0,
 49: 0,
 50: 0,
 51: 0,
 52: 0,
 53: 0,
 54: 0,
 55: 0,
 56: 0}

def count_labs(row):
  labels = ast.literal_eval(row)
  for label in labels:
    if label != 0:
      count_ls[label] += 1

# count total number of entities per class in training data after resampling
train_data['label'].apply(count_labs)
count_ls

{0: 0,
 1: 5779,
 2: 5950,
 3: 15,
 4: 15,
 5: 121,
 6: 119,
 7: 164,
 8: 164,
 9: 109,
 10: 109,
 11: 8,
 12: 7,
 13: 11,
 14: 6,
 15: 104,
 16: 103,
 17: 345,
 18: 345,
 19: 4,
 20: 4,
 21: 0,
 22: 0,
 23: 5242,
 24: 6521,
 25: 168,
 26: 35,
 27: 17,
 28: 8,
 29: 1300,
 30: 321,
 31: 148,
 32: 164,
 33: 37,
 34: 1,
 35: 8,
 36: 14,
 37: 157,
 38: 17,
 39: 208,
 40: 80,
 41: 1187,
 42: 219,
 43: 1564,
 44: 0,
 45: 982,
 46: 0,
 47: 580,
 48: 0,
 49: 5112,
 50: 3,
 51: 411,
 52: 4,
 53: 5637,
 54: 0,
 55: 8359,
 56: 40}

# 04 - Train/Dev/Test Split

In [None]:
# create a 60/20/20 split for train/dev/test
train_split, validate_split, test_split = np.split(train_data.sample(frac = 1, random_state = 42), [int(.6*len(train_data)), int(.8*len(train_data))])
print(len(train_data))
print(len(train_split))
print(len(validate_split))
print(len(test_split))

train_split = train_split.reset_index(drop=True)
validate_split = validate_split.reset_index(drop=True)
test_split = test_split.reset_index(drop=True)
train_split

24181
14508
4836
4837


Unnamed: 0,sentence_id,token,label,repr_label
0,136851,"['Die', 'nächste', 'Rednerin', 'ist', 'die', '...","[0, 0, 0, 0, 0, 0, 0, 23, 24, 24, 0, 1, 2, 0]",23
1,132640,"['Herr', 'Präsident', '!', 'Meine', 'Damen', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",29
2,94605,"['Außerdem', 'ändert', 'sich', 'im', 'Hinblick...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55
3,131179,"['Danke', 'sehr', '.', '–', 'Kai', 'Gehring', ...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0]",1
4,105554,"['Herr', 'Präsident', '!', 'Meine', 'sehr', 'v...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
...,...,...,...,...
14503,92510,"['Ich', 'will', 'auch', 'sagen', ':', 'Der', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6
14504,103723,"['Frau', 'Kollegin', 'Konrad', ',', 'herzliche...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",23
14505,152670,"['Für', 'Bündnis', '90/Die', 'Grünen', 'hat', ...","[0, 23, 24, 24, 0, 0, 0, 0, 0, 1, 2, 0]",23
14506,87553,"['Zu', 'den', 'Kosten', 'der', 'Unterkunft', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",51


# 05 - Define Parameters, Load Tokenizer




In [None]:
MAX_LEN = 340
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 2e-05
MAX_GRAD_NORM = 10

tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-german-uncased")

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

# 06 - Define Dataset Class, Create Train-/Dev-/Test-Sets and DataLoaders
- Implementation adapted from [this Notebook by Niels Rogge](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=DWgnNJrYW2GP)

In [None]:
import ast
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.token[index].strip('][').replace('\'', '').split(', ')
        labels = ast.literal_eval(self.data.label[index])


        if len(sentence) != len(labels):
          raise ValueError('ERROR -> different lengths: ' + str(sentence))

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [label for label in labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [None]:
training_set = dataset(train_split, tokenizer, MAX_LEN)

In [None]:
validation_set = dataset(validate_split, tokenizer, MAX_LEN)

In [None]:
test_set = dataset(test_split, tokenizer, MAX_LEN)

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }                

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(test_set, **test_params)

# 07 - Load BERT Model

In [None]:
model = BertForTokenClassification.from_pretrained("dbmdz/bert-base-german-uncased", num_labels = len(label_to_id))
model.to(device)

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

# 08 - Define Optimizer and Scheduler
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = 1e-8 
)



In [None]:
total_steps = len(training_loader) * EPOCHS


scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

# 09 - Define Training/Validation Loop
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) and [this Notebook by Niels Rogge](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=DWgnNJrYW2GP)

In [None]:
import random
def train(epochs):
   
    
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):
    
      # ========================================
      #               Training
      # ========================================
    
      # Perform one full pass over the training set.

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
      print('Training...')

      tr_epoch_loss = 0
      nb_tr_examples, nb_tr_steps = 0, 0

      # Measure how long the training epoch takes.
      t0 = time.time()


      # put model in training mode
      model.train()
    
      for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        model.zero_grad()  

        
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        tr_epoch_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        loss_step = tr_epoch_loss/nb_tr_steps
        wandb.log({"Training Loss / Step": loss_step,
                   "Learning Rate / Step": scheduler.get_last_lr()[0]})
        
        
        if idx % 100==0 and idx != 0:
            #loss_step = tr_epoch_loss/nb_tr_steps
            print(f"Training loss per 100 training batches: {loss_step}")
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(training_loader), elapsed))

    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

      tr_epoch_loss = tr_epoch_loss / nb_tr_steps
      print(f"Training loss epoch: {tr_epoch_loss}")
      
      training_time = format_time(time.time() - t0)
      print("  Training epoch took: {:}".format(training_time))
      # ========================================
      #               Validation
      # ========================================
      # After the completion of each training epoch, measure our performance on
      # our validation set.

      print("")
      print("Running Validation...")

      t0 = time.time()

      # Put the model in evaluation mode--the dropout layers behave differently
      # during evaluation.
      model.eval()

      # Tracking variables 
      va_loss, va_accuracy = 0, 0
      nb_va_examples, nb_va_steps = 0, 0
      va_preds, va_labels = [], []



      for idx, batch in enumerate(validation_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        model.zero_grad()

        with torch.no_grad():
          outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]
        va_logits = outputs[1]
        va_loss += loss.item()

        nb_va_steps += 1
        nb_va_examples += labels.size(0)
        loss_step = va_loss/nb_va_steps
        wandb.log({"Validation Loss / Step": loss_step})
        
        if idx % 100==0:
          #loss_step = va_loss/nb_va_steps
          print(f"Validation loss per 100 validation batches: {loss_step}")
          elapsed = format_time(time.time() - t0)
          print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(validation_loader), elapsed))
           
        # compute validation accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = va_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        va_labels.extend(labels)
        va_preds.extend(predictions)

        tmp_va_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        va_accuracy += tmp_va_accuracy

      va_loss = va_loss / nb_va_steps
      va_accuracy = va_accuracy / nb_va_steps
      print(f"Validation loss epoch: {va_loss}")
      print(f"Validation accuracy epoch: {va_accuracy}")
      # Measure how long this epoch took.
      validation_time = format_time(time.time() - t0)
      print("Validation epoch took: {:}".format(validation_time))

      # Record all statistics from this epoch.
      training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': tr_epoch_loss,
            'Valid. Loss': va_loss,
            'Valid. Acc.': va_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })
      

    v_labels = [id_to_label[id.item()] for id in va_labels]
    v_predictions = [id_to_label[id.item()] for id in va_preds]

   
    pd.set_option('precision', 4)
    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('epoch')

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return df_stats, v_labels, v_predictions

# 10 - Train Model

In [None]:
wandb.finish()
wandb.init(settings=wandb.Settings(start_method="thread"))
wandb.watch(model, log="all")
df_stats, v_labels, v_predictions = train(EPOCHS)

[34m[1mwandb[0m: Currently logged in as: [33mdonatomonti[0m (use `wandb login --relogin` to force relogin)



Training...
Training loss per 100 training batches: 0.41290217010986685
 Batch   100  of    907.    Elapsed: 0:03:04.
Training loss per 100 training batches: 0.25328112927390567
 Batch   200  of    907.    Elapsed: 0:06:08.
Training loss per 100 training batches: 0.18782948301579072
 Batch   300  of    907.    Elapsed: 0:09:12.
Training loss per 100 training batches: 0.15229959576448746
 Batch   400  of    907.    Elapsed: 0:12:17.
Training loss per 100 training batches: 0.12915577335160472
 Batch   500  of    907.    Elapsed: 0:15:21.
Training loss per 100 training batches: 0.11278013143294836
 Batch   600  of    907.    Elapsed: 0:18:25.
Training loss per 100 training batches: 0.10088014806230179
 Batch   700  of    907.    Elapsed: 0:21:29.
Training loss per 100 training batches: 0.0916725159701211
 Batch   800  of    907.    Elapsed: 0:24:34.
Training loss per 100 training batches: 0.08411720200321873
 Batch   900  of    907.    Elapsed: 0:27:38.
Training loss epoch: 0.08373430609

In [None]:
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Acc.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0837,0.0195,0.9956,0:27:50,0:03:52
2,0.0169,0.0135,0.9969,0:28:15,0:03:35


# 11 - Evaluate Performance on Dev Test

In [None]:
l = [v_labels]
p = [v_predictions]

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

print(classification_report(l, p, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

   EO_FINANZ       0.00      0.00      0.00         6
     EO_KULT       0.00      0.00      0.00         2
    EO_MEDIA       0.90      0.90      0.90       296
      EO_MIL       1.00      0.89      0.94        27
      EO_MOV       0.95      0.92      0.94       221
      EO_NGO       1.00      0.35      0.52        43
      EO_POL       0.97      0.98      0.98      1024
      EO_REL       1.00      0.38      0.55         8
      EO_SCI       0.56      0.43      0.48        35
     EO_WIRT       0.78      0.83      0.81        30
   EP_FINANZ       0.95      0.56      0.70        34
     EP_KULT       0.00      0.00      0.00         2
    EP_MEDIA       1.00      0.75      0.86        36
      EP_MIL       1.00      0.17      0.29        18
      EP_MOV       0.00      0.00      0.00         1
      EP_NGO       0.95      0.55      0.70        65
      EP_POL       0.88      0.97      0.92      1143
      EP_REL       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


# 12 - Define Test Function 
- Implementation adapted from [this Blog by Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/) and [this Notebook by Niels Rogge](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=DWgnNJrYW2GP)

In [None]:
def test(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    test_loss, test_accuracy = 0, 0
    nb_test_examples, nb_test_steps = 0, 0
    test_preds, test_labels = [], []

    t0 = time.time()
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs[0]
            test_logits = outputs[1]
            test_loss += loss.item()

            nb_test_steps += 1
            nb_test_examples += labels.size(0)
        
            if idx % 100==0 and idx != 0:
                loss_step = test_loss/nb_test_steps
                print(f"Test loss per 100 evaluation steps: {loss_step}")
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(testing_loader), elapsed))
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = test_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            
            
            test_labels.extend(labels)
            test_preds.extend(predictions)
            
            tmp_test_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            test_accuracy += tmp_test_accuracy

    labels = [id_to_label[id.item()] for id in test_labels]
    predictions = [id_to_label[id.item()] for id in test_preds]
    
    test_loss = test_loss / nb_test_steps
    test_accuracy = test_accuracy / nb_test_steps
    print(f"Test Loss: {test_loss}")
    print(f"Test Accuracy: {test_accuracy}")

    return labels, predictions

# 13 - Evaluate Model on Test Set

In [None]:
labels, predictions = test(model, testing_loader)

Test loss per 100 evaluation steps: 0.014875422503296655
 Batch   100  of    303.    Elapsed: 0:01:10.
Test loss per 100 evaluation steps: 0.01390025761836806
 Batch   200  of    303.    Elapsed: 0:02:19.
Test loss per 100 evaluation steps: 0.014517016680617244
 Batch   300  of    303.    Elapsed: 0:03:28.
Test Loss: 0.014433121752637829
Test Accuracy: 0.9966797730468823


In [None]:
l = [labels]
p = [predictions]

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2


print(classification_report(l, p, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

   EO_FINANZ       0.00      0.00      0.00         2
     EO_KULT       0.00      0.00      0.00         1
    EO_MEDIA       0.91      0.93      0.92       267
      EO_MIL       0.97      0.89      0.93        36
      EO_MOV       0.95      0.92      0.93       233
      EO_NGO       1.00      0.22      0.37        49
      EO_POL       0.98      0.98      0.98      1128
      EO_REL       1.00      0.50      0.67         2
      EO_SCI       0.38      0.45      0.41        29
     EO_WIRT       0.61      0.65      0.63        34
   EP_FINANZ       0.94      0.68      0.79        25
     EP_KULT       0.00      0.00      0.00         4
    EP_MEDIA       1.00      0.74      0.85        34
      EP_MIL       1.00      0.11      0.20        18
      EP_NGO       1.00      0.46      0.63        63
      EP_POL       0.86      0.96      0.90      1156
      EP_REL       0.00      0.00      0.00         5
      EP_SCI       1.00    

  _warn_prf(average, modifier, msg_start, len(result))


# 14 - Save Fine-Tuned Model 

In [None]:
import os

directory = '/trained-bert'

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')

All files saved


# 15 - Load Fine-Tuned Model

In [None]:
model = BertForTokenClassification.from_pretrained('trained-bert', num_labels = len(label_to_id))
model.to(device)
tokenizer = BertTokenizerFast.from_pretrained('trained-bert')

# 16 - Load Lexicon Data, Convert to Sentences and Sample for Lexicon Creation

In [18]:
lex_data = pd.read_csv(lex_data_path)
t = []
l = []
s_ids = []
test = lex_data.groupby(['sentence_id'])
for name,group in test:
  t.append(group.token.values.tolist())
  l.append(group.label.values.tolist())
  s_ids.append(name)

lex_data = pd.DataFrame({'sentence_id' : s_ids, 'token' : t, 'label' : l })
lex_data

Unnamed: 0,sentence_id,token,label
0,167660,"[an, denen, wir, arbeiten, müssen, ., Das, gro...","[0.0, 0.0, 21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,167661,"[Das, ist, ein, Zeichen, des, geeinten, Europa...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,167662,"[Liebe, Kolleginnen, und, Kollegen, ,, wir, wi...","[0.0, 0.0, 0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 0.0,..."
3,167663,"[Darüber, sprechen, wir, eigentlich, schon, vi...","[0.0, 0.0, 21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,167664,"[Der, Hauptauftrag, für, die, neue, EU-Kommiss...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
166611,334271,"[Der, letzte, Punkt, ;, Frau, Deligöz, hat, es...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 21.0, 0.0,..."
166612,334272,"[mit, der, Begründung, ,, die, Bundesregierung...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
166613,334273,"[An, die, Kommunen, und, an, die, Familien, se...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 21.0,..."
166614,334274,"[Herzlichen, Dank, .]","[0.0, 0.0, 0.0]"


In [None]:
lex_data.to_csv(lex_directory + 'lex_data_sentences.csv', index=False)

In [19]:
lex_data = pd.read_csv(lex_directory + 'lex_data_sentences.csv')
lex_data = lex_data.sample(frac = 0.3, random_state = 42).reset_index(drop=True)
lex_data

Unnamed: 0,sentence_id,token,label
0,308146,"['Sehr', 'geehrter', 'Herr', 'Präsident', '!',...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,287090,"['Wer', 'Syrien', 'aber', 'nicht', 'erwähnt', ...","[0.0, 55.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,175325,"['Herr', 'Präsident', '!', 'Liebe', 'Kolleginn...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,224246,"['Ich', 'weiß', 'nicht', ',', 'ob', 'Sie', 'in...","[21.0, 0.0, 0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 0.0..."
4,252663,"['In', 'diese', 'Liste', 'reiht', 'sich', 'im'...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
49980,298298,"['braucht', 'es', 'ganz', 'klare', 'Schritte',...","[0.0, 21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
49981,251090,"['Europa', 'ist', 'eine', 'Notwendigkeit', ','...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
49982,280427,"['Das', 'ist', 'keine', 'Wertschätzung', ',', ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
49983,321833,"['Deshalb', ':', 'Schaffen', 'Sie', 'Klarheit'...","[0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 0.0, 21.0, 0.0..."


# 17 - Generate Lexicon

In [None]:
lex_set = dataset(lex_data, tokenizer, MAX_LEN)

BATCH_SIZE_LEX = 1

lex_params = {'batch_size': BATCH_SIZE_LEX,
                'shuffle': False,
                'num_workers': 0
                }

lex_loader = DataLoader(lex_set, **lex_params)

# implementation similar to the previous test function, again, adapted from the mentioned sources
def predict(model, lex_loader):
    prediction = []
    tok = []

    model.eval()
    
    t0 = time.time()
    
    with torch.no_grad():
        for idx, batch in enumerate(lex_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask)
            logits = outputs[0]

        
            if idx % 1500 == 0:
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(lex_loader), elapsed))
              
            active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

            tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
            token_predictions = [id_to_label[i] for i in flattened_predictions.cpu().numpy()]
            wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)


            i = 0

            for token_pred, mapping in zip(wp_preds, batch["offset_mapping"].squeeze().tolist()):
            #only predictions on first word pieces are important
              if mapping[0] == 0 and mapping[1] != 0:

                # ggf raus und auch O mit rein nehmen
                # idx speichern für satz id!! 
                if token_pred[1] != 'O':
                  prediction.append(token_pred[1])
                  #print(token_pred[1])
                  #t = lex_data.iloc[idx]['token'][i]
                  t = lex_data.token[idx].strip('][').replace('\'', '').split(', ')[i]
                  #print(t)
                  tok.append(t)

              else:
                continue
              i += 1


    return tok, prediction

In [None]:
toks, preds = predict(model, lex_loader)

 Batch     0  of  49,985.    Elapsed: 0:00:00.
 Batch 1,500  of  49,985.    Elapsed: 0:00:38.
 Batch 3,000  of  49,985.    Elapsed: 0:01:19.
 Batch 4,500  of  49,985.    Elapsed: 0:01:59.
 Batch 6,000  of  49,985.    Elapsed: 0:02:39.
 Batch 7,500  of  49,985.    Elapsed: 0:03:20.
 Batch 9,000  of  49,985.    Elapsed: 0:04:00.
 Batch 10,500  of  49,985.    Elapsed: 0:04:41.
 Batch 12,000  of  49,985.    Elapsed: 0:05:21.
 Batch 13,500  of  49,985.    Elapsed: 0:06:01.
 Batch 15,000  of  49,985.    Elapsed: 0:06:42.
 Batch 16,500  of  49,985.    Elapsed: 0:07:22.
 Batch 18,000  of  49,985.    Elapsed: 0:08:03.
 Batch 19,500  of  49,985.    Elapsed: 0:08:43.
 Batch 21,000  of  49,985.    Elapsed: 0:09:23.
 Batch 22,500  of  49,985.    Elapsed: 0:10:04.
 Batch 24,000  of  49,985.    Elapsed: 0:10:44.
 Batch 25,500  of  49,985.    Elapsed: 0:11:24.
 Batch 27,000  of  49,985.    Elapsed: 0:12:05.
 Batch 28,500  of  49,985.    Elapsed: 0:12:45.
 Batch 30,000  of  49,985.    Elapsed: 0:13:25.

In [None]:
lex_df = pd.DataFrame(list(zip(toks,preds)), columns = ('token', 'predictions'))

In [None]:
lex_df.to_csv(lex_directory + 'lex_predictions.csv', index=False)

In [None]:
lex_df = pd.read_csv(lex_directory + 'lex_predictions.csv')
lex_df

Unnamed: 0,token,predictions
0,Bürger,B-P_GEN
1,Syrien,B-GPE
2,Jemen,B-GPE
3,Bürger,B-P_GEN
4,Deutschlands,B-GPE
...,...,...
32656,deutsch,B-P_NAT
32657,Kindern,B-P_AGE
32658,Bürger,B-P_GEN
32659,Wissenschaftler,B-P_FUNC


In [None]:
# combine predicted tokens to entities based on the IOB2 format
toks = lex_df.token.to_list()
predictions = lex_df.predictions.to_list()
toks_combined = []
predictions_combined = []
current_word = ''
current_label= ''
i = 0
for p in predictions:
  if p.startswith('B-'):
    if current_word == '':
      current_word = toks[i]
      current_label = p.replace('B-', '')
    else: 
      toks_combined.append(current_word)
      predictions_combined.append(current_label)
      current_word = toks[i]
      current_label = p.replace('B-', '')
  elif p.startswith('I-'):
    if p.replace('I-', '') == current_label:
      current_word += ' ' + toks[i]
    else:
      toks_combined.append(current_word)
      predictions_combined.append(current_label)
      current_word = ''
      current_label = ''
  i += 1

lex_df = pd.DataFrame(list(zip(toks_combined,predictions_combined)), columns = ('token', 'predictions'))
lex_df


Unnamed: 0,token,predictions
0,Bürger,P_GEN
1,Syrien,GPE
2,Jemen,GPE
3,Bürger,P_GEN
4,Deutschlands,GPE
...,...,...
26578,Kinder,P_AGE
26579,deutsch,P_NAT
26580,Kindern,P_AGE
26581,Bürger,P_GEN


In [None]:
lex_df.columns = ['entity', 'label']
lex_df = lex_df.drop_duplicates().sort_values(['label'])
lex_df['label'].replace('', np.nan, inplace=True)
lex_df = lex_df.dropna(subset=['label']).reset_index(drop=True)
lex_df.to_csv(lex_directory + 'lex.csv', index=False)
lex_df

Unnamed: 0,entity,label
0,Frankfurter Rundschau,EO_MEDIA
1,BMWi,EO_MEDIA
2,Anne,EO_MEDIA
3,DiGAs,EO_MEDIA
4,BMU,EO_MEDIA
...,...,...
2253,geflüchteten,P_SOZ
2254,Mittelschichten,P_SOZ
2255,Ober-,P_SOZ
2256,Oberschicht,P_SOZ


In [None]:
num = lex_df.label.value_counts()
num.to_csv(lex_directory + 'lex_label_counts.csv')

In [None]:
num

EP_POL       1010
P_FUNC        313
GPE           245
EO_MEDIA      196
P_NAT         118
EO_POL         65
EO_SCI         59
EO_WIRT        55
P_ETH          42
P_GEN          39
EO_MOV         38
P_AGE          30
P_SOZ          19
EO_MIL         10
EP_NGO          8
EO_NGO          4
EP_SCI          2
EP_MEDIA        2
EP_FINANZ       1
EO_REL          1
EP_MIL          1
Name: label, dtype: int64

# 18 - Sample Entities from Lexicon for Manual Evaluation

In [None]:
import random
lex_df = pd.read_csv(lex_directory + 'lex.csv')
lex_df_group = lex_df.groupby(['label'])
random.seed(42)
eval_ent = []
eval_lab = []
for name,group in lex_df_group:
  if len(group.entity.values.tolist()) < 20:
    eval_ent.extend(group.entity.values.tolist())
    eval_lab.extend(group.label.values.tolist())  
  else:
    for i in range(0,20):
      idx = random.randrange(0, len(group.entity.values.tolist()))
      while group.entity.values.tolist()[idx] in eval_ent:
        idx = random.randrange(0, len(group.entity.values.tolist()))
      eval_ent.append(group.entity.values.tolist()[idx])
      eval_lab.append(group.label.values.tolist()[0])
lex_eval_df = pd.DataFrame({'entity' : eval_ent, 'label' : eval_lab})
lex_eval_df

Unnamed: 0,entity,label
0,BfR,EO_MEDIA
1,Statistische,EO_MEDIA
2,Bildes,EO_MEDIA
3,Rheinischen Post,EO_MEDIA
4,Hier,EO_MEDIA
...,...,...
283,geflüchteten,P_SOZ
284,Mittelschichten,P_SOZ
285,Ober-,P_SOZ
286,Oberschicht,P_SOZ


In [None]:
lex_eval_df.to_csv(lex_directory + 'lex_eval.csv', index = False)
lex_eval_df.label.value_counts()

EO_MEDIA     20
EP_POL       20
EO_MOV       20
P_NAT        20
EO_POL       20
P_GEN        20
EO_SCI       20
EO_WIRT      20
P_FUNC       20
P_ETH        20
P_AGE        20
GPE          20
P_SOZ        19
EO_MIL       10
EP_NGO        8
EO_NGO        4
EP_SCI        2
EP_MEDIA      2
EP_FINANZ     1
EO_REL        1
EP_MIL        1
Name: label, dtype: int64

In [None]:
# after manual evaluation, check number of evaluations
evaled_df = pd.read_csv(lex_directory + 'lex_eval_final.csv')
evaled_df.value_counts('evaluation')

evaluation
1    156
0    130
dtype: int64

In [None]:
# sort generated lexicon 
lex_df = pd.read_csv(lex_directory + 'lex.csv')
lex_df = lex_df.sort_values(['label', 'entity'], ascending = (True, True))
lex_df.to_csv(lex_directory + 'lex.csv', index = False)
lex_df

Unnamed: 0,entity,label
0,ABC,EO_MEDIA
1,ADVA,EO_MEDIA
2,AZR,EO_MEDIA
3,Agora,EO_MEDIA
4,Alexa,EO_MEDIA
...,...,...
2253,Schicht,P_SOZ
2254,Schichten,P_SOZ
2255,geflüchtet,P_SOZ
2256,geflüchtete,P_SOZ


# 19 - Compare Generated Lexicon with Seed Lexicon

In [None]:
seed_df = pd.read_csv(lex_directory + 'seed_lex.csv')
seed_df.value_counts('label')

label
EP_WIRT      500
EP_POL       500
EP_MEDIA     500
EO_WIRT      499
P_ETH        498
P_FUNC       497
EO_KULT      495
EP_MIL       493
EP_SCI       491
EO_SCI       491
EP_KULT      490
EP_FINANZ    489
EP_NGO       483
EO_POL       479
EP_REL       475
EO_MIL       457
EO_NGO       429
EO_MOV       423
EO_FINANZ    300
GPE          262
EO_MEDIA     252
EO_REL       233
P_NAT        181
EP_MOV       170
P_SOZ        158
P_AGE         22
EP_OWN        12
P_GEN          5
dtype: int64

In [None]:
seed_df.count()

entity    10284
label     10284
dtype: int64

In [None]:
# get only those (new) entities, that are not contained in the seed lexicon
new_ents = lex_df[~lex_df.apply(tuple,1).isin(seed_df.apply(tuple,1))]
new_ents.value_counts('label')

label
EP_POL      724
P_FUNC      300
EO_MEDIA    181
GPE         126
P_NAT        79
EO_POL       57
EO_SCI       48
EO_WIRT      45
EO_MOV       34
P_GEN        34
P_ETH        26
P_AGE        24
P_SOZ        16
EO_MIL        7
EP_NGO        5
EO_NGO        2
EP_MEDIA      1
EP_SCI        1
dtype: int64

In [None]:
new_ents.count()

entity    1710
label     1710
dtype: int64

# 20 - Check Coverage of Lexicon based on Min Annotations of Gold Data

In [None]:
def create_tags_from_nums(row):
  return id_to_label[row].replace('B-','').replace('I-','')


gold_ents = pd.read_csv(path_gold_data + 'processed/lex_min_annotations_gold.csv')
lex_df = pd.read_csv(lex_directory + 'lex.csv')

gold_ents['label'] = gold_ents['label'].apply(create_tags_from_nums)
gold_ents

Unnamed: 0,entity,label
0,16,EP_POL
1,265,EP_POL
2,287,EP_POL
3,314,EP_POL
4,330,EP_POL
...,...,...
2172,unser,GPE
2173,wir,GPE
2174,Äthiopien,GPE
2175,Österreich,GPE


In [None]:
gold_ents.count()

entity    2177
label     2177
dtype: int64

In [None]:
gold_ents.value_counts('label')

label
P_FUNC       513
EO_POL       405
EP_POL       375
GPE          240
P_AGE         85
EO_WIRT       72
P_NAT         70
P_ETH         66
P_SOZ         63
EO_FINANZ     39
EO_MIL        39
EO_NGO        31
P_GEN         26
EO_MEDIA      24
EP_SCI        23
EP_KULT       19
EO_SCI        15
EP_MOV        13
EP_WIRT       12
EO_MOV        10
EP_FINANZ      9
EP_OWN         8
EP_MEDIA       6
EP_MIL         6
EP_NGO         5
EO_REL         2
EP_REL         1
dtype: int64

In [None]:
found_ents = lex_df[lex_df.apply(tuple,1).isin(gold_ents.apply(tuple,1))]
found_ents

Unnamed: 0,entity,label
35,Bild,EO_MEDIA
69,Deutschlandfunk,EO_MEDIA
80,FAZ,EO_MEDIA
94,Handelsblatt,EO_MEDIA
115,Morgenmagazin,EO_MEDIA
...,...,...
2180,Kanadier,P_NAT
2203,Rumänen,P_NAT
2204,Russen,P_NAT
2222,Türken,P_NAT


In [None]:
found_ents.value_counts('label')

label
GPE         80
P_FUNC      45
P_AGE       11
EO_MEDIA    10
P_NAT        9
P_GEN        7
EO_WIRT      6
EP_POL       6
EO_MIL       4
EO_POL       3
EO_NGO       2
EO_SCI       2
P_ETH        2
P_SOZ        1
dtype: int64

In [None]:
found_ents.count()

entity    188
label     188
dtype: int64