# To successfully execute this notebook, make sure you have access to a GPU :)

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

import transformers
from transformers import BertTokenizer, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertForTokenClassification, AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

In [9]:
import pandas as pd
import numpy as np
import csv

### Set-up data

In [10]:
# Load the pipe-separated token table without any quote handling, then
# forward-fill missing cells from the row above. `.ffill()` is the direct
# spelling of the former `fillna(method='ffill')` call.
data = pd.read_csv('clean_data.csv', sep='|', quoting=csv.QUOTE_NONE).ffill()
data.tail(5)

Unnamed: 0,sen_no,word,tagged
2157043,66722,135,I-GS
2157044,66722,Abs.,I-GS
2157045,66722,1,I-GS
2157046,66722,FGO,I-GS
2157047,66722,.,O


### Set-up data iterator

In [7]:
class GetSentence(object):
    """Group a token-level table into per-sentence lists of (word, tag) pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``sen_no``, ``word`` and ``tagged``.
    """

    def __init__(self, data):
        self.n_sent = 1  # sentence number that get_next() will look up next
        self.data = data
        self.empty = False  # kept for compatibility; not read inside this class
        # Turn one sentence's rows into a list of (word, tag) tuples.
        agg = lambda s: list(zip(s['word'].values.tolist(), s['tagged'].values.tolist()))
        self.grouped = self.data.groupby('sen_no').apply(agg)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence numbered ``self.n_sent`` and advance the counter.

        The sentence number is looked up first as a string (the original
        behaviour) and then as an integer, so the method works whether
        ``sen_no`` was parsed as text or as numbers.

        Returns
        -------
        list of (word, tag) tuples, or None when no such sentence exists.
        """
        for key in (str(self.n_sent), self.n_sent):
            try:
                s = self.grouped.loc[key]
            except (KeyError, TypeError):
                # Key absent (or wrong type for this index): try the next form.
                continue
            self.n_sent += 1
            return s
        return None

In [8]:
# Build the per-sentence (word, tag) groupings from the loaded table.
getter = GetSentence(data)

In [9]:
# Keep only the word of every (word, tag) pair, one list per sentence.
sentences = [[token for token, _ in pairs] for pairs in getter.sentences]
sentences[0]

['In',
 'der',
 'Beschwerdesache',
 'betreffend',
 'die',
 'Marke',
 '30',
 '2012',
 '044',
 '496',
 'hat',
 'der',
 '25.',
 'Senat',
 '(',
 'Marken-Beschwerdesenat',
 ')',
 'des',
 'Bundespatentgerichts',
 'am',
 '18.',
 'Oktober',
 '2017',
 'unter',
 'Mitwirkung',
 'des',
 'Vorsitzenden',
 'Richters',
 'Knoll',
 ',',
 'der',
 'Richterin',
 'Kriener',
 'und',
 'des',
 'Richters',
 'Dr.',
 'Nielsen',
 'beschlossen',
 ':']

In [10]:
# Keep only the tag of every (word, tag) pair, one list per sentence.
labels = [[tag for _, tag in pairs] for pairs in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GRT', 'I-GRT', 'I-GRT', 'I-GRT', 'I-GRT', 'I-GRT', 'I-GRT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-RR', 'O', 'O', 'O', 'B-RR', 'O', 'O', 'O', 'O', 'B-RR', 'O', 'O']


### Set of unique tags and their indices

In [11]:
# Sort the unique tags so that the tag -> index mapping is the same on every
# run (iteration order of a set of strings is not stable across interpreter
# sessions). 'PAD' is appended last and marks padded positions.
tag_values = sorted(set(data['tagged'].values))
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}

Save **`tag_values`** as it will be required for later use.

In [12]:
# Persist the index -> tag list; the context manager closes the file even if
# pickling raises.
with open("tag_values.pkl", "wb") as t_values:
    pickle.dump(tag_values, t_values)

### Set up BERT tokenizer from the pre-trained model

In [None]:
# Load the tokenizer with lower-casing switched off.
# NOTE(review): a later heading names `bert-base-german-cased`, while the
# identifier used here is 'invoiced_base_german_cased' — presumably a local
# checkpoint directory; confirm it resolves on a fresh machine.
tokenizer = BertTokenizer.from_pretrained('invoiced_base_german_cased', do_lower_case=False)

In [14]:
# Persist the tokenizer object; the context manager closes the file even if
# pickling raises.
with open("tokenizer.pkl", "wb") as save_tokenizer:
    pickle.dump(tokenizer, save_tokenizer)

In [15]:
def tokenize_preserve_labels(sentence, text_labels):
    """Split each word into sub-word tokens and repeat its label per token.

    Parameters
    ----------
    sentence : iterable of str
        The words of one sentence.
    text_labels : iterable
        One label per word, aligned with ``sentence``.

    Returns
    -------
    tuple (tokens, labels)
        Two flat lists of equal length: every sub-word token produced by the
        module-level ``tokenizer`` and the label of the word it came from.
    """
    pieces, piece_labels = [], []

    for word, label in zip(sentence, text_labels):
        subwords = tokenizer.tokenize(word)
        pieces += subwords
        # Every sub-word inherits the label of its source word.
        piece_labels += [label] * len(subwords)

    return pieces, piece_labels

In [16]:
%%time
tokenized_texts_labels = [tokenize_preserve_labels(sent, labels) for sent, labels in zip(sentences, labels)]

CPU times: user 1min 45s, sys: 419 ms, total: 1min 45s
Wall time: 1min 45s


Extract tokens and labels

In [17]:
# Separate the (tokens, labels) pairs into two parallel lists.
# Note: `labels` is rebound here from word-level to sub-word-level labels.
tokenized_texts = [tokens for tokens, _ in tokenized_texts_labels]
labels = [token_labels for _, token_labels in tokenized_texts_labels]

### Apply padding and generate attention_mask

In [18]:
MAX_LEN = 75  # token and tag sequences are padded/truncated to this length
BATCH_SIZE = 64  # batch size used by both the training and validation loaders

In [19]:
# Map tokens to vocabulary ids, then pad (with 0) or cut every sequence at the
# end so that all rows have exactly MAX_LEN entries.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype='long',
    value=0.0,
    padding='post',
    truncating='post',
)

In [20]:
# Map labels to indices, then pad (with the 'PAD' index) or cut every sequence
# at the end so that all rows have exactly MAX_LEN entries.
tags = pad_sequences(
    [[tag2idx.get(label) for label in sentence_labels] for sentence_labels in labels],
    maxlen=MAX_LEN,
    dtype='long',
    value=tag2idx['PAD'],
    padding='post',
    truncating='post',
)

In [21]:
# 1.0 for real tokens, 0.0 for positions whose id is 0 (the padding value).
attention_mask = [
    [float(token_id != 0.0) for token_id in row]
    for row in input_ids
]

### Prepare training and testing data

Split data and attention mask.

In [22]:
# Split ids, tags and masks in a single call so the three stay row-aligned by
# construction instead of relying on two calls sharing the same random_state.
X_train, X_test, y_train, y_test, tr_mask, val_mask = train_test_split(
    input_ids, tags, attention_mask, random_state=42, test_size=0.1
)

In [23]:
# Convert every split to a torch tensor.
X_train, X_test, y_train, y_test = (
    torch.tensor(split) for split in (X_train, X_test, y_train, y_test)
)
tr_mask, val_mask = (torch.tensor(mask) for mask in (tr_mask, val_mask))

Create data-loaders.

In [24]:
# Training batches are drawn in random order.
train_data = TensorDataset(X_train, tr_mask, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

# Validation batches are read in their stored order.
valid_data = TensorDataset(X_test, val_mask, y_test)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(
    valid_data,
    batch_size=BATCH_SIZE,
    sampler=valid_sampler,
)

### Pull and fine-tune **`bert-base-german-cased`** model

In [None]:
# Load the pre-trained weights with a token-classification head that has one
# output per entry of tag2idx (all tags plus 'PAD').
# NOTE(review): the heading above names `bert-base-german-cased`, while the
# identifier here is 'invoiced_base_german_cased' — presumably a local
# checkpoint directory; confirm.
model = BertForTokenClassification.from_pretrained('invoiced_base_german_cased', num_labels=len(tag2idx), output_attentions=False, output_hidden_states=False)

In [26]:
# The training loop moves batches with `.to(device)`, so define `device` here.
# Use the GPU when one is visible and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [27]:
# FULL_FINETUNING=True updates every weight of the model; False trains only
# the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings get no weight
    # decay. 'gamma'/'beta' are kept from the original list; 'LayerNorm.weight'
    # is added so the layer-norm scale is excluded as well.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # The per-group option is named 'weight_decay'; the previous
        # 'weight_decay_rate' key was not read by the optimizer.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
else:
    # named_parameters is a method and has to be called before it is iterated.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

In [28]:
# Build the optimizer over the parameter groups defined above.
# NOTE(review): AdamW is imported from `transformers` in this notebook; newer
# releases are understood to favour `torch.optim.AdamW` — confirm against the
# installed version.
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

### Training and evaluation

In [29]:
EPOCHS = 3  # number of full passes over the training loader
MAX_GRAD_NORM = 1.0  # gradients are clipped to this norm before each optimizer step

# One scheduler step is taken per training batch, so the schedule spans
# batches-per-epoch * EPOCHS steps, with no warm-up steps.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [30]:
%%time
loss_values, validation_loss_values = [], []

for e in range(EPOCHS):
    print(f'- Epoch 0{e+1} -')
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print('Average train loss:\t{:.5f}'.format(avg_train_loss))
    loss_values.append(avg_train_loss)
    
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    predictions, true_labels = [], []
    
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print('Validation loss:\t{:.5f}'.format(eval_loss))
    
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD']
    valid_tags = [tag_values[l_i] for l in true_labels for l_i in l if tag_values[l_i] != 'PAD']

    print('Validation accuracy:\t{:.5f}'.format(accuracy_score(pred_tags, valid_tags)))
    print('Validation precision:\t{:.5f}'.format(precision_score(pred_tags, valid_tags, average='micro')))
    print('Validation recall:\t{:.5f}'.format(recall_score(pred_tags, valid_tags, average='micro')))
    print('Validation f1-score:\t{:.5f}\n'.format(f1_score(pred_tags, valid_tags, average='micro')))

- Epoch 01 -
Average train loss:	0.08197
Validation loss:	0.02941
Validation accuracy:	0.99195
Validation precision:	0.99195
Validation recall:	0.99195
Validation f1-score:	0.99195

- Epoch 02 -
Average train loss:	0.01605
Validation loss:	0.02284
Validation accuracy:	0.99389
Validation precision:	0.99389
Validation recall:	0.99389
Validation f1-score:	0.99389

- Epoch 03 -
Average train loss:	0.00752
Validation loss:	0.02323
Validation accuracy:	0.99488
Validation precision:	0.99488
Validation recall:	0.99488
Validation f1-score:	0.99488

CPU times: user 45min 54s, sys: 16.1 s, total: 46min 11s
Wall time: 46min 17s


Calculate the confusion matrix to identify **TP**, **TN**, **FP**, and **FN**. These counts are required to calculate the micro-averaged **precision**, **recall**, and **F1-score**.

In [31]:
# Unique gold tags seen in validation, sorted so the row order of the
# confusion matrix is the same on every run.
# Note: this rebinds `tags`, which earlier held the padded label-index array.
tags = sorted(set(valid_tags))

In [32]:
# One 2x2 matrix per entry of `tags`, in the same order as `tags`; the next
# cell reads each as [[tn, fp], [fn, tp]].
matrix = multilabel_confusion_matrix(valid_tags, pred_tags, labels=tags)

In [33]:
# Collect confusion counts per coarse class: the part after the last '-' of a
# tag, so B-/I- variants of one class share an entry.
tags_eval = {}
for t, m in zip(tags, matrix):
    tag = t.split('-')[-1]
    (tn, fp), (fn, tp) = m

    # Buckets are ordered tp, tn, fp, fn.
    buckets = tags_eval.setdefault(tag, [[], [], [], []])
    for bucket, count in zip(buckets, (tp, tn, fp, fn)):
        bucket.append(count)

Map fine-grained classes to actual classes.

In [34]:
# Readable class name -> tag suffix used as key in `tags_eval`.
# The metrics cell below replaces each value with [precision, recall, f1].
classes = {'Person': 'PER', 'Judge': 'RR', 'Lawyer': 'AN',
           'Country': 'LD', 'City': 'ST', 'Street': 'STR', 'Landscape': 'LDS',
           'Organization': 'ORG', 'Company': 'UN', 'Institution': 'INN', 'Court': 'GRT', 'Brand': 'MRK',
           'Law': 'GS', 'Ordinance': 'VO', 'European legal norm': 'EUN',
           'Regulation': 'VS', 'Contract': 'VT',
           'Court decision': 'RS',
           'Legal literature': 'LIT'}

Calculate micro-averaged performance metrics.

In [35]:
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    return float(numerator) / float(denominator) if denominator else 0.0


# Replace each class code in `classes` with [precision, recall, f1] in percent,
# computed from the counts summed over all tags of that class.
for c in classes:
    t = classes[c]
    if not isinstance(t, str):
        # Already holds metrics from an earlier run of this cell; leave as is.
        continue

    # A class that never occurs in validation has no counts; treat as zeros.
    v = tags_eval.get(t, [[], [], [], []])
    tp, fp, fn = sum(v[0]), sum(v[2]), sum(v[3])

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = 2 * _ratio(precision * recall, precision + recall)

    classes[c] = [round(precision*100, 2), round(recall*100, 2), round(f1*100, 2)]

In [36]:
# Per-class [precision, recall, f1] in percent.
classes

{'Brand': [83.13, 40.59, 54.55],
 'City': [82.09, 90.91, 86.27],
 'Company': [96.99, 93.88, 95.41],
 'Contract': [95.36, 92.26, 93.78],
 'Country': [91.67, 93.29, 92.47],
 'Court': [99.11, 98.02, 98.56],
 'Court decision': [99.11, 99.77, 99.44],
 'European legal norm': [95.82, 97.54, 96.67],
 'Institution': [92.86, 93.85, 93.35],
 'Judge': [96.97, 98.6, 97.78],
 'Landscape': [68.0, 45.95, 54.84],
 'Law': [99.3, 99.51, 99.4],
 'Lawyer': [86.96, 76.92, 81.63],
 'Legal literature': [98.17, 96.03, 97.09],
 'Ordinance': [98.86, 97.86, 98.36],
 'Organization': [85.06, 91.39, 88.11],
 'Person': [91.01, 88.36, 89.66],
 'Regulation': [93.25, 94.7, 93.97],
 'Street': [82.0, 75.93, 78.85]}

Finally, save our model for later use.

In [37]:
# Only the weights (state_dict) are written; loading them back requires
# constructing the same model class first.
torch.save(model.state_dict(), "model.pt")