In [2]:
import json
from typing import List
from transformers import BertTokenizer, BertModel, pipeline
from scipy.spatial import distance

# Loading data

In [3]:
def read_json(file_path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to open.

    Returns:
        Whatever ``json.load`` produces for the document (list, dict, ...).
    """
    with open(file_path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [20]:
# Location of the alignment file, relative to the notebook's working directory.
# Forward slashes are used because they resolve on both Windows and POSIX
# systems; the previous backslash-separated literal only worked on Windows.
FILE_PATH = '../Data/aligned/20170207_test_alignment.json'
# Parsed JSON content; later passed to alignments_to_sentences as the alignments.
data = read_json(FILE_PATH)

# Dividing the alignments into sentences

In [21]:
def alignments_to_sentences(alignments: List[List[str]]):
    """Group token-level alignments into sentence-level pairs.

    Each alignment is indexed as ``alignment[0]`` (transcript token) and
    ``alignment[1]`` (gold-label token). A sentence is closed whenever the
    transcript token is ``'.'``, ``'!'`` or ``'?'``; the terminator itself is
    kept as the last token of the sentence.

    Tokens that follow the last terminator are returned as a final sentence
    rather than being silently discarded.

    Args:
        alignments: Sequence of ``[transcript_token, gold_label_token]`` pairs.

    Returns:
        A list of ``[transcript_tokens, gold_label_tokens]`` pairs, one per
        sentence, where both inner lists have the same length.
    """
    sentence_terminators = ('.', '!', '?')
    all_sentences = []
    new_sent_transcript, new_sent_gold_label = [], []
    for alignment in alignments:
        new_sent_transcript.append(alignment[0])
        new_sent_gold_label.append(alignment[1])
        # Only the transcript side decides where a sentence ends.
        if alignment[0] in sentence_terminators:
            all_sentences.append([new_sent_transcript, new_sent_gold_label])
            new_sent_transcript, new_sent_gold_label = [], []
    # Flush a trailing sentence that has no closing punctuation so that no
    # aligned tokens are lost.
    if new_sent_transcript:
        all_sentences.append([new_sent_transcript, new_sent_gold_label])
    return all_sentences

In [22]:
# Split the loaded alignments into [transcript_tokens, gold_label_tokens] pairs.
sentences = alignments_to_sentences(alignments=data)
# Spot check of one sentence pair.
# NOTE(review): the index 15 is hard-coded and raises IndexError when fewer
# than 16 sentences were produced.
print(sentences[15])

[['Ketil', 'kjenseth', 'vær', 'så', 'god', '.'], ['Ketil', 'Kjenseth', 'vær', 'så', 'god', '.']]


# Two approaches for SWER
## Approach 1
A function that receives a BERT tokenizer and model as input and returns embeddings

Challenges:
 - Cosine distance between similar words
 - What to do with [CLS] and [SEP] tokens

## Approach 2
A function that receives a Hugging Face feature-extraction pipeline as input

Challenges:
 - Normalize SWER(?)

In [12]:
def get_model_and_tokenizer(bert_model):
    """Load a pretrained BERT model together with its tokenizer.

    Args:
        bert_model: Identifier passed unchanged to both
            ``BertTokenizer.from_pretrained`` and ``BertModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple, with the model first.
    """
    loaded_tokenizer = BertTokenizer.from_pretrained(bert_model)
    loaded_model = BertModel.from_pretrained(bert_model)
    return loaded_model, loaded_tokenizer


def list_to_bert_embeddings(words_list, model, tokenizer):
    """Encode a list of words and return the model output for the first batch entry.

    Args:
        words_list: Words of one sentence, passed directly to ``tokenizer.encode``.
        model: Callable applied to the encoded ids; its result is indexed with
            ``[0]`` to obtain the embeddings.
        tokenizer: Object providing ``encode(words, return_tensors='pt')``.

    Returns:
        ``model(ids)[0][0, :, :]``, i.e. the first batch entry of the first
        model output.

    NOTE(review): the word list is handed to ``encode`` as-is. Confirm how the
    tokenizer treats list input (pre-tokenised tokens vs. raw text); words
    missing from the vocabulary may end up as unknown tokens.
    NOTE(review): the result presumably contains rows for the special tokens
    ([CLS]/[SEP]) in addition to the words - confirm before aligning rows with
    ``words_list``.
    """
    token_ids = tokenizer.encode(words_list, return_tensors='pt')
    model_output = model(token_ids)
    first_output = model_output[0]
    # Drop the batch dimension by selecting the only sequence in the batch.
    return first_output[0, :, :]


def wer(sent1: List[str], sent2: List[str], case_sensitive=True):
    """Word error rate between two position-aligned sentences.

    The sentences are compared token by token; the result is the number of
    differing positions divided by the sentence length.

    Args:
        sent1: Tokens of the first sentence (e.g. the transcript).
        sent2: Tokens of the second sentence (e.g. the gold label); must have
            the same length as ``sent1``.
        case_sensitive: When ``False`` tokens are lower-cased before being
            compared.

    Returns:
        The fraction of positions whose tokens differ, or ``0.0`` when both
        sentences are empty (previously a ``ZeroDivisionError``).

    Raises:
        AssertionError: If the sentences differ in length.
    """
    assert len(sent1) == len(sent2), 'sent1 and sent2 must have the same length'
    # No tokens means no errors; avoids dividing by zero below.
    if not sent1:
        return 0.0
    if case_sensitive:
        errors = sum(w1 != w2 for w1, w2 in zip(sent1, sent2))
    else:
        errors = sum(w1.lower() != w2.lower() for w1, w2 in zip(sent1, sent2))
    return errors / len(sent1)


def swer(pipe, sent1: List[str], sent2: List[str], case_sensitive=True):
    """Mean cosine distance between the embeddings of position-aligned words.

    Both word lists are passed to ``pipe``; for every position the cosine
    distance between ``entry[0][1]`` of the two results is accumulated and the
    total is divided by the sentence length.

    Args:
        pipe: Callable that takes a list of words and returns one entry per
            word. The notebook passes the object returned by
            ``pipeline('feature-extraction', ...)``. The parameter is left
            unannotated because ``pipeline`` is a factory function, not a type.
        sent1: Tokens of the first sentence (e.g. the transcript).
        sent2: Tokens of the second sentence (e.g. the gold label); must have
            the same length as ``sent1``.
        case_sensitive: When ``False``, word pairs that are equal after
            lower-casing contribute a distance of zero, mirroring ``wer``.
            The default keeps the previous behaviour.

    Returns:
        The average cosine distance per position, or ``0.0`` when both
        sentences are empty.

    Raises:
        AssertionError: If the sentences differ in length. Previously the
            extra tokens were silently ignored while still counting towards
            the divisor.
    """
    # TODO (from the original author): implement some kind of weight / factor?
    assert len(sent1) == len(sent2), 'sent1 and sent2 must have the same length'
    # Nothing to compare; also avoids calling the pipeline with an empty batch
    # and dividing by zero.
    if not sent1:
        return 0.0
    embeddings1, embeddings2 = pipe(sent1), pipe(sent2)
    cosine_distance = 0.0
    for w1, w2, e1, e2 in zip(sent1, sent2, embeddings1, embeddings2):
        if not case_sensitive and w1.lower() == w2.lower():
            # Same word up to casing: counts as no error.
            continue
        # NOTE(review): index [0][1] presumably selects the first token after
        # [CLS]; words split into several sub-tokens would then be represented
        # by their first piece only - confirm.
        cosine_distance += distance.cosine(e1[0][1], e2[0][1])
    return cosine_distance / len(sent1)

In [8]:
# Checkpoint identifier shared by both approaches.
MODEL_NAME = 'ltgoslo/norbert2'
# Approach 1: explicit model/tokenizer pair, consumed by list_to_bert_embeddings.
model, tokenizer = get_model_and_tokenizer(MODEL_NAME)
# Approach 2: feature-extraction pipeline, consumed by swer.
# NOTE(review): the checkpoint appears to be loaded a second time here (the
# "Some weights ... were not used" warning is printed twice in the cell
# output); consider whether the already loaded objects can be reused.
pipe = pipeline('feature-extraction', model=MODEL_NAME, tokenizer=MODEL_NAME)


Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.predictions.decode

In [18]:
# Hand-written [transcript, gold label] pairs used to compare SWER with WER.
example_sentences = [
    [
        ['Det', 'er', 'bra'],
        ['Det', 'var', 'bra'],
    ],
    [
        ['hei', 'på', 'deg'],
        ['hvor', 'er', 'statsministeren'],
    ],
    [
        ['i', 'går', 'regnet', 'det', 'ikke', 'i', 'bergen'],
        ['i', 'går', 'regnet', 'det', 'igjen', 'i', 'bergen'],
    ],
]

# Print both metrics for every pair, separated by a blank line.
for sent in example_sentences:
    transcript_words, gold_label_words = sent
    print('Transcript:', ' '.join(transcript_words))
    print('Gold label', ' '.join(gold_label_words))
    print('SWER:', swer(pipe, transcript_words, gold_label_words))
    print('WER:', wer(transcript_words, gold_label_words))
    print()

Transcript: Det er bra
Gold label Det var bra
SWER: 0.06289718300035559
WER: 0.3333333333333333

Transcript: hei på deg
Gold label hvor er statsministeren
SWER: 0.34909317233832926
WER: 1.0

Transcript: i går regnet det ikke i bergen
Gold label i går regnet det igjen i bergen
SWER: 0.04143750337862958
WER: 0.14285714285714285



In [146]:
# Two sentences that differ in a single word ('liker' vs. 'hater').
sent1 = ['Jeg', 'liker', 'været', 'i', 'Bergen', '.']
sent2 = ['Jeg', 'hater', 'været', 'i', 'Bergen', '.']

# Approach 1: embed each sentence with the explicit model/tokenizer pair.
example_sentence = [sent1, sent2]
embeddings_transcript = list_to_bert_embeddings(example_sentence[0], model, tokenizer)
embeddings_goldlabel = list_to_bert_embeddings(example_sentence[1], model, tokenizer)

# Print the cosine distance for every pair of rows at the same position.
# NOTE(review): the recorded output has 8 distances for 6 input words, so the
# first and last rows presumably belong to the [CLS] and [SEP] tokens - confirm
# and decide whether they should be excluded.
for e1, e2 in zip(embeddings_transcript, embeddings_goldlabel):
    # Detach from the autograd graph and convert to NumPy before calling SciPy.
    e1, e2 = e1.detach().numpy(), e2.detach().numpy()
    print(distance.cosine(e1, e2))

0.09794670343399048
0.07474690675735474
0.33774852752685547
0.09423786401748657
0.09536200761795044
0.04666769504547119
0.10574841499328613
0.04152554273605347
