In [16]:
import json
import os
import random
from typing import List

# HF_ENDPOINT has to be in the environment *before* transformers (and the
# huggingface_hub package it pulls in) is imported: those libraries read the
# variable once at import time, so assigning it afterwards has no effect.
os.environ["HF_ENDPOINT"] = "https://huggingface.co"

from scipy.spatial import distance
from transformers import BertTokenizer, BertModel, pipeline

# Loading data

In [3]:
def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

In [4]:
# Build the path with os.path.join so the notebook also runs on non-Windows
# systems (a literal '..\\Data\\...' is a single odd file name on POSIX).
FILE_PATH = os.path.join('..', 'Data', 'aligned', '20170207_test_alignment.json')
data = read_json(FILE_PATH)

# Dividing the alignments into sentences

In [5]:
def alignments_to_sentences(alignments: List[List[str]]):
    """Group aligned (transcript, gold label) token pairs into sentences.

    Each alignment is indexed as ``alignment[0]`` (transcript token) and
    ``alignment[1]`` (gold-label token). A sentence ends when the *transcript*
    token is one of ``.``, ``!`` or ``?``.

    Args:
        alignments: Sequence of two-element token pairs.

    Returns:
        A list of ``[transcript_tokens, gold_label_tokens]`` pairs, one per
        sentence. Tokens that follow the last sentence terminator are returned
        as a final (unterminated) sentence instead of being dropped.
    """
    sentence_terminators = ('.', '!', '?')
    all_sentences = []
    new_sent_transcript, new_sent_gold_label = [], []
    for alignment in alignments:
        new_sent_transcript.append(alignment[0])
        new_sent_gold_label.append(alignment[1])
        if alignment[0] in sentence_terminators:
            all_sentences.append([new_sent_transcript, new_sent_gold_label])
            new_sent_transcript, new_sent_gold_label = [], []
    # Flush whatever is left when the input does not end with a terminator;
    # otherwise those trailing tokens would silently disappear.
    if new_sent_transcript:
        all_sentences.append([new_sent_transcript, new_sent_gold_label])
    return all_sentences

In [6]:
# Split the loaded alignments into sentences and spot-check one of them.
sentences = alignments_to_sentences(data)
print(sentences[15])

[['Ketil', 'kjenseth', 'vær', 'så', 'god', '.'], ['Ketil', 'Kjenseth', 'vær', 'så', 'god', '.']]


# Two approaches for SWER
## Approach 1
Function that receives BERT tokenizer & model as input and returns embeddings

Challenges:
 - Cosine distance between similar words
 - What to do with [CLS] and [SEP] tokens

## Approach 2
Function that receives a Hugging Face `feature-extraction` pipeline as input

Challenges:
 - Normalize SWER(?)

In [46]:
def get_model_and_tokenizer(bert_model):
    """Load the BERT tokenizer and model identified by ``bert_model``.

    The tokenizer is loaded first, then the model; both come from
    ``from_pretrained(bert_model)``.

    Returns:
        A ``(model, tokenizer)`` tuple (note the order).
    """
    loaded_tokenizer = BertTokenizer.from_pretrained(bert_model)
    loaded_model = BertModel.from_pretrained(bert_model)
    return loaded_model, loaded_tokenizer


def list_to_bert_embeddings(words_list, model, tokenizer):
    """Encode ``words_list`` with ``tokenizer`` and run the result through ``model``.

    Returns the slice ``[0, :, :]`` of the first element of the model output,
    i.e. the first batch entry of that output.
    """
    token_ids = tokenizer.encode(words_list, return_tensors='pt')
    first_output = model(token_ids)[0]
    # Keep only the first (and, for a single encoded input, only) batch entry.
    return first_output[0, :, :]

def equalize_lists(list1, list2):
    """Return copies of both lists, right-padded with ``''`` to the same length.

    The inputs are never modified. Padding the caller's lists in place would
    permanently change shared data (for example the sentence lists that are
    scored repeatedly in this notebook).

    Args:
        list1: First sequence of tokens.
        list2: Second sequence of tokens.

    Returns:
        A ``(padded1, padded2)`` tuple of new lists whose lengths both equal
        ``max(len(list1), len(list2))``.
    """
    target_length = max(len(list1), len(list2))
    padded1 = list(list1) + [''] * (target_length - len(list1))
    padded2 = list(list2) + [''] * (target_length - len(list2))
    return padded1, padded2

def wer(sent1: List[str], sent2: List[str], case_sensitive=True):
    """Position-wise word error rate between two pre-aligned token lists.

    The two sentences are compared token by token at equal positions (no edit
    distance is computed). The shorter sentence is padded with ``''`` so that
    every surplus token of the longer one counts as an error.

    Args:
        sent1: Tokens of the first sentence.
        sent2: Tokens of the second sentence.
        case_sensitive: When False, tokens are lower-cased before comparison.

    Returns:
        Number of mismatching positions divided by the padded length, or
        ``0.0`` when both sentences are empty.
    """
    total = max(len(sent1), len(sent2))
    if total == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # Pad copies so the caller's lists are left untouched.
    words1 = list(sent1) + [''] * (total - len(sent1))
    words2 = list(sent2) + [''] * (total - len(sent2))

    if not case_sensitive:
        words1 = [word.lower() for word in words1]
        words2 = [word.lower() for word in words2]

    errors = sum(1 for w1, w2 in zip(words1, words2) if w1 != w2)
    return errors / total


def swer(pipe, sent1: List[str], sent2: List[str], case_insensitive=False):
    """Average cosine distance between position-aligned word embeddings.

    Args:
        pipe: Callable that maps a list of words to one nested embedding
            structure per word; in this notebook a Hugging Face
            feature-extraction pipeline object. (The ``pipeline`` name from
            transformers is a factory function, not a type, so it is not used
            as an annotation here.)
        sent1: Tokens of the first sentence.
        sent2: Tokens of the second sentence.
        case_insensitive: When True, tokens are lower-cased before embedding.

    Returns:
        Sum of the per-position cosine distances divided by the padded
        sentence length, or ``0.0`` when both sentences are empty.
    """
    # Implement some kind of weight / factor?
    total = max(len(sent1), len(sent2))
    if total == 0:
        # Nothing to embed; avoid calling the pipeline and dividing by zero.
        return 0.0

    # Pad copies with '' so the caller's lists are left untouched.
    words1 = list(sent1) + [''] * (total - len(sent1))
    words2 = list(sent2) + [''] * (total - len(sent2))

    if case_insensitive:
        words1 = [word.lower() for word in words1]
        words2 = [word.lower() for word in words2]

    embeddings1, embeddings2 = pipe(words1), pipe(words2)
    # e[0][1] selects the second vector of each word's output; presumably
    # index 0 is the [CLS] token, so this is the first word piece -- TODO confirm.
    cosine_distance = sum(
        distance.cosine(e1[0][1], e2[0][1])
        for e1, e2 in zip(embeddings1, embeddings2)
    )
    return cosine_distance / total

In [10]:
# Hugging Face model identifiers; each one is used for both the model and its tokenizer.
MODEL_NAMES = ['ltg/norbert', 'ltgoslo/norbert2', 'NbAiLab/nb-bert-base']

# One feature-extraction pipeline per identifier, built in list order.
norbert, norbert2, nbbert = [
    pipeline('feature-extraction', model=name, tokenizer=name)
    for name in MODEL_NAMES
]

Some weights of the model checkpoint at ltg/norbert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.predictions.transform.L

# Experimenting
## Trying the different models on random sentences

In [22]:
# One-element slice: a list holding only the sentence pair at index 19
# (a slice, so it yields an empty list rather than raising if index 19 is absent).
example_sentences = sentences[19:20]

def test_swer(pipe: pipeline, sentences):
    """Print SWER (cased and case-insensitive) and WER for every sentence pair.

    Each element of ``sentences`` is indexed as ``[0]`` (transcript tokens) and
    ``[1]`` (gold-label tokens). Nothing is returned; the results are printed.
    """
    for pair in sentences:
        transcript = pair[0]
        print('Transcript:', ' '.join(transcript))
        gold_label = pair[1]
        print('Gold label', ' '.join(gold_label))
        print('SWER:', swer(pipe, transcript, gold_label))
        print('Case-insensitive SWER', swer(pipe, transcript, gold_label, True))
        print('WER:', wer(transcript, gold_label))
        print()

# Score the example sentence(s) with each of the three pipelines in turn.
for model in (norbert, norbert2, nbbert):
    test_swer(model, example_sentences)

[[['Det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', ',', 'men', 'cirka', '1', '%', 'av', 'befolkningen', 'har', 'autisme', '.'], ['det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', '', 'men', 'cirka', 'én', 'prosent', 'av', 'befolkningen', 'har', 'autisme', '.']]]
[[['Det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', ',', 'men', 'cirka', '1', '%', 'av', 'befolkningen', 'har', 'autisme', '.'], ['det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', '', 'men', 'cirka', 'én', 'prosent', 'av', 'befolkningen', 'har', 'autisme', '.']]]
[[['Det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', ',', 'men', 'cirka', '1', '%', 'av', 'befolkningen', 'har', 'autisme', '.'], ['det', 'opereres', 'med', 'ulike', 'tall', 'på', 'forekomst', 'av', 'autisme', '', 'men', 'cirka', 'én', 'prosent', 'av', 'befolkningen', 'har', 'autisme', '.']]]


# Testing random sentences
To compare the magnitude of SWER to WER

In [49]:
def test_two_random_sents(n: int, pipe: pipeline, seed=None):
    """Score ``n`` random, unrelated sentence pairs with SWER and WER.

    For each side of a pair, a random entry of the global ``sentences`` list is
    drawn and then either its transcript or its gold label is picked at random.
    The resulting pairs are printed via ``test_swer``.

    Args:
        n: Number of random pairs to build.
        pipe: Feature-extraction pipeline forwarded to ``test_swer``.
        seed: Optional seed; when given, the same pairs are drawn on every
            call, which makes the printed comparison reproducible.
    """
    rng = random.Random(seed) if seed is not None else random
    sents = []
    for _ in range(n):
        sent1 = rng.choice(rng.choice(sentences))
        sent2 = rng.choice(rng.choice(sentences))
        # Pass copies so that scoring can never alter the global sentence data.
        sents.append([list(sent1), list(sent2)])
    test_swer(pipe, sents)

# The unconditional break means only the first pipeline (norbert) is exercised.
for model in (norbert, norbert2, nbbert):
    test_two_random_sents(n=3, pipe=model)
    break

Transcript: Pårørende interesseorganisasjoner , arbeidstakerorganisasjoner og fagmiljøet .
Gold label De regionale retningslinjer og annen materiell kan støtte klinisk praksis i kommuner og sykehus , og er tilgjengelig for alle i helsebiblioteket dott no .
SWER: 0.48754958670399
Case-insensitive SWER 0.4849737598449223
WER: 1.0

Transcript: Jeg ser jo noen punkter som .
Gold label og det 
SWER: 0.3516760622545088
Case-insensitive SWER 0.31533645679276257
WER: 1.0

Transcript:  Helse Sør - Øst har et TIOBA - nettverk som har etablert samarbeid med tilsvarende nettverk og fagmiljøer i de andre helseregionene .
Gold label og så er det en del mystikk knyttet til det òg sånn at kanskje det er kansj dette området her 
SWER: 0.43911258756057525
Case-insensitive SWER 0.4155948344847831
WER: 1.0

