In [2]:
import os
import json

# Display the current working directory; the relative paths used in later cells resolve against it.
os.getcwd()

'/beegfs/homes/rsari/notebooks'

In [3]:
# Section labels of the classifier, keyed by their numeric class id.
id2tag = {
    0: 'Anrede',
    1: 'Diagnosen',
    2: 'AllergienUnverträglichkeitenRisiken',
    3: 'Anamnese',
    4: 'Medikation',
    5: 'KUBefunde',
    6: 'Befunde',
    7: 'EchoBefunde',
    8: 'Zusammenfassung',
    9: 'Mix',
    10: 'Abschluss',
}
# Reverse lookup: label name -> class id.
tag2id = {name: idx for idx, name in id2tag.items()}

# Label names in the insertion order of id2tag.
labels = [name for name in id2tag.values()]

In [5]:
# Parse the per-label rows of the evaluation report into
#   scores[label] = {"pre": ..., "rec": ..., "f1": ..., "count": ...}
# All values are kept as the strings found in the file.
REPORT_START_LINE = 35884  # lines before this index are skipped — NOTE(review): presumably earlier log output; confirm

scores = {}
with open("../BertSeqCA.txt", "r") as f:
    lines = f.readlines()[REPORT_START_LINE:]

known_labels = set(labels)
for row in lines:
    fields = row.split()
    # Accept only rows of the form "<label> <pre> <rec> <f1> <count>" whose first field
    # is exactly a known label. A substring test would also match unrelated lines that
    # merely mention a label, and those would break the five-way unpacking.
    if len(fields) == 5 and fields[0] in known_labels:
        lab, pre, rec, f1, count = fields
        scores[lab] = {"pre": pre, "rec": rec, "f1": f1, "count": count}

In [6]:
from tabulate import tabulate

# One row per label with its parsed scores, ordered by F1-score (best first).
header = ["Label", "Precision", "Recall", "F1-score", "Test instance count"]
rows = [
    [lab, scores[lab]["pre"], scores[lab]["rec"], scores[lab]["f1"], scores[lab]["count"]]
    for lab in labels
]
# The scores are stored as strings, so convert to float for the comparison.
rows.sort(key=lambda row: float(row[3]), reverse=True)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                  Precision    Recall    F1-score    Test instance count
-----------------------------------  -----------  --------  ----------  ---------------------
Anrede                                      1         1           1                        99
Medikation                                  0.99      0.98        0.98                   1627
KUBefunde                                   0.99      0.97        0.98                   1105
Diagnosen                                   0.96      0.97        0.96                   1738
AllergienUnverträglichkeitenRisiken         0.97      0.94        0.96                    236
Abschluss                                   0.94      0.99        0.96                   2472
Befunde                                     0.93      0.86        0.9                    2519
Zusammenfassung                             0.9       0.9         0.9                    2138
Anamnese                                    0.9       0.81  

In [7]:
def takeSecond(elem):
    """Return the item at index 1 of ``elem``; used as a sort key for (label, value) pairs."""
    second = elem[1]
    return second

# Pair every label with its precision and rank the pairs from highest to lowest.
# The scores are stored as strings, so compare them as numbers rather than lexicographically.
sorted_pre = sorted(
    ((lab, metrics["pre"]) for lab, metrics in scores.items()),
    key=lambda pair: float(pair[1]),
    reverse=True,
)

print("Precision: (High FP: Other labels retrieved)")
sorted_pre

Precision: (High FP: Other labels retrieved)


[('Anrede', '1.00'),
 ('Medikation', '0.99'),
 ('KUBefunde', '0.99'),
 ('AllergienUnverträglichkeitenRisiken', '0.97'),
 ('Diagnosen', '0.96'),
 ('Abschluss', '0.94'),
 ('Befunde', '0.93'),
 ('Zusammenfassung', '0.90'),
 ('Anamnese', '0.90'),
 ('Mix', '0.76'),
 ('EchoBefunde', '0.60')]

In [8]:
# Pair every label with its recall and rank the pairs from highest to lowest.
# The scores are stored as strings, so compare them as numbers rather than lexicographically.
sorted_rec = sorted(
    ((lab, metrics["rec"]) for lab, metrics in scores.items()),
    key=lambda pair: float(pair[1]),
    reverse=True,
)

print("Recall: (High FN: Same label not retrieved)")
sorted_rec

Recall: (High FN: Same label not retrieved)


[('Anrede', '1.00'),
 ('Abschluss', '0.99'),
 ('Medikation', '0.98'),
 ('Diagnosen', '0.97'),
 ('KUBefunde', '0.97'),
 ('AllergienUnverträglichkeitenRisiken', '0.94'),
 ('EchoBefunde', '0.94'),
 ('Zusammenfassung', '0.90'),
 ('Befunde', '0.86'),
 ('Mix', '0.83'),
 ('Anamnese', '0.81')]

In [9]:
# Pair every label with its F1-score and rank the pairs from highest to lowest.
# The scores are stored as strings, so compare them as numbers rather than lexicographically.
sorted_f1 = sorted(
    ((lab, metrics["f1"]) for lab, metrics in scores.items()),
    key=lambda pair: float(pair[1]),
    reverse=True,
)

print("F1-Measure:")
sorted_f1

F1-Measure:


[('Anrede', '1.00'),
 ('Medikation', '0.98'),
 ('KUBefunde', '0.98'),
 ('Abschluss', '0.96'),
 ('Diagnosen', '0.96'),
 ('AllergienUnverträglichkeitenRisiken', '0.96'),
 ('Zusammenfassung', '0.90'),
 ('Befunde', '0.90'),
 ('Anamnese', '0.85'),
 ('Mix', '0.79'),
 ('EchoBefunde', '0.73')]

### Get Down to Low vs. High Precision

Approach:
1. Collect training-data statistics (e.g. vocabulary size, token count) for the lowest-scoring label (EchoBefunde).
    1. Compare them to those of the highest-scoring label (Anrede).

In [15]:
# Group the sentences of the TSV file by their section label.
# Each line is expected to be "<text>\t<label>" with a label from `labels`.
texts = {l: [] for l in labels}

with open("../doctoral_letters/MIEdeep/data/PETsectionclass/full/full_main.tsv", "r") as f:
    lines = f.readlines()

# `d` stays bound after the loop because another cell reads it.
for d, l in enumerate(lines):
    text, lab = l.strip().split("\t")
    texts[lab].append(text)

# Number of sentences read. The last enumerate index `d` is one less than the count,
# so take the length of the line list instead.
sent_count = len(lines)

In [22]:
from numpy import testing

# Share of all sentences that belongs to each label, largest first.
# Divide by sent_count (set when the TSV was loaded) rather than by the loop variable `d`,
# which another cell reuses as a loop index.
header = ["Label", "Sentence percentage"]
rows = [[lab, round(len(texts[lab]) / sent_count, 2)] for lab in labels]
rows.sort(key=lambda row: row[1], reverse=True)
# Sanity check: the rounded shares should add up to (almost) 1.
testing.assert_almost_equal(sum(row[1] for row in rows), 1, decimal=3)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                  Sentence percentage
-----------------------------------  ---------------------
Abschluss                                             0.18
Befunde                                               0.17
Zusammenfassung                                       0.17
Diagnosen                                             0.14
Medikation                                            0.11
Anamnese                                              0.08
KUBefunde                                             0.07
EchoBefunde                                           0.03
AllergienUnverträglichkeitenRisiken                   0.02
Mix                                                   0.02
Anrede                                                0.01


In [21]:
from transformers import AutoTokenizer
# Load the tokenizer from a local checkpoint directory (absolute, machine-specific path).
tokenizer = AutoTokenizer.from_pretrained("/beegfs/scratch/rsari/BertSeqCA/checkpoint-10000")

In [22]:
def percent(num1, num2):
    """Format the quotient ``num1 / num2`` as a percentage string with one decimal place."""
    share = num1 / num2
    return f"{share:.1%}"

In [23]:
# Per label: the flat list of all tokens of its sentences, and the set of distinct tokens.
tokens = {
    lab: [
        tok
        for sent in texts[lab]
        for tok in tokenizer.tokenize(sent, padding = True, truncation = True)
    ]
    for lab in labels
}
vocab = {lab: set(tokens[lab]) for lab in labels}

# Token count, vocabulary size and their ratio per label, ordered by token count.
header = ["Label", "Token count", "Vocab size", "Ratio"]
rows = [
    [lab, len(tokens[lab]), len(vocab[lab]), percent(len(vocab[lab]), len(tokens[lab]))]
    for lab in labels
]
rows.sort(key=lambda row: row[1], reverse=True)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                  Token count    Vocab size  Ratio
-----------------------------------  -------------  ------------  -------
Befunde                                     319795          7108  2.2%
Zusammenfassung                             258136          6602  2.6%
Diagnosen                                   222154          5560  2.5%
EchoBefunde                                  91260          2148  2.4%
Anamnese                                     87463          4282  4.9%
Medikation                                   82140          2189  2.7%
Abschluss                                    82102           630  0.8%
KUBefunde                                    49686          1381  2.8%
Anrede                                       26644           342  1.3%
AllergienUnverträglichkeitenRisiken          24239          1435  5.9%
Mix                                          21592          2071  9.6%


In [134]:
# For each label: repeatedly draw 10 random sentences and measure how similar they are.
# Similarity is the Jaccard overlap |A ∩ B| / |A ∪ B| between the token set of the first
# drawn sentence and the token set of each of the other drawn sentences.
import random
from statistics import mean

N_ROUNDS = 3      # sampling rounds per label
SAMPLE_SIZE = 10  # sentences drawn per round (random.choices draws with replacement)
rng = random.Random(42)  # fixed seed so that re-running the cell reproduces the table

unis = {}
for label in labels:
    overlaps = []
    for _ in range(N_ROUNDS):
        sample = rng.choices(texts[label], k=SAMPLE_SIZE)
        token_sets = [
            set(tokenizer.tokenize(s, padding = True, truncation = True)) for s in sample
        ]
        start = token_sets[0]
        for other in token_sets[1:]:
            uni_rat = len(other.intersection(start)) / len(other.union(start))
            overlaps.append(round(uni_rat, 2))
    # Mean overlap over all rounds of this label.
    unis[label] = mean(overlaps)

header = ["Label", "Data Homogeneity"]
rows = [[lab, "{:.0%}".format(unis[lab])] for lab in labels]
rows.sort(key=lambda row: int(row[1].strip("%")), reverse=True)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                Data Homogeneity
-----------------------------------  ------------------
Anrede                               69%
AllergienUnverträglichkeitenRisiken  19%
KUBefunde                            15%
Medikation                           13%
Diagnosen                            8%
EchoBefunde                          6%
Mix                                  5%
Abschluss                            5%
Zusammenfassung                      4%
Befunde                              3%
Anamnese                             2%


**Finding:**  
Although **Anrede** has as little training data as **Mix**, it scores high because of its small vocabulary (its introductory sentences are nearly identical) → low entropy. **Mix**, in contrast, has a comparatively large vocabulary for its size and is one of the lowest-scoring labels.

In [27]:
# Correlation of Token/Vocab size with Precision/Recall/F1-Measure
from scipy.stats import spearmanr
import numpy as np

# One value per label, all three lists in the same label order.
data1 = [len(tokens[lab]) for lab in labels]
data2 = [len(vocab[lab]) for lab in labels]

data3 = [float(scores[lab]["f1"]) for lab in labels]


def _rank_corr(xs, ys):
    """Spearman rank correlation coefficient of two equally long sequences."""
    rho, _pvalue = spearmanr(np.array(xs).flatten(), np.array(ys).flatten())
    return rho


corr_sanity = _rank_corr(data1, data2)
corr = _rank_corr(data2, data3)

print('Spearmans correlation of token to vocab size: %.3f and of vocab size to score: %.3f' % (corr_sanity, corr))

Spearmans correlation of token to vocab size: 0.818 and of vocab size to score: -0.465


In [28]:
from collections import Counter

# Relative frequency of every token within its label, most frequent first:
#   tok_freq[label] = [(token, "<share>%"), ...]
tok_freq = {}
for lab in labels:
    total = len(tokens[lab])
    # Count all tokens in a single pass instead of calling list.count() once per vocab
    # entry, and rank by the exact counts instead of the rounded percentage strings.
    tok_freq[lab] = [
        (tok, percent(cnt, total)) for tok, cnt in Counter(tokens[lab]).most_common()
    ]

# Show the five most frequent tokens per label.
header = ["Label", "Token freq"]
table = [header] + [[lab, tok_freq[lab][:5]] for lab in labels]

print(tabulate(table, headers="firstrow"))

# Persist the full distribution for later inspection.
with open("TokenDistrib.json", "w") as f:
    json.dump(tok_freq, f)

Label                                Token freq
-----------------------------------  --------------------------------------------------------------------------------
Anrede                               [('-', '10.9%'), ('B', '6.6%'), ('P', '4.5%'), ('I', '4.3%'), ('##D', '3.3%')]
Diagnosen                            [('-', '12.0%'), ('/', '2.7%'), ('.', '2.5%'), ('##R', '2.4%'), ('##B', '2.3%')]
AllergienUnverträglichkeitenRisiken  [('-', '6.1%'), (',', '4.6%'), (':', '4.0%'), ('##R', '2.8%'), ('##B', '2.7%')]
Anamnese                             [('-', '4.8%'), ('.', '4.8%'), ('##e', '2.0%'), (',', '1.6%'), ('##R', '1.3%')]
Medikation                           [('-', '14.1%'), ('0', '9.2%'), ('1', '6.5%'), ('mg', '3.2%'), ('##m', '1.7%')]
KUBefunde                            [(':', '7.8%'), ('.', '5.5%'), (',', '5.1%'), ('keine', '2.9%'), ('/', '2.0%')]
Befunde                              [('-', '5.2%'), ('.', '4.6%'), (',', '2.7%'), ('/', '2.2%'), (':', '1.6%')]
EchoBefunde        

#### Label EchoBefunde

In [29]:
# Occurrences of the token "Ech" in the EchoBefunde token list vs. the Anrede token list.
tokens["EchoBefunde"].count("Ech"), tokens["Anrede"].count("Ech") # surprising: in IG, "Ech" has a positive attribution score for Anrede

(441, 0)

**Question:**
How, then, does "Ech" get a positive attribution for Anrede?

#### Label Anrede

In [30]:
# NOTE(review): the two counts are for different tokens ("wurde" vs. "über") — confirm this is intended.
tokens["Anrede"].count("wurde"), tokens["EchoBefunde"].count("über") # explains positive attribution for "über" in "EchoBefunde"

(0, 18)

**See @ferret.ipynb**:

In [111]:
# Tokenize a probe string and select the label whose token statistics the following cells inspect.
sent = tokenizer.tokenize("I-PHONE")
label = tag2id["Abschluss"]

In [112]:
# Relative frequency of each probe token within the selected label's token list, highest first.
label_tokens = tokens[id2tag[label]]
freq_pairs = [(round(label_tokens.count(tok) / len(label_tokens), 4), tok) for tok in sent]
sorted(set(freq_pairs), key=lambda pair: pair[0], reverse=True)

[(0.1128, '-'),
 (0.044, 'I'),
 (0.0388, 'P'),
 (0.0343, '##E'),
 (0.0039, '##HO'),
 (0.0039, '##N')]

In [113]:
# For each probe token: the fraction of all its occurrences (across every label) that fall
# into the selected label, highest first.
distrib = []
for idx, label_tokens in enumerate(tokens.values()):
    if idx != label:
        continue
    for tok in sent:
        occurrences_everywhere = sum([li.count(tok) for li in tokens.values()])
        distrib.append((round(label_tokens.count(tok) / occurrences_everywhere, 2), tok, id2tag[idx]))
sorted(set(distrib), key=lambda entry: entry[0], reverse=True)

[(0.52, 'I', 'Abschluss'),
 (0.51, '##HO', 'Abschluss'),
 (0.43, '##E', 'Abschluss'),
 (0.4, 'P', 'Abschluss'),
 (0.33, '##N', 'Abschluss'),
 (0.1, '-', 'Abschluss')]

In [115]:
import numpy as np


def _best_label(t):
    """Label whose share of all occurrences of ``t``, divided by that label's token count, is largest."""
    occurrences_everywhere = sum([li.count(t) for li in tokens.values()])
    weighted = [
        (l.count(t) / occurrences_everywhere) / len(tokens[id2tag[i]])
        for i, l in enumerate(tokens.values())
    ]
    return id2tag[np.argmax(weighted)]


set([(_best_label(t), t) for t in sent])

{('Abschluss', '##E'),
 ('Abschluss', '##HO'),
 ('Abschluss', '##N'),
 ('Abschluss', 'I'),
 ('Anrede', 'P'),
 ('Medikation', '-')}

In [24]:
# Total number of tokens collected for the Anrede label.
len(tokens["Anrede"])

26644

In [238]:
# Occurrences of the token "Ech" per label, in the iteration order of the tokens dict.
[l.count("Ech") for l in tokens.values()]

[0, 229, 0, 11, 0, 0, 123, 441, 350, 20, 1]

In [43]:
# Tokens of each label's vocabulary that occur in no other label's vocabulary.
uniques = {}
for lab in labels:
    # Union of all other vocabularies as a set, so each membership test is a hash lookup
    # instead of a scan over one long concatenated list.
    other_vocab = set().union(*(v for k, v in vocab.items() if k != lab))
    uniques[lab] = [tok for tok in vocab[lab] if tok not in other_vocab]


def ratio(x):
    """Percentage of label ``x``'s vocabulary that is unique to that label."""
    return (len(uniques[x]) / (len(vocab[x]))) * 100


# Vocabulary size, number of unique tokens and their ratio, ordered by the rounded ratio.
header = ["Label","Vocab size","Amount unique tokens","Ratio"]
rows = [[lab, len(vocab[lab]), len(uniques[lab]), f"{round(ratio(lab))}%"] for lab in labels]
rows.sort(key=lambda row: int(row[3][:-1]), reverse=True)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                  Vocab size    Amount unique tokens  Ratio
-----------------------------------  ------------  ----------------------  -------
Befunde                                      7108                    1005  14%
Zusammenfassung                              6602                     704  11%
Anrede                                        342                      35  10%
Anamnese                                     4282                     412  10%
Diagnosen                                    5560                     429  8%
Medikation                                   2189                     134  6%
AllergienUnverträglichkeitenRisiken          1435                      63  4%
KUBefunde                                    1381                      31  2%
EchoBefunde                                  2148                      44  2%
Mix                                          2071                      33  2%
Abschluss                                     630   

**EchoBefunde and Mix (the lowest-scoring labels) are among the labels with the largest share of shared vocabulary (relative to their vocab size).**

In [313]:
# shared = {"Anrede": [tok1, tok2, ...]}   tokens of the label's vocab that also occur in another label
# rank   = {"Anrede": {"Befunde": "<share>%", ...}}   share of the label's vocab that it has in common with each other label

shared = {}
rank = {}

for lab in labels:
    unique_toks = set(uniques[lab])  # set lookup instead of scanning the list for every token
    shared[lab] = [tok for tok in vocab[lab] if tok not in unique_toks]
    # Every vocab entry must be either shared or unique.
    assert len(shared[lab]) + len(uniques[lab]) == len(vocab[lab])
    rank[lab] = {
        other_lab: percent(sum(1 for tok in shared[lab] if tok in other_vocab), len(vocab[lab]))
        for other_lab, other_vocab in vocab.items()
        if other_lab != lab
    }

# Per label: the five other labels it shares the largest part of its vocabulary with.
header = ["Label", "Vocab size", "Labels with highest amount of shared vocab"]
rows = [
    [lab, len(vocab[lab]), sorted(rank[lab].items(), key=lambda a: float(a[1][:-1]), reverse=True)[:5]]
    for lab in labels
]
table = [header] + rows

print(tabulate(table, headers="firstrow"))

def get_ratio(vocab_size, shared_amount):
    """Return ``shared_amount`` as a whole-number percentage of ``vocab_size``."""
    fraction = shared_amount / vocab_size
    return round(fraction * 100)

Label                                  Vocab size  Labels with highest amount of shared vocab
-----------------------------------  ------------  ----------------------------------------------------------------------------------------------------------------------
Anrede                                        342  [('Diagnosen', '85%'), ('Befunde', '79%'), ('Zusammenfassung', '74%'), ('Anamnese', '66%'), ('EchoBefunde', '63%')]
Diagnosen                                    5560  [('Befunde', '82%'), ('Zusammenfassung', '77%'), ('Anamnese', '52%'), ('EchoBefunde', '33%'), ('Medikation', '32%')]
AllergienUnverträglichkeitenRisiken          1435  [('Befunde', '87%'), ('Zusammenfassung', '85%'), ('Diagnosen', '84%'), ('Anamnese', '72%'), ('Medikation', '54%')]
Anamnese                                     4282  [('Zusammenfassung', '80%'), ('Befunde', '79%'), ('Diagnosen', '68%'), ('Mix', '35%'), ('EchoBefunde', '34%')]
Medikation                                   2189  [('Befunde', '83%'), (

In [314]:
# The labels that others share the most vocabulary with are Befunde, Zusammenfassung and Diagnosen, i.e. the labels with the largest vocab and token counts.

In [None]:
# Bucket each line of "<label>.txt" by the share of its whitespace-separated words that occur
# in the label's shared vocabulary. The bucket keys are upper bounds in percent:
# 0 = no shared word, 25 = up to 25%, 50 = up to 50%, 75 = up to 75%, 100 = more than 75%.
# NOTE(review): a later cell writes "<label>.txt" with a tab-separated bucket suffix per
# line — confirm which version of these files this cell is meant to read.
unq_sents = {l:{0:[], 25:[], 50:[], 75:[], 100:[]} for l in labels}

for l in labels:
    shared_set = set(shared[l])  # hash lookup instead of scanning the list for every word
    # The context manager closes the file even if processing a line raises.
    with open(f"{l}.txt", "r") as file:
        lines = file.readlines()
    for line in lines:
        line = line.strip()
        words = line.split()
        if not words:
            # A blank line has no words, so no ratio can be computed for it.
            continue
        shared_length = sum(1 for s in words if s in shared_set)
        ratio = shared_length / len(words)
        if ratio == 0:
            unq_sents[l][0].append(line)
        elif ratio <= 0.25:
            unq_sents[l][25].append(line)
        elif ratio <= 0.5:
            unq_sents[l][50].append(line)
        elif ratio <= 0.75:
            unq_sents[l][75].append(line)
        else:
            unq_sents[l][100].append(line)

In [284]:
# One row per label: the number of sentences in each shared-ratio bucket, in bucket order.
bucket_counts = {
    lab: [len(bucket) for bucket in unq_sents[lab].values()]
    for lab in labels
}
table = [["Label", "Shared token ratio sentence count"]]
table.extend([lab, bucket_counts[lab]] for lab in labels)

print(tabulate(table, headers="firstrow"))

Label                                Shared token ratio sentence count
-----------------------------------  -----------------------------------
Anrede                               [1, 1, 197, 203, 0]
Diagnosen                            [1207, 2640, 3619, 500, 57]
AllergienUnverträglichkeitenRisiken  [361, 244, 280, 140, 6]
Anamnese                             [767, 133, 1681, 1608, 429]
Medikation                           [1752, 814, 2985, 491, 106]
KUBefunde                            [1218, 805, 1638, 493, 40]
Befunde                              [2391, 2343, 3237, 1310, 355]
EchoBefunde                          [349, 359, 671, 130, 57]
Zusammenfassung                      [994, 118, 2419, 5300, 713]
Mix                                  [237, 30, 314, 327, 37]
Abschluss                            [6493, 550, 948, 1094, 867]


In [285]:
# Write to files:
# one "<label>.txt" per label, one line per sentence in the form "<sentence>\t<bucket>".

for l in labels:
    with open(f"{l}.txt", "w") as f:
        for n in [0,25,50,75,100]:
            for s in unq_sents[l][n]:
                f.write(f"{s}\t{n}\n")

#### Average Sentence Length

In [29]:
from statistics import mean

# Average number of tokens per sentence for each label, rounded to one decimal place.
avg_len = {
    lab: round(mean([len(tokenizer.tokenize(sent)) for sent in texts[lab]]), 1)
    for lab in labels
}

header = ["Label", "Average Sentence Length", "Token Size", "Vocab size", "Vocab-Token Ratio"]
rows = [
    [lab, avg_len[lab], len(tokens[lab]), len(vocab[lab]), percent(len(vocab[lab]), len(tokens[lab]))]
    for lab in labels
]
# Sort on the average itself. Truncating it to int would make e.g. 27.0 and 27.7 compare
# equal and leave their order to the order of `labels`.
rows.sort(key=lambda row: row[1], reverse=True)
table = [header] + rows

print(tabulate(table, headers="firstrow"))

Label                                  Average Sentence Length    Token Size    Vocab size  Vocab-Token Ratio
-----------------------------------  -------------------------  ------------  ------------  -------------------
Anrede                                                    66.3         26644           342  1.3%
EchoBefunde                                               58.3         91260          2148  2.4%
Befunde                                                   33.2        319918          7112  2.2%
Diagnosen                                                 27.7        222154          5560  2.5%
Zusammenfassung                                           27          258136          6602  2.6%
AllergienUnverträglichkeitenRisiken                       23.5         24239          1435  5.9%
Mix                                                       22.8         21592          2071  9.6%
Anamnese                                                  18.9         87463          4282  4.9%
Me