In [1]:
import spacy
import pickle
import numpy as np

from datasets import load_from_disk, load_dataset
from datasets import load_metric

from collections import Counter
from datetime import datetime
from tqdm import tqdm

# Ask spaCy to use a GPU if one is available; done before the pipeline is loaded.
spacy.prefer_gpu()

# spaCy pipeline that every tokenisation step in this notebook goes through.
nlp = spacy.load("de_dep_news_trf")

# Raise the maximum text length (in characters) the pipeline will accept.
nlp.max_length = 17_000_000

In [3]:
# Prediction datasets previously saved to disk. Later cells read the
# "pred_str", "lm_str" / "lm_2_str", "detokenized" and "target_text" columns from them.
results51_full = load_from_disk("results51_test")
results51_cut = load_from_disk("results51_cut")
# Datasets under lm_kenlm_pred/ additionally carry an LM output column ("lm_str" or "lm_2_str").
results51_cut_full = load_from_disk("lm_kenlm_pred/lm_cut_full")
results51_full_full = load_from_disk("lm_kenlm_pred/lm_full_full")
results51_cut_cut = load_from_disk("lm_kenlm_pred/lm_cut_cut")

# Second and third variants of the full/full run; their LM output lives in "lm_2_str".
results51_full_full_2 = load_from_disk("lm_kenlm_pred/lm_full_full_2")
results51_full_full_3 = load_from_disk("lm_kenlm_pred/lm_full_full_3")

# Sentencepiece results; the WER cell reads the "detokenized" column of both datasets.
results54_piece = load_from_disk("results54_piece")
results54_piece_lm = load_from_disk("lm_kenlm_pred/lm_piece_full_full")

In [4]:
# Word-error-rate metric; used below via wer_metric.compute(predictions=..., references=...).
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package - confirm against the installed version before upgrading.
wer_metric = load_metric("wer")

In [5]:
def _print_wer(template, dataset, pred_column):
    """Print `template` filled in with the WER of one prediction column.

    template    -- format string with a single float placeholder
    dataset     -- dataset providing `pred_column` and "target_text"
    pred_column -- name of the column holding the predicted transcripts
    """
    score = wer_metric.compute(
        predictions=dataset[pred_column],
        references=dataset["target_text"],
    )
    print(template.format(score))


# Raw acoustic-model output.
_print_wer("Full Test WER: {:.3f}", results51_full, "pred_str")
_print_wer("Cut Test WER: {:.3f}", results51_cut, "pred_str")

# Each section prints the raw prediction first, then the LM output of the same dataset.
print("Cut --> Full:")
_print_wer("Cut full Test WER: {:.3f}", results51_cut_full, "pred_str")
_print_wer("Cut full LM Test WER: {:.3f}", results51_cut_full, "lm_str")

print("Full --> Full:")
_print_wer("Full full Test WER: {:.3f}", results51_full_full, "pred_str")
_print_wer("Full full LM Test WER: {:.3f}", results51_full_full, "lm_str")

print("Cut --> Cut:")
_print_wer("Cut cut Test WER: {:.3f}", results51_cut_cut, "pred_str")
_print_wer("Cut cut LM Test WER: {:.3f}", results51_cut_cut, "lm_str")

print("Full --> Full 2:")
_print_wer("Full full 2 Test WER: {:.3f}", results51_full_full_2, "pred_str")
_print_wer("Full full 2 LM Test WER: {:.3f}", results51_full_full_2, "lm_2_str")

print("Full --> Full 3:")
_print_wer("Full full 3 Test WER: {:.3f}", results51_full_full_3, "pred_str")
_print_wer("Full full 3 LM Test WER: {:.3f}", results51_full_full_3, "lm_2_str")

print("Sentencepiece:")
_print_wer("Sentencepiece Test WER: {:.3f}", results54_piece, "detokenized")
_print_wer("Sentencepiece LM Test WER: {:.3f}", results54_piece_lm, "detokenized")

Full Test WER: 0.147
Cut Test WER: 0.148
Cut --> Full:
Cut full Test WER: 0.148
Cut full LM Test WER: 0.134
Full --> Full:
Full full Test WER: 0.147
Full full LM Test WER: 0.132
Cut --> Cut:
Cut cut Test WER: 0.148
Cut cut LM Test WER: 0.256
Full --> Full 2:
Full full 2 Test WER: 0.147
Full full 2 LM Test WER: 0.129
Full --> Full 3:
Full full 3 Test WER: 0.147
Full full 3 LM Test WER: 0.126
Sentencepiece:
Sentencepiece Test WER: 0.142
Sentencepiece LM Test WER: 0.107


In [5]:
# Run the spaCy pipeline over every "lm_str" entry of results51_cut_full.
# Iterating the column directly ties the loop length to the dataset being processed
# and avoids materialising a full row for every index.
results51_cut_full_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results51_cut_full['lm_str'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results51_cut_full_trf.p", "wb") as out_file:
    pickle.dump(results51_cut_full_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [12:00<00:00, 21.63it/s]


In [8]:
# Run the spaCy pipeline over every "lm_str" entry of results51_full_full.
# The loop is driven by results51_full_full itself (it previously took its length from
# results51_cut_full, which would truncate or overrun if the two datasets ever differ in size).
results51_full_full_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results51_full_full['lm_str'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results51_full_full_trf.p", "wb") as out_file:
    pickle.dump(results51_full_full_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [11:53<00:00, 21.83it/s]


In [6]:
# Run the spaCy pipeline over every "lm_str" entry of results51_cut_cut.
results51_cut_cut_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results51_cut_cut['lm_str'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results51_cut_cut_trf.p", "wb") as out_file:
    pickle.dump(results51_cut_cut_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [11:54<00:00, 21.81it/s]


In [7]:
# Run the spaCy pipeline over every "lm_2_str" entry of results51_full_full_2.
# The loop is driven by results51_full_full_2 itself (it previously took its length from
# results51_cut_cut, which would truncate or overrun if the two datasets ever differ in size).
results51_full_full_2_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results51_full_full_2['lm_2_str'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results51_full_full_2_trf.p", "wb") as out_file:
    pickle.dump(results51_full_full_2_trf, out_file)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15588/15588 [12:14<00:00, 21.23it/s]


In [6]:
# Run the spaCy pipeline over every "lm_2_str" entry of results51_full_full_3.
# The loop is driven by results51_full_full_3 itself (it previously took its length from
# results51_cut_cut, which would truncate or overrun if the two datasets ever differ in size).
results51_full_full_3_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results51_full_full_3['lm_2_str'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results51_full_full_3_trf.p", "wb") as out_file:
    pickle.dump(results51_full_full_3_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [11:56<00:00, 21.74it/s]


In [6]:
# Run the spaCy pipeline over every "detokenized" entry of results54_piece.
results54_piece_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results54_piece['detokenized'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results54_piece_trf.p", "wb") as out_file:
    pickle.dump(results54_piece_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [10:55<00:00, 23.78it/s]
  0%|                                                                                        | 0/15588 [00:00<?, ?it/s]


KeyError: 'lm_2_str'

In [7]:
# Run the spaCy pipeline over every "detokenized" entry of results54_piece_lm.
results54_piece_lm_trf = [
    nlp(text, disable=['ner', 'parser'])
    for text in tqdm(results54_piece_lm['detokenized'])
]

# Cache the parsed docs; the context manager guarantees the file is flushed and closed.
with open("lm_data/results54_piece_lm_trf.p", "wb") as out_file:
    pickle.dump(results54_piece_lm_trf, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 15588/15588 [12:12<00:00, 21.28it/s]


In [8]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code - only load files produced locally.
    with open(path, "rb") as in_file:
        return pickle.load(in_file)


# Reload the cached spaCy docs so the analysis below can run without re-parsing.
#results51_full_trf = pickle.load(open("lm_res51/spacy_res/results51_full_trf.p", "rb"))
# NOTE(review): no visible cell in this notebook writes lm_data/results51_cut_trf.p -
# confirm where that cache is produced.
results51_cut_trf = _load_pickle("lm_data/results51_cut_trf.p")
results51_cut_full_trf = _load_pickle("lm_data/results51_cut_full_trf.p")
results51_full_full_trf = _load_pickle("lm_data/results51_full_full_trf.p")
results51_cut_cut_trf = _load_pickle("lm_data/results51_cut_cut_trf.p")
results51_full_full_2_trf = _load_pickle("lm_data/results51_full_full_2_trf.p")
results51_full_full_3_trf = _load_pickle("lm_data/results51_full_full_3_trf.p")

results54_piece_trf = _load_pickle("lm_data/results54_piece_trf.p")
results54_piece_lm_trf = _load_pickle("lm_data/results54_piece_lm_trf.p")

In [9]:
def _content_tokens(docs):
    """Return the texts of all tokens in `docs` that are neither stop words nor punctuation.

    docs -- sequence of parsed spaCy docs; a progress bar is shown while iterating.
    """
    tokens = []
    for doc in tqdm(docs):
        # extend() appends in place; rebuilding the list with `tokens = tokens + [...]`
        # copies everything collected so far on every iteration (quadratic overall).
        tokens.extend(
            token.text for token in doc if not token.is_stop and not token.is_punct
        )
    return tokens


# Disabled: results51_full_trf is not loaded in this notebook.
#test_tok_full = _content_tokens(results51_full_trf)

# Each list is built by iterating its own docs, so its length can no longer be taken
# from an unrelated collection.
test_tok_cut = _content_tokens(results51_cut_trf)
test_tok_cut_full = _content_tokens(results51_cut_full_trf)
test_tok_full_full = _content_tokens(results51_full_full_trf)
test_tok_cut_cut = _content_tokens(results51_cut_cut_trf)
test_tok_full_full_2 = _content_tokens(results51_full_full_2_trf)
test_tok_full_full_3 = _content_tokens(results51_full_full_3_trf)

tok_piece = _content_tokens(results54_piece_trf)
tok_piece_lm = _content_tokens(results54_piece_lm_trf)
  

100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:14<00:00, 1103.58it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:14<00:00, 1092.74it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:14<00:00, 1089.00it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:14<00:00, 1109.07it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:13<00:00, 1123.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:13<00:00, 1139.22it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:13<00:00, 1122.49it/s]
100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:13<00:00, 1147.02it/s]


In [14]:
# For every token list: print the total token count, build its frequency table,
# then print the number of distinct tokens. The *_freq Counters are reused by later cells.

# Label fixed: these are the tokens of the cut predictions (results51_cut_trf), not "full".
print("Result cut words: ", len(test_tok_cut))
test_tok_cut_freq = Counter(test_tok_cut)
print("Result cut words dict: ", len(test_tok_cut_freq))

# Label fixed: these are the tokens of the cut-full predictions (results51_cut_full_trf).
print("Result cut full words: ", len(test_tok_cut_full))
test_tok_cut_full_freq = Counter(test_tok_cut_full)
print("Result cut full words dict: ", len(test_tok_cut_full_freq))

print("Result full full words: ", len(test_tok_full_full))
test_tok_full_full_freq = Counter(test_tok_full_full)
print("Result full full words dict: ", len(test_tok_full_full_freq))

print("Result cut cut words: ", len(test_tok_cut_cut))
test_tok_cut_cut_freq = Counter(test_tok_cut_cut)
print("Result cut cut words dict: ", len(test_tok_cut_cut_freq))

print("Result full full 2 words: ", len(test_tok_full_full_2))
test_tok_full_full_2_freq = Counter(test_tok_full_full_2)
print("Result full full 2 words dict: ", len(test_tok_full_full_2_freq))

print("Result full full 3 words: ", len(test_tok_full_full_3))
test_tok_full_full_3_freq = Counter(test_tok_full_full_3)
print("Result full full 3 words dict: ", len(test_tok_full_full_3_freq))

print("Result piece words: ", len(tok_piece))
tok_piece_freq = Counter(tok_piece)
print("Result piece words dict: ", len(tok_piece_freq))

print("Result piece lm words: ", len(tok_piece_lm))
tok_piece_lm_freq = Counter(tok_piece_lm)
print("Result piece lm words dict: ", len(tok_piece_lm_freq))

Result full words:  68467
Result full words dict:  33810
Result cut words:  69977
Result cut words dict:  27273
Result full full words:  69913
Result full full words dict:  27363
Result cut cut words:  74653
Result cut cut words dict:  24362
Result full full 2 words:  69959
Result full full 2 words dict:  27128
Result full full 3 words:  68898
Result full full 3 words dict:  26992
Result piece words:  68996
Result piece words dict:  33369
Result piece lm words:  67065
Result piece lm words dict:  28169


In [10]:
# Load the cached spaCy docs of the test set; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load files produced locally.
with open("data_w/test_doc_trf.p", "rb") as in_file:
    test_doc_trf_loaded = pickle.load(in_file)

In [11]:
# Collect the text of every token in the test docs that is neither a stop word nor punctuation.
test_tok_text = []
for doc in tqdm(test_doc_trf_loaded):
    # extend() appends in place; `test_tok_text = test_tok_text + [...]` copied the whole
    # list on every iteration (quadratic overall).
    test_tok_text.extend(
        token.text for token in doc if not token.is_stop and not token.is_punct
    )

100%|███████████████████████████████████████████████████████████████████████████| 15588/15588 [00:16<00:00, 961.69it/s]


In [12]:
# Frequency table of the test tokens and a view of its distinct words;
# both are reused by the coverage cells further down.
test_words_freq = Counter(test_tok_text)
test_words_freq_keys = test_words_freq.keys()

# Total number of tokens, then number of distinct tokens.
print("Test words: ", len(test_tok_text))
print("Test words dict: ", len(test_words_freq))

Test words:  67545
Test words dict:  29553


In [13]:
# Fraction of distinct test words that occur at least once in the cut predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_cut_freq[vocab_word] >= 1
)
print("Dict: Test -> cut Pred: ", covered / len(test_words_freq))

Dict: Test -> cut Pred:  0.7110614827597875


In [14]:
# Fraction of distinct test words that occur at least once in the cut-full predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_cut_full_freq[vocab_word] >= 1
)
print("Dict: Test -> cut full Pred: ", covered / len(test_words_freq))

Dict: Test -> cut full Pred:  0.731905390315704


In [15]:
# Fraction of distinct test words that occur at least once in the full-full predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_full_full_freq[vocab_word] >= 1
)
print("Dict: Test -> full full Pred: ", covered / len(test_words_freq))

Dict: Test -> full full Pred:  0.734680066321524


In [16]:
# Fraction of distinct test words that occur at least once in the cut-cut predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_cut_cut_freq[vocab_word] >= 1
)
print("Dict: Test -> cut cut Pred: ", covered / len(test_words_freq))

Dict: Test -> cut cut Pred:  0.5406219334754508


In [17]:
# Fraction of distinct test words that occur at least once in the full-full-2 predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_full_full_2_freq[vocab_word] >= 1
)
print("Dict: Test -> full full 2 Pred: ", covered / len(test_words_freq))

Dict: Test -> full full 2 Pred:  0.7393834805265117


In [25]:
# Fraction of distinct test words that occur at least once in the full-full-3 predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if test_tok_full_full_3_freq[vocab_word] >= 1
)
print("Dict: Test -> full full 3 Pred: ", covered / len(test_words_freq))

Dict: Test -> full full 3 Pred:  0.7347477413460562


In [15]:
# Fraction of distinct test words that occur at least once in the sentencepiece predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if tok_piece_freq[vocab_word] >= 1
)
print("Dict: Test -> Piece Pred: ", covered / len(test_words_freq))

Dict: Test -> Piece Pred:  0.7180658477988698


In [16]:
# Fraction of distinct test words that occur at least once in the sentencepiece LM predictions.
covered = sum(
    1 for vocab_word in test_words_freq_keys if tok_piece_lm_freq[vocab_word] >= 1
)
print("Dict: Test -> Piece LM Pred: ", covered / len(test_words_freq))

Dict: Test -> Piece LM Pred:  0.7702094542009271


In [17]:
# Load the pickled word lists; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load files produced locally.
with open("data/test_train_set.p", "rb") as in_file:
    test_train_set = pickle.load(in_file)

# Keys of test_train_set for which the per-list coverage is reported below.
importantLists = [1, 2, 3, 4, 5, 10]

In [18]:
# Print the size of each list stored under keys 1 through 10 of test_train_set.
for list_key in range(1, 11):
    label = "Number of words " + str(list_key) + " : "
    print(label, len(test_train_set[list_key]))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [21]:
# For each selected list: its size, how many of its words occur in the cut
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_cut_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
1819
0.5622874806800618
List:  2
2239
1357
0.6060741402411791
List:  3
1538
987
0.6417425227568271
List:  4
1280
818
0.6390625
List:  5
1063
711
0.6688617121354656
List:  10
549
424
0.7723132969034608


In [22]:
# For each selected list: its size, how many of its words occur in the cut-full
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_cut_full_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
2697
0.833693972179289
List:  2
2239
1872
0.8360875390799464
List:  3
1538
1309
0.8511053315994799
List:  4
1280
1101
0.86015625
List:  5
1063
925
0.8701787394167451
List:  10
549
507
0.9234972677595629


In [23]:
# For each selected list: its size, how many of its words occur in the full-full
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_full_full_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
2709
0.837403400309119
List:  2
2239
1917
0.8561857972309067
List:  3
1538
1333
0.8667100130039012
List:  4
1280
1095
0.85546875
List:  5
1063
934
0.878645343367827
List:  10
549
504
0.9180327868852459


In [24]:
# For each selected list: its size, how many of its words occur in the cut-cut
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_cut_cut_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
1
0.0003091190108191654
List:  2
2239
1533
0.6846806610093792
List:  3
1538
1101
0.7158647594278283
List:  4
1280
906
0.7078125
List:  5
1063
786
0.7394167450611477
List:  10
549
430
0.7832422586520947


In [25]:
# For each selected list: its size, how many of its words occur in the full-full-2
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_full_full_2_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
2719
0.8404945904173107
List:  2
2239
1911
0.8535060294774452
List:  3
1538
1345
0.8745123537061118
List:  4
1280
1101
0.86015625
List:  5
1063
939
0.883349012229539
List:  10
549
504
0.9180327868852459


In [26]:
# For each selected list: its size, how many of its words occur in the full-full-3
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if test_tok_full_full_3_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
2671
0.8256568778979907
List:  2
2239
1880
0.8396605627512282
List:  3
1538
1319
0.8576072821846554
List:  4
1280
1098
0.8578125
List:  5
1063
931
0.8758231420507996
List:  10
549
503
0.9162112932604736


In [19]:
# For each selected list: its size, how many of its words occur in the sentencepiece
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if tok_piece_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
1859
0.5746522411128284
List:  2
2239
1408
0.6288521661456007
List:  3
1538
988
0.6423927178153446
List:  4
1280
847
0.66171875
List:  5
1063
738
0.6942615239887112
List:  10
549
440
0.8014571948998178


In [20]:
# For each selected list: its size, how many of its words occur in the sentencepiece LM
# predictions, and the resulting fraction.
for list_key in importantLists:
    print("List: ", list_key)
    list_words = test_train_set[list_key]
    print(len(list_words))
    hit_count = sum(1 for entry in list_words if tok_piece_lm_freq[entry] >= 1)
    print(hit_count)
    print(hit_count / len(list_words))

List:  1
3235
2151
0.6649149922720248
List:  2
2239
1624
0.7253238052702099
List:  3
1538
1155
0.7509752925877763
List:  4
1280
1006
0.7859375
List:  5
1063
870
0.8184383819379115
List:  10
549
490
0.8925318761384335


In [21]:
# Load the pickled word collection; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load files produced locally.
with open("data/test_only_words.p", "rb") as in_file:
    test_only_words = pickle.load(in_file)

In [27]:
# Size of test_only_words, how many of its words occur in the cut-full predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_cut_full_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
2
0.00034423407917383823


In [28]:
# Size of test_only_words, how many of its words occur in the full-full predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_full_full_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
1
0.00017211703958691912


In [29]:
# Size of test_only_words, how many of its words occur in the cut predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_cut_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
2802
0.4822719449225473


In [30]:
# Size of test_only_words, how many of its words occur in the cut-cut predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_cut_cut_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
2
0.00034423407917383823


In [31]:
# Size of test_only_words, how many of its words occur in the full-full-2 predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_full_full_2_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
86
0.014802065404475043


In [28]:
# Size of test_only_words, how many of its words occur in the full-full-3 predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if test_tok_full_full_3_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
117
0.020137693631669534


In [22]:
# Size of test_only_words, how many of its words occur in the sentencepiece predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if tok_piece_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
2811
0.48382099827882963


In [23]:
# Size of test_only_words, how many of its words occur in the sentencepiece LM predictions,
# and the resulting fraction.
n_words = len(test_only_words)
print(n_words)
n_found = sum(1 for candidate in test_only_words if tok_piece_lm_freq[candidate] >= 1)
print(n_found)

print((n_found / n_words))

5810
2431
0.41841652323580036
