In [None]:
from datasets import load_from_disk, load_dataset

common_voice_train = load_dataset("common_voice", "de", split="train", cache_dir="D:\Master\wsl\data")
common_voice_validation = load_dataset("common_voice", "de", split="validation", cache_dir="D:\Master\wsl\data")
common_voice_test = load_dataset("common_voice", "de", split="test", cache_dir="D:\Master\wsl\data")

In [None]:
common_voice_train = common_voice_train.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_validation = common_voice_validation.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

# Preprocess Text 

In [None]:
import re
chars_to_keep = '[^A-Za-zäüöß ]+'

def remove_special_characters_chris(batch):
    batch["sentence"] = re.sub(chars_to_keep, '', batch["sentence"]).lower() + " "
    return batch

In [None]:
common_voice_train = common_voice_train.map(remove_special_characters_chris)
common_voice_validation = common_voice_validation.map(remove_special_characters_chris)
common_voice_test = common_voice_test.map(remove_special_characters_chris)

In [None]:
common_voice_validation.save_to_disk("E:/Master/data/val_text")
common_voice_train.save_to_disk("E:/Master/data/train_text")
common_voice_test.save_to_disk("E:/Master/data/test_text")

# Load Text Dataset

In [1]:
from datasets import load_from_disk, load_dataset

train_text_dataset = load_from_disk("E:/Master/data/0_text/train_text")
val_text_dataset = load_from_disk("E:/Master/data/0_text/val_text")
test_text_dataset = load_from_disk("E:/Master/data/0_text/test_text")

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy[cuda112]

In [None]:
!python -m spacy download en_core_web_trf
!python -m spacy download de_dep_news_trf

In [None]:
!pip install spacy[transformers,cuda112]

In [None]:
!python -m spacy download de_core_news_sm

In [2]:
import spacy

spacy.prefer_gpu()

True

# Load Spacy

In [2]:
import spacy

#spacy.prefer_gpu()
nlp = spacy.load("de_core_news_sm")
nlp.max_length = 17000000

In [3]:
train_complete_text = ""
val_complete_text = ""
test_complete_text = ""

In [4]:
print(train_text_dataset.shape[0])
print(val_text_dataset.shape[0])
print(test_text_dataset.shape[0])

246525
15588
15588


## Convert Datasets to String

In [5]:
from tqdm import tqdm

for i in tqdm(range(train_text_dataset.shape[0])):
    train_complete_text = train_complete_text + train_text_dataset[i]['sentence']

for i in tqdm(range(val_text_dataset.shape[0])):
    val_complete_text = val_complete_text + val_text_dataset[i]['sentence']

for i in tqdm(range(test_text_dataset.shape[0])):
    test_complete_text = test_complete_text + test_text_dataset[i]['sentence']

100%|███████████████████████████████████████████████████████████████████████| 246525/246525 [00:13<00:00, 18065.69it/s]
100%|█████████████████████████████████████████████████████████████████████████| 15588/15588 [00:00<00:00, 34183.79it/s]
100%|█████████████████████████████████████████████████████████████████████████| 15588/15588 [00:00<00:00, 34259.35it/s]


In [6]:
print(len(train_complete_text))
print(len(val_complete_text))
print(len(test_complete_text))

16656802
997429
976676


# Process Datasets with Spacy

In [6]:
from datetime import datetime

now = datetime.now()
print("start process train @ ", now)

train_doc = nlp(train_complete_text, disable = ['ner', 'parser'])

now = datetime.now()
print("End process train @ ", now)

start process train @  2021-05-18 14:39:00.132384
End process train @  2021-05-18 14:40:54.267330


In [7]:
from datetime import datetime

now = datetime.now()
print("start process val @ ", now)

val_doc = nlp(val_complete_text, disable = ['ner', 'parser'])

now = datetime.now()
print("End process val @ ", now)

start process val @  2021-05-18 14:41:52.312361
End process val @  2021-05-18 14:41:58.565515


In [8]:
now = datetime.now()
print("start process test @ ", now)

test_doc = nlp(test_complete_text, disable = ['ner', 'parser'])

now = datetime.now()
print("End process test @ ", now)

start process test @  2021-05-18 14:42:02.360915
End process test @  2021-05-18 14:42:08.361938


# Analyse train Data

In [83]:
# all tokens that arent stop words or punctuations
train_all = [token.text
         for token in train_doc
         if not token.is_punct]

# all tokens that arent stop words or punctuations
train_words = [token.text
         for token in train_doc
         if not token.is_stop and not token.is_punct]

# noun tokens that arent stop words or punctuations
train_nouns = [token.text
         for token in train_doc
         if (not token.is_stop and
             not token.is_punct and
             token.pos_ == "NOUN")]

In [121]:
for word in train_words:
    if(word == "ausgebeutet"):
        print("found!")

found!
found!
found!
found!
found!
found!
found!
found!
found!
found!


In [90]:
from collections import Counter

print("train all: ", len(train_all))
train_all_freq = Counter(train_all)
print("train words dict: ", len(train_all_freq))

print("train words: ", len(train_words))
train_words_freq = Counter(train_words)
print("train words dict: ", len(train_words_freq))

print("train nouns: ", len(train_nouns))
train_nouns_freq = Counter(train_nouns)
print("train nouns dict: ", len(train_nouns_freq))

train all:  2350690
train words dict:  168237
train words:  1138469
train words dict:  167707
train nouns:  143335
train nouns dict:  26890


### Lemma nouns test:

In [91]:
# noun tokens that arent stop words or punctuations
train_nouns_lemma = [token.lemma_
         for token in train_doc
         if (not token.is_stop and
             not token.is_punct and
             token.pos_ == "NOUN")]

In [95]:
print("Lemma Noun Words:", len(train_nouns_lemma))

train_nouns_lemma_freq = Counter(train_nouns_lemma)

print("Lemma Noun freq Words", len(train_nouns_lemma_freq))

Lemma Words: 143335
Lemma freq Words 26758


In [101]:
train_nouns_lemma_freq_keys = train_nouns_lemma_freq.keys()

n = 11

for i in range(1,n):
    
    rare_lemma_words = []
    for word in train_nouns_lemma_freq_keys:
        
        if train_nouns_lemma_freq[word] <= i:
            rare_lemma_words.append(word)
        
    print("Number of rare Lemma words " + str(i) + " : ", len(rare_lemma_words))

Number of rare Lemma words 1 :  17082
Number of rare Lemma words 2 :  20678
Number of rare Lemma words 3 :  22212
Number of rare Lemma words 4 :  23109
Number of rare Lemma words 5 :  23670
Number of rare Lemma words 6 :  24042
Number of rare Lemma words 7 :  24333
Number of rare Lemma words 8 :  24578
Number of rare Lemma words 9 :  24782
Number of rare Lemma words 10 :  24939


In [108]:
train_rare_lemma_words = []
for word in train_nouns_lemma_freq_keys:
    if train_nouns_lemma_freq[word] <= 1:
        train_rare_lemma_words.append(word)

print(train_rare_lemma_words[5:100])

['schiebers', 'sturmes', 'piscina', 'tankdeckels', 'fertigungsanlage', 'kopie', 'tauernfensters', 'staatsuniversität', 'modellautos', 'hufen', 'rennleitung', 'schutzumschlag', 'highschoolzeit', 'kontrabassisten', 'homiletik', 'heiliggeistordens', 'beifall', 'geschichtspolitik', 'instandsetzungstruppe', 'merowingern', 'austreiben', 'sesklokultur', 'ovale', 'trajektorie', 'umsatzbeteiligung', 'gesellschafterversammlung', 'wood', 'trialog', 'granada', 'bildpunkt', 'einsatzfall', 'zarismus', 'feudalismus', 'realschulempfehlung', 'boxsack', 'versuchsanordnung', 'christin', 'xte', 'matheprüfung', 'weihnachtsbaumdekoration', 'modelltheorie', 'böhme', 'unterkieferknochen', 'ormandy', 'cholerakonferenz', 'arbeitsbewertung', 'handymusik', 'komponist', 'audiodaten', 'barchfeld', 'antonio', 'adour', 'humorvoll', 'master', 'freileitung', 'gesundheitsschutzniveau', 'ausgabenprogramm', 'schiffsunglück', 'friedenszeiten', 'mennonitengemeinde', 'leitungsfunktion', 'tritt', 'sauna', 'stiftungsprofessur'

### lemma words test:

In [136]:
train_words_lemma = [token.lemma_
         for token in train_doc
         if (not token.is_stop and
             not token.is_punct)]

print("Lemma Words:", len(train_words_lemma))
train_words_lemma_freq = Counter(train_words_lemma)
print("Lemma freq Words", len(train_words_lemma_freq))

Lemma Words: 1138469
Lemma freq Words 151687


In [135]:
for word in train_words:
    if word == "rutschte":
        print("found!")

train_words_freq['rutschte']

found!
found!
found!


3

In [138]:
train_words_lemma_freq_keys = train_words_lemma_freq.keys()

n = 11

for i in range(1,n):
    
    train_rare_words = []
    for word in train_words_lemma_freq_keys:
        
        if train_words_lemma_freq[word] <= i:
            train_rare_words.append(word)
        
    print("Number of rare Lemma words " + str(i) + " : ", len(train_rare_words))

Number of rare Lemma words 1 :  84290
Number of rare Lemma words 2 :  108559
Number of rare Lemma words 3 :  119610
Number of rare Lemma words 4 :  125815
Number of rare Lemma words 5 :  129823
Number of rare Lemma words 6 :  132562
Number of rare Lemma words 7 :  134655
Number of rare Lemma words 8 :  136284
Number of rare Lemma words 9 :  137614
Number of rare Lemma words 10 :  138702


In [126]:
train_rare_words = []
for word in train_words_freq_keys:
    if train_words_freq[word] <= 1:
        train_rare_words.append(word)

print(train_rare_words[5:100])

['alfried', 'abrakadabra', 'abdrücken', 'jadebusen', 'rohstoffknappheit', 'privatkraftwagen', 'indoarischen', 'schiebers', 'kartenleser', 'juniorengrandprixfinale', 'platzanweiser', 'sturmes', 'abkauft', 'fehlerkorrekturen', 'franzoseneinfall', 'knollig', 'obie', 'maissorten', 'piscina', 'landtagswahlkreis', 'turatta', 'mitanni', 'lscheich', 'geldkoffer', 'stahlfelgen', 'rutschen', 'tankdeckels', 'gerst', 'bestreicht', 'brötchenhälfte', 'ausgebeuteten', 'ausbeuter', 'mörderisch', 'verhandlungssache', 'luftkraftstoffgemisch', 'fertigungsanlage', 'emile', 'gomer', 'nassauidenstein', 'logoireihe', 'hereinholen', 'dünenstinkmorchel', 'bellman', 'cereus', 'tauernbach', 'schiefers', 'tauernfensters', 'gerichtsakten', 'ostwestfalens', 'funston', 'bambergs', 'czapski', 'staatsuniversität', 'vallaster', 'gifsuryvette', 'tuning', 'knolls', 'nauroth', 'zeitungswissenschaft', 'dietenhofen', 'armenisch', 'kunstgewerbemuseums', 'militärwaffen', 'lgötzen', 'kinderdentist', 'hilfst', 'zeigegeste', 'ko

## train words 

In [139]:
train_words_freq_keys = train_words_freq.keys()

n = 11

for i in range(0,n):
    
    rare_words = []
    for word in train_words_freq_keys:
        
        if train_words_freq[word] <= i:
            rare_words.append(word)
        
    print("Number of rare words " + str(i) + " : ", len(rare_words))

Number of rare words 0 :  0
Number of rare words 1 :  91121
Number of rare words 2 :  118265
Number of rare words 3 :  130694
Number of rare words 4 :  137805
Number of rare words 5 :  142409
Number of rare words 6 :  145643
Number of rare words 7 :  148076
Number of rare words 8 :  149990
Number of rare words 9 :  151534
Number of rare words 10 :  152818


In [141]:
train_rare_words = []
for word in train_words_freq_keys:
    if train_words_freq[word] <= 1:
        train_rare_words.append(word)
        
print(len(train_rare_words))
print(train_rare_words[:100])

91121
['drannehmen', 'marktverständnis', 'heilbäder', 'bildbearbeitungsprogramme', 'muttersohnkonflikt', 'alfried', 'abrakadabra', 'abdrücken', 'jadebusen', 'rohstoffknappheit', 'privatkraftwagen', 'indoarischen', 'schiebers', 'kartenleser', 'juniorengrandprixfinale', 'platzanweiser', 'sturmes', 'abkauft', 'fehlerkorrekturen', 'franzoseneinfall', 'knollig', 'obie', 'maissorten', 'piscina', 'landtagswahlkreis', 'turatta', 'mitanni', 'lscheich', 'geldkoffer', 'stahlfelgen', 'rutschen', 'tankdeckels', 'gerst', 'bestreicht', 'brötchenhälfte', 'ausgebeuteten', 'ausbeuter', 'mörderisch', 'verhandlungssache', 'luftkraftstoffgemisch', 'fertigungsanlage', 'emile', 'gomer', 'nassauidenstein', 'logoireihe', 'hereinholen', 'dünenstinkmorchel', 'bellman', 'cereus', 'tauernbach', 'schiefers', 'tauernfensters', 'gerichtsakten', 'ostwestfalens', 'funston', 'bambergs', 'czapski', 'staatsuniversität', 'vallaster', 'gifsuryvette', 'tuning', 'knolls', 'nauroth', 'zeitungswissenschaft', 'dietenhofen', 'arm

## Stem experiments

In [142]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("german")

train_rare_stem_words = []

for word in train_rare_words:
    train_rare_stem_words.append(stemmer.stem(word))

print("Rare Stem Words:", len(train_rare_stem_words))

train_rare_stem_words_freq = Counter(train_rare_stem_words)

print("Rare stem Words freq", len(train_rare_stem_words_freq))

Rare Stem Words: 91121
Rare stem Words freq 83273


In [148]:
train_rare_stem_words_2 = []

train_rare_stem_words_freq_keys = train_rare_stem_words_freq.keys()

for word in train_rare_stem_words_freq:
    if train_rare_stem_words_freq[word] <= 1:
        train_rare_stem_words_2.append(word)
        
print(train_rare_stem_words_2[:100])

['drannehm', 'marktverstandnis', 'heilbad', 'muttersohnkonflikt', 'alfried', 'abrakadabra', 'jadebus', 'rohstoffknapp', 'privatkraftwag', 'indoar', 'schieb', 'kartenles', 'juniorengrandprixfinal', 'platzanweis', 'abkauft', 'fehlerkorrektur', 'franzoseneinfall', 'knollig', 'obi', 'maissort', 'piscina', 'landtagswahlkreis', 'turatta', 'mitanni', 'lscheich', 'geldkoff', 'stahlfelg', 'tankdeckel', 'gerst', 'bestreicht', 'brotchenhalft', 'ausgebeutet', 'ausbeut', 'morder', 'verhandlungssach', 'luftkraftstoffgem', 'emil', 'gom', 'nassauidenstein', 'logoireih', 'hereinhol', 'dunenstinkmorchel', 'bellman', 'cereus', 'tauernbach', 'tauernfenst', 'gerichtsakt', 'funston', 'bamberg', 'czapski', 'staatsuniversitat', 'vallast', 'gifsuryvett', 'tuning', 'knoll', 'nauroth', 'dietenhof', 'armen', 'kunstgewerbemuseum', 'militarwaff', 'lgotz', 'kinderdentist', 'zeigeg', 'korrekt', 'dervi', 'erolu', 'monogam', 'vogl', 'handelsgeschaft', 'bohlau', 'quellenfrei', 'gnesiolutheran', 'philippist', 'zugerechne

In [48]:
print(len(train_word_stem_freq))

rare_stem_words = []

train_word_stem_freq_keys = train_word_stem_freq.keys()

for word in train_word_stem_freq:
    if train_word_stem_freq[word] < 2:
        rare_stem_words.append(word)
        
print(len(rare_stem_words))

123944
66504


In [51]:
train_word_stem_lemma_freq = Counter(train_words_stemmed_lemma_List)

print(len(train_word_stem_lemma_freq))

rare_stem_lemma_words = []

train_word_stem_freq_keys = train_word_stem_lemma_freq.keys()

for word in train_word_stem_lemma_freq:
    if train_word_stem_lemma_freq[word] < 2:
        rare_stem_lemma_words.append(word)
        
print(len(rare_stem_lemma_words))

120003
65216


# Analyse Val Data

In [149]:
val_words = [token.text
         for token in val_doc
         if not token.is_stop and not token.is_punct]

In [150]:
print("Val words: ", len(val_words))
val_words_freq = Counter(val_words)
print("Val words dict: ", len(val_words_freq))

Val words:  68675
Val words dict:  29735


In [154]:
val_train_set = []
for word in train_rare_words:
    if val_words_freq[word] >= 1:
        val_train_set.append(word)
        
print(len(val_train_set))

3232


# Analyse Test Data

In [156]:
test_words = [token.text
         for token in test_doc
         if not token.is_stop and not token.is_punct]

In [157]:
print("Test words: ", len(test_words))
test_words_freq = Counter(test_words)
print("Test words dict: ", len(test_words_freq))

Test words:  67545
Test words dict:  29553


In [158]:
test_train_set = []
for word in train_rare_words:
    if test_words_freq[word] >= 1:
        test_train_set.append(word)
        
print(len(test_train_set))

3235


# Check WER

In [None]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-large-xlsr-ger-chris/checkpoint-51000")