In [None]:
from datasets import load_from_disk, load_dataset

common_voice_train = load_dataset("common_voice", "de", split="train", cache_dir="D:\Master\wsl\data")
common_voice_validation = load_dataset("common_voice", "de", split="validation", cache_dir="D:\Master\wsl\data")
common_voice_test = load_dataset("common_voice", "de", split="test", cache_dir="D:\Master\wsl\data")

In [None]:
common_voice_train = common_voice_train.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_validation = common_voice_validation.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["path","accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

# Preprocess Text 

In [None]:
import re
chars_to_keep = '[^A-Za-zäüöß ]+'

def remove_special_characters_chris(batch):
    batch["sentence"] = re.sub(chars_to_keep, '', batch["sentence"]).lower() + " "
    return batch

In [None]:
common_voice_train = common_voice_train.map(remove_special_characters_chris)
common_voice_validation = common_voice_validation.map(remove_special_characters_chris)
common_voice_test = common_voice_test.map(remove_special_characters_chris)

In [None]:
common_voice_validation.save_to_disk("E:/Master/data/val_text")
common_voice_train.save_to_disk("E:/Master/data/train_text")
common_voice_test.save_to_disk("E:/Master/data/test_text")

# Load Text Dataset

In [14]:
from datasets import load_from_disk, load_dataset

val_text_dataset = load_from_disk("E:/Master/data/val_text")

In [5]:
import spacy
nlp = spacy.load("de_core_news_sm")

In [6]:
val_complete_text = ""

In [18]:
val_text_dataset.shape[0]

15588

In [26]:
from tqdm import tqdm

for i in tqdm(range(val_text_dataset.shape[0])):
    val_complete_text = val_complete_text + val_text_dataset[i]['sentence']

100%|█████████████████████████████████████████████████████████████████████████| 15588/15588 [00:00<00:00, 35586.68it/s]


In [27]:
len(val_complete_text)

997429

In [28]:
val_doc = nlp(val_complete_text)

In [30]:
# all tokens that arent stop words or punctuations
words = [token.text
         for token in val_doc
         if not token.is_stop and not token.is_punct]

In [31]:
# noun tokens that arent stop words or punctuations
nouns = [token.text
         for token in val_doc
         if (not token.is_stop and
             not token.is_punct and
             token.pos_ == "NOUN")]

In [57]:
from collections import Counter

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(300)

In [58]:
# five most common noun tokens
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common(300)

In [40]:
print(len(nouns))

8332


In [59]:
print(common_nouns)

[('stadt', 72), ('hauptstadt', 71), ('sohn', 66), ('namen', 56), ('menschen', 55), ('frau', 48), ('bericht', 43), ('arbeit', 41), ('werk', 38), ('universität', 38), ('form', 34), ('region', 31), ('grund', 30), ('kirche', 30), ('weltkrieg', 29), ('tod', 28), ('land', 28), ('film', 27), ('name', 27), ('gruppe', 26), ('problem', 26), ('thema', 25), ('burg', 23), ('seiten', 23), ('straße', 23), ('fall', 23), ('staaten', 22), ('situation', 21), ('entwicklung', 21), ('titel', 21), ('bevölkerung', 21), ('buch', 21), ('ziel', 21), ('einsatz', 19), ('hilfe', 19), ('sitz', 19), ('musik', 19), ('bedeutung', 19), ('werke', 18), ('programm', 18), ('gegensatz', 18), ('saison', 18), ('wort', 18), ('mannschaft', 18), ('ort', 17), ('zukunft', 16), ('schule', 16), ('folge', 16), ('firma', 16), ('position', 16), ('erfolg', 16), ('art', 16), ('verein', 16), ('bau', 15), ('bezeichnung', 15), ('herrschaft', 15), ('industrie', 15), ('stil', 15), ('mutter', 15), ('welt', 15), ('familie', 15), ('partei', 15), 

In [60]:
print(common_words)

[('liegt', 201), ('mal', 131), ('stadt', 130), ('befindet', 101), ('steht', 86), ('gemeinde', 83), ('gehört', 82), ('deutschen', 80), ('hauptstadt', 78), ('ort', 78), ('herr', 77), ('frau', 77), ('gilt', 77), ('sohn', 77), ('zahlreiche', 72), ('kinder', 71), ('ebenfalls', 71), ('mehrere', 71), ('lassen', 68), ('namen', 67), ('leben', 66), ('meist', 66), ('zudem', 65), ('studierte', 65), ('menschen', 64), ('findet', 62), ('anschließend', 61), ('platz', 61), ('tätig', 57), ('deutschland', 56), ('besteht', 55), ('verschiedenen', 54), ('sehen', 54), ('berlin', 54), ('benannt', 53), ('weltkrieg', 53), ('geschichte', 53), ('lässt', 53), ('arbeitete', 52), ('bezeichnet', 52), ('kirche', 52), ('verwendet', 52), ('kommission', 51), ('bedeutung', 51), ('europa', 51), ('finden', 50), ('lebt', 49), ('begann', 49), ('unternehmen', 49), ('vater', 47), ('gebäude', 47), ('frage', 47), ('meisten', 46), ('bitte', 46), ('sitz', 46), ('insbesondere', 46), ('sowohl', 46), ('bericht', 45), ('führte', 45), (