In [None]:
!pip install transformers
!pip install datasets
!pip install pydub
!pip install soundfile
!pip install torchaudio
!pip install librosa
!pip install jiwer

In [None]:
import torch

# Cell output: True when PyTorch can see a CUDA device, False otherwise.
torch.cuda.is_available()

In [None]:
from datasets import load_dataset, load_metric
from pathlib import Path

# Load the German Common Voice train/validation/test splits, in that order,
# caching downloads under ./data.
common_voice_train, common_voice_validation, common_voice_test = (
    load_dataset("common_voice", "de", split=split_name, cache_dir=Path("./data"))
    for split_name in ("train", "validation", "test")
)

In [None]:
# One (rows, columns) shape per line: train, validation, test.
print(
    common_voice_train.shape,
    common_voice_validation.shape,
    common_voice_test.shape,
    sep="\n",
)

In [None]:
# Drop the speaker/vote metadata columns from all three splits; the later cells
# shown in this notebook only read "path" and "sentence".
common_voice_train, common_voice_validation, common_voice_test = (
    split.remove_columns(
        ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
    )
    for split in (common_voice_train, common_voice_validation, common_voice_test)
)

In [None]:
import re
# Despite its name this pattern matches what gets REMOVED: every run of characters
# that is not a basic Latin letter, a lower-case German umlaut, "ß" or a space.
chars_to_keep = '[^A-Za-zäüöß ]+'

def remove_special_characters_chris(batch):
    """Normalise the transcript stored in ``batch["sentence"]``.

    The sentence is lower-cased, every character matched by ``chars_to_keep``
    is stripped, and a single trailing space is appended.

    Args:
        batch: mapping with a string under the key ``"sentence"``.

    Returns:
        The same mapping, with ``"sentence"`` replaced by the normalised text.
    """
    # Lower-case BEFORE filtering: the character class only lists lower-case
    # umlauts, so filtering first would silently delete "Ä", "Ö" and "Ü".
    batch["sentence"] = re.sub(chars_to_keep, '', batch["sentence"].lower()) + " "
    return batch

In [None]:
# Normalise the transcripts of the training split only; the equivalent calls for
# the validation and test splits below are currently disabled.
common_voice_train = common_voice_train.map(remove_special_characters_chris)
#common_voice_validation = common_voice_validation.map(remove_special_characters_chris)
#common_voice_test = common_voice_test.map(remove_special_characters_chris)

In [None]:
import torchaudio
import numpy as np
import librosa

def speech_file_to_array_subsample_fn(batch):
    """Load the audio file at ``batch["path"]`` and resample it to 16 kHz.

    Args:
        batch: mapping with the keys ``"path"`` (audio file to load) and
            ``"sentence"`` (its transcript).

    Returns:
        The same mapping with three keys added: ``"speech"`` (the resampled
        first channel as a NumPy array), ``"sampling_rate"`` (always 16000)
        and ``"target_text"`` (a copy of ``"sentence"``).
    """
    source_path = batch["path"]
    speech_array, sampling_rate = torchaudio.load(source_path)
    # Only the first channel of the loaded tensor is kept.
    first_channel = speech_array[0].numpy()
    # Pass the rates by keyword: librosa >= 0.10 made orig_sr/target_sr
    # keyword-only, so the positional form raises a TypeError there.
    batch["speech"] = librosa.resample(
        np.asarray(first_channel), orig_sr=sampling_rate, target_sr=16_000
    )
    batch["sampling_rate"] = 16_000
    batch["target_text"] = batch["sentence"]
    return batch

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import time

def preprocess(batch):
    """Resample every clip of ``batch`` in slices of 2500 rows.

    Each slice is turned into its own ``Dataset`` and mapped through
    ``speech_file_to_array_subsample_fn`` with 8 worker processes; the
    original columns are dropped so only the columns produced by that
    function remain. Progress is printed before each slice after the first.

    Args:
        batch: object that supports ``batch["path"]`` and slice indexing
            returning a column dict accepted by ``Dataset.from_dict``.

    Returns:
        A single ``Dataset`` holding the mapped slices in their original order.
    """
    print("Start: " + str(time.asctime()))

    batch_size = 2500
    batch_length = len(batch["path"])
    print("total length: " + str(batch_length))

    def _resample_slice(start):
        # Map one slice of at most `batch_size` rows beginning at `start`.
        piece = Dataset.from_dict(batch[start:start + batch_size])
        return piece.map(
            speech_file_to_array_subsample_fn,
            remove_columns=piece.column_names,
            num_proc=8,
        )

    pieces = [_resample_slice(0)]

    for i in range(batch_size, batch_length, batch_size):
        # Clamp the logged upper bound so it never exceeds the real row count.
        print("From: " + str(i) + " to: " + str(min(i + batch_size, batch_length)))
        print("Time: " + str(time.asctime()))
        pieces.append(_resample_slice(i))

    # Concatenate once at the end instead of re-wrapping the running result
    # on every iteration.
    if len(pieces) == 1:
        return pieces[0]
    return concatenate_datasets(pieces)

In [None]:
import datasets

def preprocess_chunks(dataset, batch_size=10000, output_dir="cv_sampled"):
    """Preprocess ``dataset`` chunk by chunk and save each chunk to disk.

    Every chunk of up to ``batch_size`` rows is passed through ``preprocess``
    and written to ``<output_dir>/data_<start>_<start + batch_size>``. The
    upper bound in the directory name is nominal: for the last chunk it can
    be larger than the number of rows in ``dataset``.

    Args:
        dataset: a ``Dataset`` supporting ``len()`` and slice indexing.
        batch_size: number of rows per saved chunk (default 10000).
        output_dir: directory the chunk folders are written to
            (default ``"cv_sampled"``).
    """
    # len(dataset) gives the row count without reading a whole column.
    dataset_length = len(dataset)

    for i in range(0, dataset_length, batch_size):
        chunk = Dataset.from_dict(dataset[i:i+batch_size])
        chunk = preprocess(chunk)
        chunk.save_to_disk(output_dir + "/data_" + str(i) + "_" + str(i+batch_size))

In [None]:
# Resample the whole training split and write it to disk chunk by chunk.
preprocess_chunks(common_voice_train)

In [None]:
import glob

# Merge every chunk directory under cv_sampled/ into a single dataset.
# The directory listing is the only source of truth: no chunk name is
# hard-coded (names depend on the chunk size used when saving) and no path
# strings are compared, so each chunk is loaded exactly once whatever path
# separator glob returns. Sorting makes the order reproducible.
chunk_dirs = sorted(glob.glob("cv_sampled/*"))

for file_name in chunk_dirs:
    print(file_name)

cv_sampled_test = concatenate_datasets(
    [load_from_disk(chunk_dir) for chunk_dir in chunk_dirs]
)

In [None]:
import glob
# List the entries of the training-batch directory (machine-specific absolute path).
for file_name in glob.iglob("G:/01-DATA/train_batch/*"):
    print(file_name)

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print("Start: " + str(time.asctime()))

# Load every batch directory exactly once. glob does not guarantee any order,
# so "skip the first hit" could drop a batch other than train_sampled_pro_1
# and load that one twice; a sorted listing avoids the special case.
batch_dirs = sorted(glob.glob("G:/01-DATA/train_batch/*"))

batches = []
for file_name in batch_dirs:
    print("Time: " + str(time.asctime()))
    print(file_name)
    batches.append(load_from_disk(file_name))

# Single concatenation instead of re-wrapping the running result per batch.
cv_sampled_train = concatenate_datasets(batches)

print("Saving to disk!")
print("Time: " + str(time.asctime()))
# Forward slashes on purpose: in a normal string literal "\01" is the control
# character U+0001 and "\t" is a tab, which corrupted the target path.
cv_sampled_train.save_to_disk("G:/01-DATA/train_sampled_test_batch")

In [None]:
# Sanity check: (rows, columns) of the merged training data.
print(cv_sampled_train.shape)

In [None]:
# Persist the merged dataset in the working directory.
cv_sampled_test.save_to_disk("cv_sampled_test")

In [None]:
import random
# Pick a random row; rand_int stays a global because the playback cell reuses it.
rand_int = random.randint(0, len(cv_sampled_test)-1)

# Fetch the row once instead of three times - every access materialises the
# whole record, including the speech array.
sample_row = cv_sampled_test[rand_int]

print("Target text:", sample_row["target_text"])
print("Input array shape:", np.asarray(sample_row["speech"]).shape)
print("Sampling rate:", sample_row["sampling_rate"])

In [None]:
import IPython.display as ipd
import numpy as np

# Render an audio player for the row selected by rand_int, at 16 kHz with autoplay enabled.
ipd.Audio(data=np.asarray(cv_sampled_test[rand_int]["speech"]), autoplay=True, rate=16000)

In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# CTC tokenizer backed by the vocabulary in ./vocab.json.
# NOTE(review): this notebook never writes vocab.json - it must already exist.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for single-feature 16 kHz input with normalisation enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles both so one object handles audio and text.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
def prepare_dataset(batch):
    """Turn a batch of raw samples into model inputs and label ids.

    Expects the batched columns ``"speech"``, ``"sampling_rate"`` and
    ``"target_text"``. Adds ``"input_values"`` (the processor's output for
    the speech) and ``"labels"`` (token ids of the target text) and returns
    the batch.

    Raises:
        AssertionError: if the batch mixes different sampling rates.
    """
    rates = batch["sampling_rate"]
    # Every clip in the batch has to share one sampling rate.
    assert (
        len(set(rates)) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    audio_features = processor(batch["speech"], sampling_rate=rates[0])
    batch["input_values"] = audio_features.input_values

    # Inside this context the processor is applied to the transcripts.
    with processor.as_target_processor():
        encoded_text = processor(batch["target_text"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk

# Raw string: "\M" and "\w" are invalid escape sequences in a normal literal
# (a warning today, an error in future Python versions). The path value is unchanged.
train_sa_pro_2 = load_from_disk(r"D:\Master\wsl/5_train_sa")

print(train_sa_pro_2.shape)

In [None]:
import time

print(f"Start: {time.asctime()}")

# Replace the raw columns with model inputs/labels, 64 rows per call.
# Overwrites train_sa_pro_2, so re-running this cell needs the load cell first.
train_sa_pro_2 = train_sa_pro_2.map(
    prepare_dataset,
    batched=True,
    batch_size=64,
    remove_columns=train_sa_pro_2.column_names,
)

print(f"Save: {time.asctime()}")

train_sa_pro_2.save_to_disk("D:/Master/Data/train_sampled_pro_5")


In [None]:
# Sanity check: (rows, columns) after feature extraction.
print(train_sa_pro_2.shape)

In [None]:
# Convert the resampled test data into model inputs/labels: 8 rows per call, 4 worker processes.
test_sampled_pro = cv_sampled_test.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    num_proc=4,
    remove_columns=cv_sampled_test.column_names,
)

In [None]:
#test_sampled_pro.save_to_disk("test_sampled_pro")

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
# Reload the processed test and validation sets from the working directory.
test_sampled_pro, val_sampled_pro = (
    load_from_disk(saved_dir)
    for saved_dir in ("test_sampled_pro", "val_sampled_pro")
)

In [None]:
# Stack the processed test rows followed by the validation rows.
big_set = concatenate_datasets([test_sampled_pro, val_sampled_pro])

In [None]:
# Appends val_sampled_pro to big_set again, so its rows appear once more per run of this cell.
# NOTE(review): presumably intentional to inflate the set for a size/save test - confirm.
big_set = concatenate_datasets([big_set, val_sampled_pro])

In [None]:
# Cell output: (rows, columns) of the combined set.
big_set.shape

In [None]:
# Persist the combined set in the working directory.
big_set.save_to_disk("big_set")

In [None]:
import datasets
import sys
import platform

# Report the datasets version, the Python version and the platform string.
print(f"""
- Datasets: {datasets.__version__}
- Python: {sys.version}
- Platform: {platform.platform()}
""")

In [None]:
import glob
import time
# List the entries of the training-batch directory (machine-specific absolute path).
for file_name in glob.iglob("D:/Master/Data/train_batch/*"):
    print(file_name)

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print("Start: " + str(time.asctime()))

# Load every batch directory exactly once. glob does not guarantee any order,
# so "skip the first hit" could drop a batch other than train_sampled_pro_1
# and load that one twice; a sorted listing avoids the special case.
batch_dirs = sorted(glob.glob("D:/Master/Data/train_batch/*"))

batches = []
for file_name in batch_dirs:
    print("Time: " + str(time.asctime()))
    print("concatenate next file")
    print(file_name)
    batches.append(load_from_disk(file_name))

# Single concatenation instead of re-wrapping the running result per batch.
cv_sampled_train = concatenate_datasets(batches)

print("Saving to disk!")
print("Time: " + str(time.asctime()))
cv_sampled_train.save_to_disk("D:/Master/Data/train_sampled_pro")

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print(f"Start: {time.asctime()}")

# Parts 1 and 2 are joined first ...
cv_sampled_train = load_from_disk("D:/Master/Data/train_batch/train_sampled_pro_1")
cv_batch = load_from_disk("D:/Master/Data/train_batch/train_sampled_pro_2")
cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])

print("concatenate next file", f"Time: {time.asctime()}", sep="\n")

# ... then part 3 is appended to the running result.
cv_batch = load_from_disk("D:/Master/Data/train_batch/train_sampled_pro_3")
cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])

print("Saving to disk!", f"Time: {time.asctime()}", sep="\n")

cv_sampled_train.save_to_disk("D:/Master/Data/train_sampled_pro_big_1")

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print(f"Start: {time.asctime()}")

# Join parts 4 and 5 into the second intermediate set.
cv_sampled_train = load_from_disk("D:/Master/Data/train_batch/train_sampled_pro_4")
cv_batch = load_from_disk("D:/Master/Data/train_batch/train_sampled_pro_5")
cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])

print("Saving to disk!", f"Time: {time.asctime()}", sep="\n")

cv_sampled_train.save_to_disk("D:/Master/Data/train_sampled_pro_big_2")

In [None]:
# Cell output: (rows, columns) of the set merged in the previous cell.
cv_sampled_train.shape

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print(f"Start: {time.asctime()}")

# Join the two intermediate sets into the final processed training set.
train_sampled_pro = load_from_disk("D:/Master/Data/train_sampled_pro_big_1")
cv_batch = load_from_disk("D:/Master/Data/train_sampled_pro_big_2")
train_sampled_pro = concatenate_datasets([train_sampled_pro, cv_batch])

print("Saving to disk!", f"Time: {time.asctime()}", sep="\n")

train_sampled_pro.save_to_disk("D:/Master/Data/train_sampled_pro")

In [None]:
# Cell output: (rows, columns) of the final processed training set.
train_sampled_pro.shape

In [None]:
from datasets import Dataset, concatenate_datasets, load_from_disk
import glob
import time

print("Start: " + str(time.asctime()))

# Raw strings: "\M" and "\D" are invalid escape sequences in a normal literal
# (a warning today, an error in future Python versions). The path values are unchanged.
cv_sampled_train = load_from_disk(r"D:\Master\Data/test_sampled_pro")

cv_batch = load_from_disk(r"D:\Master\Data/val_sampled_pro")
cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])

print("concatenate next file")
print("Time: " + str(time.asctime()))

# The test set is appended a second time here.
# NOTE(review): presumably deliberate, to build a large set for timing the save - confirm.
cv_batch = load_from_disk(r"D:\Master\Data/test_sampled_pro")
cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])

print("Saving to disk!")
print("Time: " + str(time.asctime()))

cv_sampled_train.save_to_disk("D:/Master/Data/test_save_BIG_fast")