In [1]:
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# From https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tuning_Wav2Vec2_for_English_ASR.ipynb#scrollTo=72737oog2F6U
# Uses the GEO dataset
def extract_all_chars(batch):
    all_text = " ".join(batch["transcript"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


In [None]:
# Create vocabulary based on train and val set transcriptions
def create_vocabulary(dataset):
    vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])
    vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["val"]["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["<unk>"] = len(vocab_dict)
    vocab_dict["<pad>"] = len(vocab_dict)
    with open('vocab.json', 'w', encoding="utf-8") as vocab_file:
        json.dump(vocab_dict, vocab_file, ensure_ascii=False)
    return vocab_dict

In [10]:
# Paths to data folder, CSV file names
DATA_PATH = "data/"
# TRAIN_CSV = DATA_PATH + "train.csv"
TRAIN_CSV = DATA_PATH + "train_aug.csv"

DEV_CSV = DATA_PATH + "dev.csv"
TEST_CSV = DATA_PATH + "test_release.csv"


In [5]:
# Create dataset
dataset = create_data_set(DATA_PATH, TRAIN_CSV, DEV_CSV, TEST_CSV)
dataset = dataset.map(remove_special_characters)
print("Dataset:")
print(dataset)
print()
print("Example files and transcripts")
show_random_elements(dataset["train"].remove_columns(["audio"]))
print()


Map: 100%|██████████| 12000/12000 [00:02<00:00, 4884.69 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4029.20 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 8112.84 examples/s]


Dataset:
DatasetDict({
    train: Dataset({
        features: ['file', 'transcript', 'audio'],
        num_rows: 12000
    })
    val: Dataset({
        features: ['file', 'transcript', 'audio'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['file', 'transcript', 'audio'],
        num_rows: 1000
    })
})

Example files and transcripts


Unnamed: 0,file,transcript
0,augmented_audio/train_4549_aug.wav,la refiro doubs vormas in nordogcedinto di la ...
1,geo/train_1390.wav,la rikeonon admenestras akintijo di protigto d...
2,geo/train_5958.wav,teuj vortoj pofas surlogi life la survacon di ...
3,geo/train_3524.wav,gei sen trofas teu lando me demandes al mi mim
4,geo/train_3024.wav,la ankoroj ĉivi relates la gumuleĝon di la hes...
5,augmented_audio/train_502_aug.wav,ĉeu plumaroj istas semelaj
6,geo/train_1773.wav,la subaj partoj istas krezicaj sid la kapo kun...
7,geo/train_4502.wav,ĉekomponaĵoj ple malvrui apires in sireo da go...
8,geo/train_145.wav,teal la nomo honora unefirsetato
9,augmented_audio/train_4022_aug.wav,ĝe istas la akregultura gaj gomirca cintro di ...





In [12]:
# Create vocabulary
vocab_dict = create_vocabulary(dataset)
print(vocab_dict)

Map: 100%|██████████| 12000/12000 [00:00<00:00, 125753.04 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 56436.50 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 69642.75 examples/s]


{'ĝ': 0, 'a': 1, 'z': 2, 'ĉ': 3, 'm': 4, 'u': 5, 'ĵ': 6, 'p': 7, 's': 8, 'ĥ': 9, 't': 10, 'c': 11, 'f': 13, 'd': 14, 'l': 15, 'j': 16, 'b': 17, 'e': 18, 'h': 19, 'n': 20, 'ŭ': 21, 'r': 22, 'i': 23, 'v': 24, 'o': 25, 'ŝ': 26, 'g': 27, 'k': 28, '|': 12, '<unk>': 29, '<pad>': 30}
