In [82]:
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import librosa
from tqdm.notebook import tqdm
import glob
import os
import re
import json

# Preprocessing

In [34]:
# glob returns matches in arbitrary filesystem order; sort so the folder list
# (and everything built from it in later cells) is the same on every run.
all_folders = sorted(glob.glob('data/*'))

In [48]:
# Collect one {'wav_file', 'sentence'} record per audio/transcript pair found
# under <folder>/wavs/*.wav and <folder>/labels/*.txt.
data = []
for folder in all_folders:
    # glob gives no ordering guarantee, so sort both listings before zipping;
    # otherwise a wav can end up paired with another utterance's transcript.
    # NOTE(review): assumes wav and label files share a naming scheme so that
    # sorted order lines them up one-to-one — confirm against the data layout.
    wav_files = sorted(glob.glob(folder + '/wavs/*.wav'))
    label_files = sorted(glob.glob(folder + '/labels/*.txt'))

    if len(wav_files) != len(label_files):
        # zip() silently truncates to the shorter list; surface the mismatch.
        print(f'Warning: {folder} has {len(wav_files)} wav files but {len(label_files)} label files')

    for wav, label in zip(wav_files, label_files):
        # Context manager closes the file handle instead of leaking it.
        with open(label, 'r') as label_file:
            text = label_file.read()
        # Transcripts that are exactly empty are skipped.
        if text != '':
            data.append({
                'wav_file': wav,
                'sentence': text
            })

In [56]:
# Load the collected records into a DataFrame and normalise the transcripts.
df = pd.DataFrame(data)
# Raw string: sequences such as "\," or "\?" are not valid Python string escapes
# and trigger an invalid-escape warning in a normal literal. The regex that
# `re` sees is equivalent (a character class of the same punctuation marks).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
# Remove the ignored characters, lowercase, and append a single trailing space.
df['sentence'] = df['sentence'].apply(lambda x: re.sub(chars_to_ignore_regex, '', x).lower() + " ")

In [57]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the same
# input frame always yields the same partition.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=2021,
)

In [71]:
# Concatenate every transcript so the full character inventory can be collected.
train_text = ' '.join(train_df['sentence'])
eval_text = ' '.join(eval_df['sentence'])

# sorted() instead of list(): iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which would hand different
# ids to the same characters every time the kernel is restarted.
vocab_list = sorted(set(train_text) | set(eval_text))
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}
# Re-key the space character as "|", keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

In [73]:
# Append the special tokens after the character ids.
# setdefault keeps the cell idempotent: with plain assignment, re-running it
# would set both tokens to the same id (len(vocab_dict) no longer grows once
# the keys exist), producing duplicate ids in the vocabulary.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(len(vocab_dict))

41


In [75]:
# Persist the vocabulary as JSON. Create the target directory first so a fresh
# checkout (no vocab/ folder yet) does not fail with FileNotFoundError.
os.makedirs('vocab', exist_ok=True)
with open('vocab/vocab.json', 'w') as json_file:
    json.dump(vocab_dict, json_file)

In [79]:
# Build the CTC tokenizer from the vocabulary file, naming the unknown,
# padding and word-delimiter tokens it should use.
tokenizer = Wav2Vec2CTCTokenizer(
    'vocab/vocab.json',
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature Extraction

In [80]:
# Feature extractor settings: single-channel features at a 16000 Hz sampling
# rate, zero padding, normalisation on, attention mask returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [81]:
# Bundle the feature extractor and the tokenizer into one processor object.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

In [None]:
# Resample from 41kHz to 16kHz.
# For that we need to create a custom Dataset class from torch,
# as Malay is not available in the `datasets` library.
class AudoDataSet(Dataset):
    def __init__(self, df):
        wav_file = df['wav_file']
        labels = df['labels']