In [2]:
import torch
import torch.utils.data
import os
import numpy as np
import json
import pandas as pd
import librosa 
import pickle


In [3]:
class PhonemeTokenizer:
    """Convert between whitespace-separated phoneme strings and integer ids.

    The forward table is supplied by the caller; the reverse table is derived
    from it once at construction time.
    """

    def __init__(self, phoneme_to_phoneme_index):
        """Store the phoneme -> id mapping and build the id -> phoneme mapping.

        Args:
            phoneme_to_phoneme_index: dict mapping each phoneme symbol to its id.
                If several symbols share an id, the reverse table keeps the one
                that comes last in the dict's iteration order.
        """
        self.phoneme_to_phoneme_index = phoneme_to_phoneme_index
        self.phoneme_index_to_phoneme = dict(
            (index, phoneme)
            for phoneme, index in self.phoneme_to_phoneme_index.items()
        )

    def EncodeAsIds(self, phoneme_string):
        """Return the list of ids for the whitespace-separated phonemes.

        Raises:
            KeyError: if a phoneme is not present in the mapping.
        """
        lookup = self.phoneme_to_phoneme_index
        encoded = []
        for phoneme in phoneme_string.split():
            encoded.append(lookup[phoneme])
        return encoded

    def DecodeIds(self, phoneme_ids):
        """Return the phonemes for ``phoneme_ids`` joined by single spaces.

        Raises:
            KeyError: if an id is not present in the reverse mapping.
        """
        lookup = self.phoneme_index_to_phoneme
        return " ".join(lookup[phoneme_id] for phoneme_id in phoneme_ids)
    
# label_set = [1: 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', \
#              'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', \
#              'EY0', 'EY1', 'EY2', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'OW0', 'OW1', 'OW2', \
#              'OY0', 'OY1', 'OY2', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', \
#              'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'P', 'R', \
#              'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH', 'sil', 'sp', '']

# Label inventory: 39 phoneme symbols followed by 'SIL', 'SPN' and the empty
# string. A symbol's position in this list is used as its integer label id.
label_set = (
    "AA AE AH AO AW AY B CH D DH EH ER EY F G HH "
    "IH IY JH K L M N NG OW OY P R S SH T TH UH "
    "UW V W Y Z ZH SIL SPN"
).split() + ['']


In [4]:
# Load the training-set audio and extract MFCC features for every .flac file.

# NOTE(review): absolute, machine-specific paths; consider moving them to a
# single configurable data directory.
audio_path = "/Users/chantal/Desktop/StMichaels/unpacked/"
data = os.listdir(audio_path)

label_path = "/Users/chantal/Desktop/StMichaels/mini-librispeech-csv/train_data.csv"

# Analysis window and hop, in seconds. librosa expects n_fft / hop_length in
# SAMPLES, so these are converted per file from its sample rate. Passing the
# raw values 25 and 10 gave a 25-sample FFT, which is what triggered the
# "Empty filters detected in mel frequency basis" warning.
frame_length_s = 0.025
frame_shift_s = 0.010

# Maps file name without the ".flac" suffix -> MFCC array returned by librosa.
all_data = {}

for item in data:
    if item.endswith(".flac"):
        # librosa.load resamples to its default rate unless sr= is given;
        # the returned sr is the rate of the samples in y.
        y, sr = librosa.load(os.path.join(audio_path, item))
        # Keyword arguments: newer librosa releases reject positional y / sr.
        feat = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=30,
            n_fft=int(round(frame_length_s * sr)),
            hop_length=int(round(frame_shift_s * sr)),
        )
        all_data[item[:-len(".flac")]] = feat


  "Empty filters detected in mel frequency basis. "


In [5]:
# Load the phoneme transcriptions and pair them with the extracted features.
label_df = pd.read_csv(label_path)
phoneme_df = label_df['phonemes_39'].tolist()

# zip() stops at the shorter input, so a count mismatch would silently drop
# entries and leave the remaining pairs unverifiable. Fail loudly instead.
if len(phoneme_df) != len(all_data):
    raise ValueError(
        "Found %d feature entries but %d label rows; cannot pair them by position."
        % (len(all_data), len(phoneme_df))
    )

# Build the phoneme -> integer id table once instead of scanning label_set
# for every phoneme. setdefault keeps the first occurrence, like list.index.
label_to_index = {}
for index, phoneme in enumerate(label_set):
    label_to_index.setdefault(phoneme, index)

l = []  # integer label sequence per utterance
n = []  # utterance name (key of all_data)
v = []  # feature array per utterance

# NOTE(review): features and labels are paired purely by position. all_data
# follows os.listdir() order, which is not guaranteed to match the CSV row
# order -- confirm, or join on a file-name column if the CSV has one.
for (key, value), label in zip(all_data.items(), phoneme_df):
    try:
        int_label = [label_to_index[x.upper()] for x in label.split()]
    except KeyError as err:
        # Keep raising ValueError for unknown phonemes, as list.index did.
        raise ValueError("Unknown phoneme %s in labels paired with %r" % (err, key)) from None
    l.append(int_label)
    n.append(key)
    v.append(value)

# Collect everything in one DataFrame: name - audio features - labels.
print(len(n), len(v), len(l))
df = pd.DataFrame({'name': n, 'audio': v, 'label': l})


1519 1519 1519


In [8]:
# Save the rows from position 1000 onward as a gzip-compressed pickle.
df.iloc[1000:].to_pickle("all_train_audio_2", compression='gzip', protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Show element 0 of the 'audio' entry stored under row label 0.
df.loc[0, 'audio'][0]