In [17]:
import pandas as pd       
import os 
import math 
import numpy as np
import matplotlib.pyplot as plt
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

In [67]:
class MFCCDataset(Dataset):
    """Dataset of standardized MFCC features (plus delta and delta-delta) per audio file.

    Every file in ``root_dir`` is one sample. When ``train`` is truthy the
    dataset reports twice as many items: indices ``0 .. n-1`` are the clean
    clips and indices ``n .. 2n-1`` are the same clips with unit-variance
    Gaussian noise added to the raw MFCCs (fresh noise on every access).

    Labels are read from the filename prefix written by
    ``generate_model_data`` (``"<label>_word<k>.wav"``); names that do not
    start with an integer prefix fall back to label 0.

    Args:
        root_dir: Folder containing the audio clips (``str`` or ``Path``).
        train: Enables the noise-augmented second half of the index range.
    """

    def __init__(self, root_dir, train):
        self.root_dir = root_dir
        self.train = train

    def _list_files(self):
        """Return the folder's entries sorted, so index -> file is deterministic."""
        # os.listdir() gives no ordering guarantee; sorting keeps idx stable.
        return sorted(os.listdir(self.root_dir))

    @staticmethod
    def _label_from_filename(filename):
        """Parse the integer label from a ``"<label>_..."`` filename (0 if absent)."""
        try:
            return int(filename.split("_", 1)[0])
        except ValueError:
            return 0

    @staticmethod
    def _standardize_rows(m):
        """Zero-mean / unit-variance each row; constant rows become all zeros."""
        m = np.asarray(m, dtype=np.float64)
        centered = m - m.mean(axis=1, keepdims=True)
        std = centered.std(axis=1, keepdims=True)
        # A constant row has std == 0; divide by 1 instead so it stays at zero
        # rather than producing NaNs or reusing another row's values.
        return centered / np.where(std == 0, 1.0, std)

    def __len__(self):
        num_files = len(self._list_files())
        return num_files * 2 if self.train else num_files

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        ``features`` is a float64 tensor stacking the standardized MFCCs with
        their first- and second-order deltas along a new leading axis.

        Raises:
            IndexError: If ``idx`` is outside the range reported by ``len()``.
        """
        files = self._list_files()
        num_files = len(files)
        total = num_files * 2 if self.train else num_files

        if idx >= total:
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # The second half of the index range maps back onto the same files,
        # but with noise augmentation switched on.
        noise = idx >= num_files
        data_id = idx - num_files if noise else idx

        f = files[data_id]
        label = torch.tensor(self._label_from_filename(f))

        y, sr = librosa.load(os.path.join(self.root_dir, f))
        m = librosa.feature.mfcc(y=y, sr=sr)

        if noise:
            m = m + np.random.normal(0, 1, m.shape)

        m_standardized = self._standardize_rows(m)

        delta1 = librosa.feature.delta(m_standardized, order=1)
        delta2 = librosa.feature.delta(m_standardized, order=2)
        mfcc_data = np.stack((m_standardized, delta1, delta2))

        return torch.tensor(mfcc_data), label

In [None]:
# Sanity check: show the feature-tensor shape of the first 96 training samples.
# Needs `train_data`, which is created in a later cell of this notebook.
for sample_idx in range(96):
    features = train_data[sample_idx][0]
    print(f"{sample_idx} {features.shape}")

In [23]:
# Single root for the raw recordings and the generated train/val/test folders.
DATA_DIR = Path('C:/Users/omar_/Documents/cockatoos/data')

original_dir = DATA_DIR / 'accent_samples' / 'recordings' / 'recordings'
train_dir = DATA_DIR / 'train'
val_dir = DATA_DIR / 'val'
test_dir = DATA_DIR / 'test'

# os.listdir() returns entries in arbitrary order; sort so the positional
# train/val/test slices below pick the same files on every run and machine.
files = sorted(os.listdir(original_dir))

# A recording counts as "other accent" when its filename contains any of these.
other_accent_types = ["mandarin", "japanese", "korean", "taiwanese", "cantonese", "thai", "indonesian"]

# Per-class split sizes (each split takes this many files from BOTH classes).
num_train_files = 150
num_val_files = 23
num_test_files = 25

# Cumulative slice boundaries into each per-class file list.
end_idx_train = num_train_files
end_idx_val = end_idx_train + num_val_files
end_idx_test = end_idx_val + num_test_files

english_accent_files = [f for f in files if "english" in f]
other_accent_files = [f for f in files if any(t in f for t in other_accent_types)]

print(f"Number of english accent files: {len(english_accent_files)}")
print(f"Number of other accent files: {len(other_accent_files)}")

# Each split is the english slice followed by the other-accent slice.
train_files = english_accent_files[:end_idx_train] + other_accent_files[:end_idx_train]
val_files = english_accent_files[end_idx_train:end_idx_val] + other_accent_files[end_idx_train:end_idx_val]
test_files = english_accent_files[end_idx_val:end_idx_test] + other_accent_files[end_idx_val:end_idx_test]

print(f"Number of training files: {len(train_files)}")
print(f"Number of validaiton files: {len(val_files)}")
print(f"Number of test files: {len(test_files)}")

Number of english accent files: 579
Number of other accent files: 198
Number of training files: 300
Number of validaiton files: 46
Number of test files: 50


In [27]:
def generate_model_data(path, files, segment_ms=500, min_silence_len=80,
                        silence_thresh=-30, source_dir=None):
    """Split recordings on silence and export fixed-length WAV clips.

    Each recording is cut into chunks with ``split_on_silence``; every chunk
    is truncated or right-padded with silence to exactly ``segment_ms``
    milliseconds and written to ``path`` as ``"<label>_word<k>.wav"``, where
    ``label`` is 1 if the source filename contains ``"english"`` and 0
    otherwise, and ``k`` counts clips across the whole call.

    Args:
        path: Output folder (created if it does not exist).
        files: Names of the MP3 recordings to process, relative to the source folder.
        segment_ms: Length of every exported clip, in milliseconds.
        min_silence_len: Minimum silence length (ms) that separates two chunks.
        silence_thresh: Level (dBFS) below which audio is treated as silence.
        source_dir: Folder holding the recordings; defaults to the notebook's
            global ``original_dir``.

    Returns:
        None. The clips are written to disk.
    """
    out_dir = Path(path)
    # Exporting into a missing folder would fail, so create it up front.
    out_dir.mkdir(parents=True, exist_ok=True)
    src_dir = Path(original_dir if source_dir is None else source_dir)

    counter = 0

    for f in files:
        label = 1 if "english" in f else 0

        sound_file = AudioSegment.from_mp3(src_dir / f)
        audio_chunks = split_on_silence(
            sound_file,
            # split wherever the audio stays silent for at least this many ms
            min_silence_len=min_silence_len,
            # anything quieter than this many dBFS counts as silence
            silence_thresh=silence_thresh,
        )

        for seg in audio_chunks:
            seg_len = len(seg)

            if seg_len >= segment_ms:
                # Long chunk: keep only the first segment_ms milliseconds.
                seg_standardized = seg[:segment_ms]
            else:
                # Short chunk: pad the tail with silence up to segment_ms.
                seg_standardized = seg + AudioSegment.silent(duration=segment_ms - seg_len)

            out_file = out_dir / f"{label}_word{counter}.wav"
            counter += 1
            seg_standardized.export(out_file, format="wav")

In [28]:
# Uncomment to create the WAV clip files in the train/val/test dataset folders

# generate_model_data(train_dir, train_files)
# print("Training words created")
# generate_model_data(val_dir, val_files)
# print("Validaiton words created")
# generate_model_data(test_dir, test_files)
# print("Testing words created")

Training words created
Validaiton words created
Testing words created


In [68]:
train_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/train')
val_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/val')
test_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/test')

# The second argument is MFCCDataset's `train` flag, which is meant to double
# the dataset with noise-augmented copies. Only the training split should be
# augmented; validation and test are evaluated on the clean clips.
train_data = MFCCDataset(train_data_dir, True)
val_data = MFCCDataset(val_data_dir, False)
test_data = MFCCDataset(test_data_dir, False)