In [1]:
import pandas as pd       
import os 
import math 
import numpy as np
import matplotlib.pyplot as plt
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

In [131]:
class MFCCDataset(Dataset):
    def __init__(self, root_dir, train):
        self.root_dir = root_dir
        self.train = train
        
        
        self.mfccs = []
        self.labels = []
        
        for i in range(len(os.listdir(self.root_dir))):

            f = os.listdir(self.root_dir)[i]
            label = torch.tensor(int(f[0]), dtype=torch.float)
            y, sr = librosa.load(self.root_dir / f)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            data = self.__generate_mfcc_data(mfcc)
            self.mfccs.append(torch.from_numpy(data).float())
            self.labels.append(label)

            if self.train:
                noise = np.random.normal(0,1, mfcc.shape)
                mfcc_noisy = mfcc + noise
                noisy_data = self.__generate_mfcc_data(mfcc_noisy)
                self.mfccs.append(torch.from_numpy(noisy_data).float())
                self.labels.append(label)

            
    def __generate_mfcc_data(self, mfcc):
        mfcc_standardized = np.zeros(mfcc.shape)
        for b in range(mfcc.shape[0]):
            mfcc_slice = mfcc[b,:]
            centered = mfcc_slice - np.mean(mfcc_slice)
            if np.std(centered) != 0:
                centered_scaled = centered / np.std(centered)

            mfcc_standardized[b,:] = centered_scaled

        delta1 = librosa.feature.delta(mfcc_standardized, order=1)
        delta2 = librosa.feature.delta(mfcc_standardized, order=2)
        mfcc_data = np.stack((mfcc_standardized,delta1,delta2))
        
        return mfcc_data
    
    def __len__(self):
        
        return len(self.mfccs)

    def __getitem__(self, idx):
        
        return self.mfccs[idx], self.labels[idx]

In [3]:
original_dir = Path('C:/Users/omar_/Documents/cockatoos/data/accent_samples/recordings/recordings')
train_dir =  Path('C:/Users/omar_/Documents/cockatoos/data/train')
val_dir =  Path('C:/Users/omar_/Documents/cockatoos/data/val')
test_dir =  Path('C:/Users/omar_/Documents/cockatoos/data/test')

files = os.listdir(original_dir)


other_accent_types = ["mandarin", "japanese", "korean", "taiwanese", "cantonese", "thai", "indonesian"]

english_accent_files = []
other_accent_files = []

num_train_files = 150
num_val_files = 23
num_test_files = 25

end_idx_train = num_train_files
end_idx_val = end_idx_train + num_val_files
end_idx_test = end_idx_val + num_test_files

for f in files:
    if "english" in f:
        english_accent_files.append(f)
    
    if any(t in f for t in other_accent_types):
        other_accent_files.append(f)
        
np.random.shuffle(english_accent_files)
np.random.shuffle(other_accent_files)
        
print(f"Number of english accent files: {len(english_accent_files)}")
print(f"Number of other accent files: {len(other_accent_files)}")

train_files = english_accent_files[0:end_idx_train] + other_accent_files[0:end_idx_train]
val_files   = english_accent_files[end_idx_train:end_idx_val] + other_accent_files[end_idx_train:end_idx_val]
test_files  = english_accent_files[end_idx_val:end_idx_test] + other_accent_files[end_idx_val:end_idx_test]

print(f"Number of training files: {len(train_files)}")
print(f"Number of validaiton files: {len(val_files)}")
print(f"Number of test files: {len(test_files)}")

Number of english accent files: 579
Number of other accent files: 198
Number of training files: 300
Number of validaiton files: 46
Number of test files: 50


In [4]:
def generate_model_data(path, files):
    
    counter = 0
    seg_thresh = 500
    
    for f in files:
        if "english" in f:
            label = 1
        else:
            label = 0
            
        sound_file = AudioSegment.from_mp3(original_dir / f)
        audio_chunks = split_on_silence(sound_file, 
            # must be silent for at least half a second
            min_silence_len = 80,

            # consider it silent if quieter than -16 dBFS
            silence_thresh=-30
        )

        
        for seg in audio_chunks:
            
            seg_len = len(seg)
    
            if seg_len >= seg_thresh:
                seg_standardized = seg[0:seg_thresh]
            else:
                seg_standardized = seg + AudioSegment.silent(duration=(seg_thresh - seg_len))
                
            out_file = path / f"{label}_word{counter}.wav"
            counter += 1
            seg_standardized.export(out_file, format="wav")

In [25]:
#Uncomment to create the files for the dataset folders

# generate_model_data(train_dir, train_files)
# print("Training words created")
# generate_model_data(val_dir, val_files)
# print("Validaiton words created")
# generate_model_data(test_dir, test_files)
# print("Testing words created")

Training words created
Validaiton words created
Testing words created


In [132]:
train_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/train')
val_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/val')
test_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/test')

train_data = MFCCDataset(train_data_dir,True)
# val_data = MFCCDataset(val_data_dir,False)
# test_data = MFCCDataset(test_data_dir,False)

In [133]:
print(len(train_data))

30346


In [134]:
#Model definition
model = nn.Sequential(
            nn.Conv2d(3,32,3),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2),
            nn.Conv2d(32,64,3),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2),
            nn.Dropout(0.5),
            nn.Flatten(1,3),
            nn.Linear(768,128),
            nn.Dropout(0.5),
            nn.Linear(128,1),
            nn.Softmax(dim=0)
        ).to(device)

In [26]:
print(model)

Sequential(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (5): ReLU()
  (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): Dropout(p=0.5, inplace=False)
  (9): Flatten(start_dim=1, end_dim=3)
  (10): Linear(in_features=768, out_features=128, bias=True)
  (11): Dropout(p=0.5, inplace=False)
  (12): Linear(in_features=128, out_features=2, bias=True)
  (13): Softmax(dim=0)
)


In [135]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [136]:
train_loader = DataLoader(train_data,batch_size=32,shuffle=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 100

for epoch in range(epochs):
    
    running_loss = 0
    
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = nn.BCELoss()(outputs,labels.to(device).reshape(-1,1))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        
    print(f"Epoch:{epoch}, avg loss for epoch:{running_loss / len(train_loader)} ")

Epoch:0, avg loss for epoch:1.5608267308663266 
Epoch:1, avg loss for epoch:1.5227497042293168 
Epoch:2, avg loss for epoch:1.5127733751142238 
Epoch:3, avg loss for epoch:1.5045160447395514 
Epoch:4, avg loss for epoch:1.4993145132341175 
Epoch:5, avg loss for epoch:1.492054014361696 
Epoch:6, avg loss for epoch:1.4835579829296397 
Epoch:7, avg loss for epoch:1.4785768433039257 
Epoch:8, avg loss for epoch:1.471606018043293 
Epoch:9, avg loss for epoch:1.4679021722273027 
Epoch:10, avg loss for epoch:1.4631020287630807 
Epoch:11, avg loss for epoch:1.4579420624844517 
Epoch:12, avg loss for epoch:1.4540073163768137 
Epoch:13, avg loss for epoch:1.4482370840485657 
Epoch:14, avg loss for epoch:1.4448634328279404 
Epoch:15, avg loss for epoch:1.4441619467685045 
Epoch:16, avg loss for epoch:1.4399929461790715 
Epoch:17, avg loss for epoch:1.4359401049302425 
Epoch:18, avg loss for epoch:1.4324712093499488 
Epoch:19, avg loss for epoch:1.4303337505167728 
Epoch:20, avg loss for epoch:1.4