In [13]:
import pandas as pd       
import os 
import math 
import numpy as np
import matplotlib.pyplot as plt
import librosa
from pydub import AudioSegment, silence
from pydub.silence import split_on_silence
import torch
import torch.nn as nn
import pickle
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from pathlib import Path

In [2]:
class MFCCDataset(Dataset):
    """In-memory dataset of MFCC feature stacks and their binary labels.

    Every entry found in ``root_dir`` is unpickled and is expected to be a
    dict with a ``'data'`` array and a ``'labels'`` sequence. All ``'data'``
    arrays are concatenated and reshaped to ``(-1, 3, 50, 44)``.

    Args:
        root_dir: directory containing the pickled batches (``str`` or
            ``pathlib.Path``).

    Raises:
        ValueError: if ``root_dir`` contains no entries.
    """

    def __init__(self, root_dir):
        self.root_dir = Path(root_dir)

        self.mfccs = []
        self.labels = []

        # List the directory once, in sorted order, so sample order is
        # reproducible across runs and platforms.
        batch_names = sorted(os.listdir(self.root_dir))
        if not batch_names:
            raise ValueError(f"no data files found in {self.root_dir}")

        for batch_name in batch_names:
            # NOTE(review): pickle.load executes arbitrary code on load; only
            # point this at directories whose contents you produced yourself.
            with open(self.root_dir / batch_name, 'rb') as handle:
                entry = pickle.load(handle)
            self.mfccs.append(entry['data'])
            self.labels.extend(entry['labels'])

        self.mfccs = torch.from_numpy(np.vstack(self.mfccs).reshape(-1, 3, 50, 44)).float()
        self.labels = torch.tensor(self.labels, dtype=torch.float)

    def __len__(self):
        """Return the number of samples held in memory."""
        return len(self.mfccs)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.mfccs[idx], self.labels[idx]

In [52]:
def generate_dataset(df, files, accents, sizes):
    """Append ``(filename, label)`` tuples to ``files`` for each accent.

    Rows of ``df`` whose ``'accent'`` column equals the accent are selected
    and their ``'filename'`` values collected. The label is 1 for ``'us'``
    and 0 for every other accent. If the accent is a key of ``sizes``, only
    the first ``sizes[accent]`` filenames are kept.

    ``files`` is mutated in place; nothing is returned.
    """
    for accent in accents:
        label = 1 if accent == 'us' else 0
        matching = df.loc[df['accent'] == accent, 'filename'].tolist()
        selected = matching[:sizes[accent]] if accent in sizes else matching
        files.extend((name, label) for name in selected)

In [75]:
#Data setup for kaggle common voice dataset
# NOTE(review): every path below is an absolute, machine-specific location;
# adjust them before running on another machine.

# Destination directories for the generated feature pickles.
train_dir =  Path('C:/Users/omar_/Documents/cockatoos/data/train')
test_dir =  Path('C:/Users/omar_/Documents/cockatoos/data/test')

# Source directories holding the raw audio clips.
data_train_src =  Path('C:/Users/omar_/Documents/kaggle_voice/archive (1)/cv-valid-train')
data_test_src =  Path('C:/Users/omar_/Documents/kaggle_voice/archive (1)/cv-valid-test')

# Metadata tables; generate_dataset reads their 'accent' and 'filename' columns.
train_csv = Path('C:/Users/omar_/Documents/kaggle_voice/archive (1)/cv-valid-train.csv')
test_csv = Path('C:/Users/omar_/Documents/kaggle_voice/archive (1)/cv-valid-test.csv')

# Filled in place by generate_dataset with (filename, label) tuples.
# val_files is initialised here but not populated in this cell.
train_files = []
val_files = []
test_files = []

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)


# 'us' is the positive class (label 1); every other accent listed is label 0.
accents = ['malaysia', 'african', 'wales', 'philippines','hongkong','singapore', 'indian', 'us']
# Per-accent caps: at most this many filenames are taken for the listed accents.
train_sizes = {'us': 6000, 'indian': 4000}
generate_dataset(train_df, train_files, accents, train_sizes)
test_sizes = {'us': 150}
generate_dataset(test_df, test_files, accents, test_sizes)

        
print(f"Number of training files: {len(train_files)}")
print(f"Number of test files: {len(test_files)}")

Number of training files: 12166
Number of test files: 287


In [63]:
  def generate_mfcc_data(mfcc):
      """Standardise each row of ``mfcc`` and stack it with its deltas.

      Each row along axis 0 is centred on its own mean and, when its standard
      deviation is non-zero, divided by that standard deviation. First- and
      second-order deltas of the standardised matrix are then computed with
      ``librosa.feature.delta``.

      Args:
          mfcc: array indexed by row along axis 0 (called with one MFCC
              matrix per audio segment elsewhere in this notebook).

      Returns:
          ``np.stack((standardised, delta1, delta2))`` -- a new leading axis
          of length 3 in front of the input's shape.
      """
      mfcc_standardized = np.zeros(mfcc.shape)
      for b in range(mfcc.shape[0]):
          mfcc_slice = mfcc[b, :]
          centered = mfcc_slice - np.mean(mfcc_slice)
          spread = np.std(centered)
          # A constant row has zero spread: keep the centred (all-zero) row
          # rather than dividing by zero. The original left the scaled value
          # unassigned here, which raised on the first row or silently reused
          # the previous row's values on later rows.
          mfcc_standardized[b, :] = centered / spread if spread != 0 else centered

      delta1 = librosa.feature.delta(mfcc_standardized, order=1)
      delta2 = librosa.feature.delta(mfcc_standardized, order=2)
      mfcc_data = np.stack((mfcc_standardized, delta1, delta2))

      return mfcc_data

In [64]:
def segment_and_standardize_audio(path, seg_size, min_nonzero=45000):
    """Split an MP3 into fixed-length chunks and drop mostly-silent ones.

    The file is cut into consecutive slices of ``seg_size`` (pydub slices and
    lengths are in milliseconds). A final shorter slice is padded with silence
    up to ``seg_size``. A chunk is kept only when it has more than
    ``min_nonzero`` non-zero samples.

    Args:
        path: location of the MP3 file.
        seg_size: chunk length, in the units used by ``len()`` and slicing of
            the loaded segment.
        min_nonzero: minimum number of non-zero samples (exclusive) for a
            chunk to be kept. Defaults to 45000, the value previously
            hard-coded. NOTE(review): this is an absolute sample count, so the
            fraction of the chunk it represents depends on ``seg_size`` and
            the file's sample rate -- tune it when either changes.

    Returns:
        List of kept chunks, in file order.
    """
    sound_file = AudioSegment.from_mp3(path)
    # Ceil division: a trailing partial chunk still counts as a chunk.
    limit = -(-len(sound_file) // seg_size)
    chunks = []
    for i in range(limit):
        chunk = sound_file[i * seg_size : (i + 1) * seg_size]
        shortfall = seg_size - len(chunk)
        if shortfall > 0:
            chunk = chunk + AudioSegment.silent(duration=shortfall)

        if np.count_nonzero(chunk.get_array_of_samples()) > min_nonzero:
            chunks.append(chunk)
    return chunks

In [65]:
def generate_model_data(src, dst, files, train, mean=0, std=1):
    """Extract normalised MFCC features for ``files`` and pickle them to ``dst``.

    Each file is split into 1000-unit segments by
    ``segment_and_standardize_audio``; every segment is resampled to 22050 Hz
    and turned into a 50-coefficient MFCC matrix. The matrices are normalised
    element-wise, expanded into (standardised, delta1, delta2) stacks by
    ``generate_mfcc_data`` and written as six pickles named
    ``data_batch_1.pickle`` .. ``data_batch_6.pickle``, each a dict with keys
    ``'data'`` and ``'labels'``.

    Args:
        src: ``Path`` of the directory holding the audio files.
        dst: ``Path`` of the output directory (created if missing).
        files: iterable of entries whose item 0 is a filename relative to
            ``src`` and item 1 is the label.
        train: when true, ``mean``/``std`` are computed from this data and
            returned; otherwise the supplied ``mean``/``std`` are applied.
        mean: normalisation mean used when ``train`` is false.
        std: normalisation std used when ``train`` is false.

    Returns:
        ``(mean, std)`` when ``train`` is true, otherwise ``None``.

    Raises:
        ValueError: if no usable audio segment was produced from ``files``.
    """
    seg_size = 1000
    target_sr = 22050
    n_mfcc = 50
    mfcc_width = 44  # frame count every segment's MFCC matrix is reshaped to
    n_batches = 6

    mfccs = []
    labels = []

    for processed, f in enumerate(files, start=1):
        # use for common voice data
        label = f[1]
        for seg in segment_and_standardize_audio(src / f[0], seg_size):
            samples = seg.get_array_of_samples()
            arr = np.array(samples).astype(np.float32) / 32768  # 16 bit
            # Sample rates are passed by keyword: recent librosa releases
            # reject them as positional arguments.
            arr = librosa.resample(arr, orig_sr=seg.frame_rate, target_sr=target_sr,
                                   res_type='kaiser_best')
            mfccs.append(librosa.feature.mfcc(y=arr, sr=target_sr, n_mfcc=n_mfcc))
            labels.append(label)

        if processed % 100 == 0:
            print(f"Processed {processed} files")

    if not mfccs:
        raise ValueError("no usable audio segments were produced from `files`")

    all_data = np.vstack(mfccs).reshape(-1, n_mfcc, mfcc_width)
    if train:
        mean = all_data.mean(axis=0)
        std = all_data.std(axis=0)
        # A coefficient that never varies has zero std; scale it by 1 so the
        # normalised value is 0 instead of NaN/inf. The guarded std is what
        # gets returned, so the test split is normalised identically.
        std[std == 0] = 1
    all_data = (all_data - mean) / std

    items = [generate_mfcc_data(sample) for sample in all_data]

    dst.mkdir(parents=True, exist_ok=True)
    batch_size = len(labels) // n_batches
    for j in range(n_batches):
        start = j * batch_size
        # The last batch absorbs the remainder.
        end = len(labels) if j == n_batches - 1 else start + batch_size
        # np.asarray (unlike np.vstack) tolerates an empty slice, which
        # happens for the leading batches when there are fewer than six
        # segments in total.
        batch_mfcc = np.asarray(items[start:end]).reshape(-1, 3, n_mfcc, mfcc_width)
        entry = {'data': batch_mfcc, 'labels': labels[start:end]}
        with open(dst / f'data_batch_{j+1}.pickle', 'wb') as handle:
            pickle.dump(entry, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if train:
        return mean, std

In [76]:
#Uncomment to create the files for the dataset folders (common voice)

# mean, std = generate_model_data(data_train_src, train_dir, train_files, True)
# print("Training words created")
# generate_model_data(data_test_src, test_dir, test_files, False, mean, std)
# print("Testing words created")

Processed 100 files
Processed 200 files
Testing words created


In [77]:
# Directories holding the pickled feature batches. These are the same
# locations as train_dir / test_dir defined in the data-setup cell.
# NOTE(review): absolute, machine-specific paths.
train_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/train')
test_data_dir = Path('C:/Users/omar_/Documents/cockatoos/data/test')

# MFCCDataset unpickles every entry in the directory and keeps all samples
# in memory as tensors.
train_data = MFCCDataset(train_data_dir)
test_data = MFCCDataset(test_data_dir)

In [17]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [71]:
class AccentClassifier(nn.Module):
    """Small CNN that maps a ``(N, 3, 50, 44)`` input to one probability.

    Two conv -> ReLU -> batch-norm -> max-pool stages feed a two-layer
    fully connected head that ends in a sigmoid, so the output has shape
    ``(N, 1)`` with values in ``[0, 1]``.
    """

    def __init__(self):
        super().__init__()
        # The position of each module in this list determines the parameter
        # names in the state_dict ('layers.<index>.*'); keep the order fixed
        # so saved checkpoints still load.
        stages = [
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=32),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
            nn.BatchNorm2d(num_features=64),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.5),
            nn.Flatten(start_dim=1, end_dim=3),
            # 64 channels * 11 * 9 spatial positions for a 50x44 input.
            nn.Linear(in_features=6336, out_features=256),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the probabilities."""
        return self.layers(x)

In [85]:
# 10-fold cross-validation over train_data. A fresh AccentClassifier is
# trained for `epochs` epochs on each fold; the model with the highest
# held-out fold accuracy so far is written to disk via save_model.
# NOTE: save_model is defined in a later cell of this notebook and must have
# been executed before this cell is run.
epochs = 100
kfold = KFold(n_splits=10, shuffle=True)
best_accuracy = 0
criterion = nn.BCELoss()  # built once instead of once per batch

for fold, (train_ids, test_ids) in enumerate(kfold.split(train_data)):
    torch.cuda.empty_cache()
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    trainloader = torch.utils.data.DataLoader(
                      train_data,
                      batch_size=128, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      train_data,
                      batch_size=32, sampler=test_subsampler)

    model = AccentClassifier().to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    for epoch in range(epochs):
        running_loss = 0
        correct = 0
        for inputs, labels in trainloader:
            targets = labels.to(device).reshape(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs.to(device))
            loss = criterion(outputs, targets)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Vectorised accuracy count. The previous reshape/squeeze plus a
            # per-element loop raised IndexError whenever the final batch
            # held exactly one sample (squeeze yields a 0-d tensor).
            correct += ((outputs >= 0.5).float() == targets).sum().item()

        print(f"Epoch {epoch + 1}  Loss: {running_loss / len(trainloader)}  Accuracy: {100 * correct / len(train_ids)}")

    # Evaluate on this fold's held-out indices.
    model.eval()
    test_loss = 0
    test_correct = 0
    with torch.no_grad():
        for d, l in testloader:
            targets = l.to(device).reshape(-1, 1)
            o = model(d.to(device))
            test_loss += criterion(o, targets).item()
            test_correct += ((o >= 0.5).float() == targets).sum().item()

    accuracy = 100 * test_correct / len(test_ids)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        print("Higher validaiton accuracy score achieved! Saving model.")
        save_model(model)

    print(f"Model test accuracy for fold {fold}: {accuracy} ")

Epoch 1  Loss: 0.9169487223932238  Accuracy: 57.13608345187293
Epoch 2  Loss: 0.6490876372113372  Accuracy: 61.5398293029872
Epoch 3  Loss: 0.6421443113323414  Accuracy: 62.39331436699858
Epoch 4  Loss: 0.6372921959017263  Accuracy: 62.84376481744903
Epoch 5  Loss: 0.6286762456099192  Accuracy: 64.07657657657657
Epoch 6  Loss: 0.6228202922326146  Accuracy: 64.48850165955429
Epoch 7  Loss: 0.6192113021106431  Accuracy: 65.1967757230915
Epoch 8  Loss: 0.6126357872377742  Accuracy: 65.98210052157421
Epoch 9  Loss: 0.604826879772273  Accuracy: 66.93930772878142
Epoch 10  Loss: 0.5992452933481245  Accuracy: 66.77631578947368
Epoch 11  Loss: 0.5914975576328508  Accuracy: 67.7898293029872
Epoch 12  Loss: 0.5850246977625471  Accuracy: 68.53959222380276
Epoch 13  Loss: 0.5760906907645139  Accuracy: 69.1678520625889
Epoch 14  Loss: 0.5732338963584467  Accuracy: 69.36936936936937
Epoch 15  Loss: 0.5652462466422356  Accuracy: 69.99466571834994
Epoch 16  Loss: 0.5588383462392923  Accuracy: 70.63477

Epoch 29  Loss: 0.49660611976728297  Accuracy: 75.64900426742533
Epoch 30  Loss: 0.491098141579917  Accuracy: 75.82385016595543
Epoch 31  Loss: 0.4868248723673098  Accuracy: 76.20614035087719
Epoch 32  Loss: 0.48685861920768564  Accuracy: 76.22688477951635
Epoch 33  Loss: 0.48210234691699344  Accuracy: 76.48174490279753
Epoch 34  Loss: 0.4815569393562548  Accuracy: 76.47285443338075
Epoch 35  Loss: 0.47510363387339044  Accuracy: 77.08926031294452
Epoch 36  Loss: 0.47319610958749597  Accuracy: 77.24039829302987
Epoch 37  Loss: 0.4701695430910949  Accuracy: 77.21965386439071
Epoch 38  Loss: 0.46727703590736247  Accuracy: 77.38857278330963
Epoch 39  Loss: 0.46336072502714215  Accuracy: 77.79457088667615
Epoch 40  Loss: 0.45699054318847077  Accuracy: 78.08795637743006
Epoch 41  Loss: 0.458226778967814  Accuracy: 78.20353247984826
Epoch 42  Loss: 0.4556111226027662  Accuracy: 78.15315315315316
Epoch 43  Loss: 0.452579181980003  Accuracy: 78.26872925557136
Epoch 44  Loss: 0.45018915583690006

Epoch 56  Loss: 0.421578352537119  Accuracy: 80.25130393551446
Epoch 57  Loss: 0.4200916453970201  Accuracy: 80.45282124229493
Epoch 58  Loss: 0.4217178052799268  Accuracy: 80.31353722143196
Epoch 59  Loss: 0.42514178258451546  Accuracy: 80.19796111901375
Epoch 60  Loss: 0.4209017533470284  Accuracy: 80.532835467046
Epoch 61  Loss: 0.41740291689833003  Accuracy: 80.66026552868658
Epoch 62  Loss: 0.4163429496640509  Accuracy: 80.5832147937411
Epoch 63  Loss: 0.41189706201354664  Accuracy: 81.0099573257468
Epoch 64  Loss: 0.4075225643587835  Accuracy: 81.27963489805595
Epoch 65  Loss: 0.4135451167821884  Accuracy: 80.86770981507824
Epoch 66  Loss: 0.4086885808995276  Accuracy: 81.25889046941678
Epoch 67  Loss: 0.4126396237900763  Accuracy: 80.87067330488384
Epoch 68  Loss: 0.4080764281027245  Accuracy: 81.20554765291607
Epoch 69  Loss: 0.4020135547175552  Accuracy: 81.52264106211474
Epoch 70  Loss: 0.40148578708370525  Accuracy: 81.45744428639165
Epoch 71  Loss: 0.4066797601002635  Accur

Epoch 84  Loss: 0.39443254459536436  Accuracy: 81.94049312470365
Epoch 85  Loss: 0.39334077348537516  Accuracy: 81.75379326695116
Epoch 86  Loss: 0.3887479229632652  Accuracy: 82.34945471787577
Epoch 87  Loss: 0.39067475147771114  Accuracy: 82.05903271692745
Epoch 88  Loss: 0.39269690813891817  Accuracy: 82.31389284020862
Epoch 89  Loss: 0.39338038015094673  Accuracy: 82.18053579895685
Epoch 90  Loss: 0.39400233350919955  Accuracy: 81.80120910384068
Epoch 91  Loss: 0.38707614802952967  Accuracy: 82.5154101469891
Epoch 92  Loss: 0.383443993600932  Accuracy: 82.6280227596017
Epoch 93  Loss: 0.38493003301096684  Accuracy: 82.50355618776672
Epoch 94  Loss: 0.38679793527857825  Accuracy: 82.31685633001422
Epoch 95  Loss: 0.3819002838184436  Accuracy: 82.59542437174017
Epoch 96  Loss: 0.38568777873209026  Accuracy: 82.43243243243244
Epoch 97  Loss: 0.3817283412949605  Accuracy: 82.7584163110479
Epoch 98  Loss: 0.3813427938995036  Accuracy: 82.76730678046468
Epoch 99  Loss: 0.3816577901668621

Epoch 10  Loss: 0.6325556100769476  Accuracy: 63.3397540376352
Epoch 11  Loss: 0.6232631369070574  Accuracy: 64.4006519484368
Epoch 12  Loss: 0.6136580195390817  Accuracy: 65.5060008890206
Epoch 13  Loss: 0.6063539087772369  Accuracy: 66.24388798340495
Epoch 14  Loss: 0.5973704247312113  Accuracy: 67.15068899096163
Epoch 15  Loss: 0.5886619820287733  Accuracy: 68.12268484219885
Epoch 16  Loss: 0.5820549440880617  Accuracy: 68.8013038968736
Epoch 17  Loss: 0.5754690262855906  Accuracy: 68.95836420210402
Epoch 18  Loss: 0.5683816354157347  Accuracy: 69.8622018076752
Epoch 19  Loss: 0.5627644454891031  Accuracy: 70.47562601866943
Epoch 20  Loss: 0.5525666813958775  Accuracy: 71.23129352496666
Epoch 21  Loss: 0.5450943447649479  Accuracy: 71.826937324048
Epoch 22  Loss: 0.540369269974304  Accuracy: 72.06697288487183
Epoch 23  Loss: 0.5331458951713461  Accuracy: 72.6033486442436
Epoch 24  Loss: 0.5281519872898405  Accuracy: 73.21973625722329
Epoch 25  Loss: 0.5238708859365998  Accuracy: 73.

Epoch 37  Loss: 0.48703854914867517  Accuracy: 76.28982071417988
Epoch 38  Loss: 0.48175770145925606  Accuracy: 76.51503926507631
Epoch 39  Loss: 0.4811611912693038  Accuracy: 76.34612535190399
Epoch 40  Loss: 0.47709802797797957  Accuracy: 76.77878204178397
Epoch 41  Loss: 0.4763013797275948  Accuracy: 76.82915987553712
Epoch 42  Loss: 0.4732457713195772  Accuracy: 76.94176915098534
Epoch 43  Loss: 0.46842335096814414  Accuracy: 77.38924285079271
Epoch 44  Loss: 0.4668210929993427  Accuracy: 77.49296192028449
Epoch 45  Loss: 0.46650426947709284  Accuracy: 77.70336346125352
Epoch 46  Loss: 0.46117015702254843  Accuracy: 78.03822788561268
Epoch 47  Loss: 0.46260627178531705  Accuracy: 77.68261964735517
Epoch 48  Loss: 0.45435556125911797  Accuracy: 78.50348199733294
Epoch 49  Loss: 0.4519808345446081  Accuracy: 78.43236034968143
Epoch 50  Loss: 0.4556827751750296  Accuracy: 78.07675211142391
Epoch 51  Loss: 0.45214815132997255  Accuracy: 78.36716550600089
Epoch 52  Loss: 0.4531509712564

Epoch 65  Loss: 0.4193974170043613  Accuracy: 80.50970514150245
Epoch 66  Loss: 0.42221263242941914  Accuracy: 80.25485257075123
Epoch 67  Loss: 0.41814804698030156  Accuracy: 80.56304637724108
Epoch 68  Loss: 0.41261147515791835  Accuracy: 80.6875092606312
Epoch 69  Loss: 0.4161885351394162  Accuracy: 80.72010668247148
Epoch 70  Loss: 0.414469708998998  Accuracy: 80.94828863535338
Epoch 71  Loss: 0.4137233869370186  Accuracy: 80.87716698770188
Epoch 72  Loss: 0.4131405447242838  Accuracy: 81.18239739220625
Epoch 73  Loss: 0.40703792045965337  Accuracy: 81.31575048155283
Epoch 74  Loss: 0.4109144066319321  Accuracy: 81.10831234256926
Epoch 75  Loss: 0.40592755964308075  Accuracy: 81.19425100014817
Epoch 76  Loss: 0.4105875583534891  Accuracy: 80.92754482145503
Epoch 77  Loss: 0.40472198921171104  Accuracy: 81.49651800266706
Epoch 78  Loss: 0.4052966260774569  Accuracy: 81.434286560972
Epoch 79  Loss: 0.40469095381823456  Accuracy: 81.35427470736406
Epoch 80  Loss: 0.4002432235036836  A

Epoch 92  Loss: 0.38245852133541397  Accuracy: 82.58408653133797
Epoch 93  Loss: 0.38246338814496994  Accuracy: 82.61964735516372
Epoch 94  Loss: 0.3797685811578324  Accuracy: 82.84190250407468
Epoch 95  Loss: 0.37784873084588483  Accuracy: 83.03156023114535
Epoch 96  Loss: 0.380058191716671  Accuracy: 82.77078085642317
Epoch 97  Loss: 0.3809714975456397  Accuracy: 82.6818787968588
Epoch 98  Loss: 0.378518572923812  Accuracy: 82.91598755371166
Epoch 99  Loss: 0.37468784595980786  Accuracy: 82.86560971995851
Epoch 100  Loss: 0.3756586770442399  Accuracy: 83.00488961327603
Model test accuracy for fold 8: 65.45745532141905 
Epoch 1  Loss: 0.8896639213869066  Accuracy: 58.2219588087124
Epoch 2  Loss: 0.6459853366920443  Accuracy: 61.77507778930212
Epoch 3  Loss: 0.6395487437645594  Accuracy: 62.7678174544377
Epoch 4  Loss: 0.6366598369045691  Accuracy: 63.41680248925767
Epoch 5  Loss: 0.631266241949616  Accuracy: 63.84056897318121
Epoch 6  Loss: 0.6212200135218374  Accuracy: 65.07630760112

In [88]:
# Evaluate the checkpoint written by save_model on the held-out test set.
test_loader = DataLoader(test_data,batch_size=32,shuffle=True)
m = AccentClassifier()
# map_location lets a checkpoint saved from a GPU run load on a CPU-only
# machine (and vice versa).
m.load_state_dict(torch.load("binary_accent_classifier.pt", map_location=device))
m.to(device)

m.eval()
eval_criterion = nn.BCELoss()
test_loss = 0
test_correct = 0
with torch.no_grad():
    for d, l in test_loader:
        targets = l.to(device).reshape(-1, 1)
        o = m(d.to(device))
        test_loss += eval_criterion(o, targets).item()
        # Vectorised count; the previous squeeze-then-index loop raised
        # IndexError when the last batch contained a single sample.
        test_correct += ((o >= 0.5).float() == targets).sum().item()

accuracy = 100 * test_correct / len(test_data)

print(accuracy)

66.59551760939168


In [81]:
#Classifying specific set of audio samples
def predict(test_dir, mean=0, std=1):
    """Classify every audio file in ``test_dir`` by majority vote over segments.

    Each file is split into 1000-unit segments; every segment is resampled to
    22050 Hz, converted to a 50-coefficient MFCC matrix, normalised with
    ``mean``/``std``, expanded by ``generate_mfcc_data`` and scored by the
    notebook-level ``model``. A file is labelled 1 when at least half of its
    segments score >= 0.5, otherwise 0.

    Args:
        test_dir: directory containing the audio files (``str`` or ``Path``).
        mean: normalisation mean applied to each MFCC matrix before the
            deltas are computed. Pass the value returned by
            ``generate_model_data(..., train=True)`` to match the training
            pipeline; the default of 0 reproduces the previous behaviour.
        std: normalisation std, same conventions as ``mean`` (default 1).

    Returns:
        Dict mapping filename to 0 or 1. Files for which no segment survives
        the silence filter are omitted (they previously caused a
        ZeroDivisionError).

    Note:
        Uses the global ``model`` and ``device`` and switches ``model`` to
        eval mode so dropout and batch-norm behave deterministically.
    """
    test_dir = Path(test_dir)
    predictions = dict()
    model.eval()
    with torch.no_grad():
        for f in os.listdir(test_dir):
            audio_chunks = segment_and_standardize_audio(test_dir / f, 1000)
            if not audio_chunks:
                # Nothing to vote on; skip rather than divide by zero.
                continue

            num_american_pred = 0
            for seg in audio_chunks:
                samples = seg.get_array_of_samples()
                arr = np.array(samples).astype(np.float32) / 32768  # 16 bit
                # Keyword sample rates: required by recent librosa releases.
                arr = librosa.resample(arr, orig_sr=seg.frame_rate, target_sr=22050,
                                       res_type='kaiser_best')

                mfcc = librosa.feature.mfcc(y=arr, sr=22050, n_mfcc=50)
                data = generate_mfcc_data((mfcc - mean) / std)
                pred = model(torch.from_numpy(data).unsqueeze(0).float().to(device)).item()
                # Same >= 0.5 decision threshold as the training and
                # evaluation cells.
                if pred >= 0.5:
                    num_american_pred += 1

            frac_american_preds = num_american_pred / len(audio_chunks)
            predictions[f] = 1 if frac_american_preds >= 0.5 else 0

    return predictions

In [80]:
def save_model(model):
    """Persist ``model`` as a PyTorch state dict and as an ONNX graph.

    Writes ``binary_accent_classifier.pt`` (state dict) and
    ``binary_accent_classifier.onnx`` (opset 11) to the current working
    directory. The ONNX export is traced with a random ``(1, 3, 50, 44)``
    input placed on the notebook-level ``device``.
    """
    weights_path = "binary_accent_classifier.pt"
    onnx_path = "binary_accent_classifier.onnx"

    # Example input used only to trace the graph for the ONNX export.
    example_input = torch.randn(1, 3, 50, 44, requires_grad=True).to(device)

    torch.save(model.state_dict(), weights_path)
    torch.onnx.export(model, example_input, onnx_path, opset_version=11)