### Speaker Verification System

In [13]:
#importing the libraries
import torchvision
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from random import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from torchvision import datasets
import librosa
from itertools import combinations
import random
import itertools

In [14]:
def stft(arr):
    arr=librosa.stft(arr, n_fft=1024, hop_length=512)
    arr=np.abs(arr).T
    return arr

In [15]:
import pickle
with open('data/hw4_trs.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('data/hw4_tes.pkl', 'rb') as f:
    test_data = pickle.load(f)

### Negative Samples

In [16]:
def get_positive_sample(j,data):
    pos_index=j*10
    pos_data=data[pos_index:pos_index+10]
    positive_list=[]
    pos_combinations=list(itertools.combinations(range(10),2))
    pos_samples=random.sample(pos_combinations,45)
    for i in range(len(pos_samples)):
        first_speaker=stft(pos_data[pos_combinations[i][0]])
        second_speaker=stft(pos_data[pos_combinations[i][1]])
        positive_list.append([first_speaker,second_speaker])
    return positive_list

In [17]:
def negative_samples(pos_speaker,data):
    negative_list=[]
    pos_index=pos_speaker * 10
    neg_index=pos_index + 10
    pos_speaker_arr=data[pos_index:neg_index]
    neg_speaker_arr=data[:pos_index] + data[neg_index:]
    neg_samples=random.sample(neg_speaker_arr,45)
    for i in range(45):
        positive_sample=random.choice(pos_speaker_arr)
        first_speaker_arr=stft(positive_sample)
        second_speaker_arr=stft(neg_samples[i])
        negative_list.append([first_speaker_arr,second_speaker_arr])
    return negative_list
        

In [19]:
train = []
train_df=list(train_data)
for i in range(0,50):
    positive_lst = get_positive_sample(i,train_df)
    negative_lst = negative_samples(i,train_df)
    train=train + positive_lst + negative_lst

In [21]:
x_train = np.stack(train)
x_train.shape

(4500, 2, 32, 513)

In [22]:
def padding(x):
    final_shape=np.zeros((2,45,513))
    final_shape[:,:32]=x
    return final_shape

In [23]:
x_train_pad=[]
for i in range(0,len(x_train)):
    arr=padding(x_train[i])
    x_train_pad.append(arr)

In [26]:
len(x_train_pad)

4500

In [27]:
target_arr = [] 
for i in range(50):
    y_tr = np.zeros(90, dtype = int)
    y_tr[45:] += 1
    target_arr.append(y_tr)
target_arr = np.hstack(target_arr)

In [28]:
from torch import Tensor
dataset = torch.utils.data.TensorDataset(Tensor(x_train_pad), Tensor(target_train))
trainloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
dataiter = iter(trainloader)
inputs, targets = dataiter.next()
print(inputs.shape)

torch.Size([32, 2, 45, 513])


In [101]:
from torch.autograd import Variable
import torch.nn as nn
drop_prob=0.2

class SiameseNetwork(nn.Module):# A simple implementation of siamese network, ResNet50 is used, and then connected by three fc layer.
    def __init__(self,input_dim,hidden_dim,output_dim,dropout_prob,layers):
        super(SiameseNetwork, self).__init__()
        self.input_size=input_dim
        self.output_size=output_dim
        self.dropout=dropout_prob
        self.layer_size=layers
        self.hidden_dim=hidden_dim
        self.gru = nn.GRU(input_size=self.input_size,hidden_size=self.hidden_dim,num_layers=self.layer_size,batch_first=True, dropout=0.2)
        self.fc1= nn.Linear(5760,self.output_size)
        self.fc1= self.init_weights(self.fc1)
        self.sigmoid = nn.Sigmoid()
        self.relu=nn.ReLU()
        self.tanh=nn.Tanh()
        
    def init_weights(self,m):
        torch.nn.init.xavier_normal_(m.weight)
        return m
    
    def forward(self,x1,x2):
        h0_1 = torch.zeros(self.layer_size, x1.size(0),128)
        h0_2 = torch.zeros(self.layer_size, x2.size(0),128)
        output1,final_hidden_state = self.gru(x1,h0_1)
        output1=self.tanh(output1)
        output2,final_hidden_state=self.gru(x2,h0_2)
        output2=self.tanh(output2)
        output =torch.multiply(output1,output2)
        output= output.reshape(output.size(0),-1)
        output=self.fc1(output)
        output=self.sigmoid(output)
        return output

In [102]:
gru_model=SiameseNetwork(513,128,1,0.2,2)
gru_model

SiameseNetwork(
  (gru): GRU(513, 128, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=5760, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (relu): ReLU()
  (tanh): Tanh()
)

In [103]:
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
loss_func = nn.BCELoss()

In [104]:
e = 75
final_output = []
for epoch in range(e):
    train_loss = []
    for data, label in trainloader:
        x=data[:,0]
        y=data[:,1]
        outputs = gru_model(x,y)
        loss = loss_func(outputs.squeeze(), label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        if epoch == e-1:
            final_output.append(outputs)
    if epoch == e-1:
        final_output = torch.cat(final_output)
    print('Epoch: {}, Train loss: {}'.format(epoch, np.round(np.mean(train_loss),7)))

Epoch: 0, Train loss: 0.694581
Epoch: 1, Train loss: 0.6876055
Epoch: 2, Train loss: 0.6724918
Epoch: 3, Train loss: 0.6127531
Epoch: 4, Train loss: 0.5498557
Epoch: 5, Train loss: 0.5083435
Epoch: 6, Train loss: 0.4768887
Epoch: 7, Train loss: 0.4499564
Epoch: 8, Train loss: 0.4278079
Epoch: 9, Train loss: 0.4066438
Epoch: 10, Train loss: 0.3875279
Epoch: 11, Train loss: 0.3714248
Epoch: 12, Train loss: 0.353963
Epoch: 13, Train loss: 0.3383491
Epoch: 14, Train loss: 0.3209416
Epoch: 15, Train loss: 0.3064107
Epoch: 16, Train loss: 0.2900203
Epoch: 17, Train loss: 0.2768416
Epoch: 18, Train loss: 0.2612205
Epoch: 19, Train loss: 0.2483585
Epoch: 20, Train loss: 0.2366068
Epoch: 21, Train loss: 0.2245398
Epoch: 22, Train loss: 0.2126477
Epoch: 23, Train loss: 0.2032895
Epoch: 24, Train loss: 0.194182
Epoch: 25, Train loss: 0.1863022
Epoch: 26, Train loss: 0.1769837
Epoch: 27, Train loss: 0.1702463
Epoch: 28, Train loss: 0.1653714
Epoch: 29, Train loss: 0.157456
Epoch: 30, Train loss: 0

In [105]:
### Test DataSet Preparation
test = []
test_df=list(test_data)
for i in range(0,20):
    positive_lst = get_positive_sample(i,test_df)
    negative_lst = negative_samples(i,test_df)
    test=test + positive_lst + negative_lst

In [106]:
x_test=np.stack(test)
x_test.shape

(1800, 2, 45, 513)

In [107]:
test_arr = [] 
for i in range(20):
    y_test= np.zeros(90, dtype = int)
    y_test[45:] += 1
    test_arr.append(y_test)
y_test_pad = np.hstack(test_arr)

In [108]:
from torch import Tensor
dataset = torch.utils.data.TensorDataset(Tensor(x_test), Tensor(y_test_pad))
testloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
dataiter = iter(testloader)
inputs, targets = dataiter.next()
print(inputs.shape)

torch.Size([32, 2, 45, 513])


In [109]:
prediction=[]
test_accuracy=[]
for images,labels in testloader:
    x=images[:,0]
    y=images[:,1]
    gru_model.eval()
    outputs = gru_model(x,y)
    prediction.append(outputs)
    outputs=torch.round(outputs.squeeze())
#     print(outputs)
    test_accuracy.append((outputs == labels).sum().item() / outputs.size(0))
prediction_valid=torch.cat(prediction,dim=0)

In [110]:
print('Test accuracy: {}'.format(np.round(np.mean(test_accuracy),4)))

Test accuracy: 0.7001


### We get a test accuracy of 70%`