In [8]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()                    # Inherited from the parent class nn.Module
        self.fc1 = nn.Linear(input_size, hidden_size)  # 1st Full-Connected Layer: 784 (input data) -> 500 (hidden node)
        self.relu = nn.ReLU()                          # Non-Linear ReLU Layer: max(0,x)
        self.fc2 = nn.Linear(hidden_size, num_classes) # 2nd Full-Connected Layer: 500 (hidden node) -> 10 (output class)
    
    def forward(self, x):                              # Forward pass: stacking each layer together
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        #print(out)
        out = F.softmax(out, dim=0)
        return out

In [41]:
net = Net(input_size, hidden_size, num_classes)

In [37]:
import sys, os
import torch
#import librosa
import numpy as np
import pandas as pd
from torch import Tensor
from scipy.io import wavfile
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torchaudio
import numpy as np
import scipy.io.wavfile
import torch.nn.functional as F
import torch
import torch.nn as nn


In [38]:
class RawNetDataset(Dataset):
    def __init__(self,
                 transform=None,
                 mode="train",
                 files_dir=None,
                 base_dir="",
                 csv_file_dir="",
                 nb_time=59049,
                 train=True):
        
        self.base_dir = base_dir
        self.mode = mode
        self.transform = transform
        self.csv_file_dir = csv_file_dir
        self.files_dir = files_dir
        csv_file = pd.read_csv(csv_file_dir,sep="\t")
        self.nb_time = nb_time
        self.train = train
        #self.classes = {cls_name:i for i, cls_name in enumerate(csv_file["VoxCeleb1 ID"].unique())}
        self.classes = {'id10009': 0,'id10016': 1,'id10017': 2,'id10019': 3}

    def __len__(self):
        
        return len(self.files_dir)
    
    def __getitem__(self, idx):
        filename = self.files_dir[idx]
        classe = filename.split('/')[0]
        X, sample_rate = torchaudio.load(self.base_dir + filename)
        #print(" shape(X): ",type(X))
        label = self.classes[classe]
        self._pre_emphasis(X)
        if self.train:
            nb_time = X.shape[1]
            if nb_time > self.nb_time:
                start_idx = np.random.randint(low = 0,
                    high = nb_time - self.nb_time)
                X = X[:, start_idx:start_idx+self.nb_time]
                #print("nb_time: ",nb_time )
                #print("self.nb_time: ",self.nb_time)
            elif nb_time < self.nb_time:
                nb_dup = int(self.nb_time / nb_time) + 1
                X = np.tile(X, (1, nb_dup))[:, :self.nb_time]
                #print("taille inférieure")
            else:
                X = X
            #print("taille égale")
        #print(" type(X): ",X.size())
        return X, label

    def _pre_emphasis(self, x):
        '''
        Pre-emphasis for single channel input
        '''
        return np.asarray(x[:,1:] - 0.97 * x[:, :-1], dtype=np.float32) 



In [39]:

#input_size = 59049       # input size
#hidden_size = 500      # The number of nodes at the hidden layer


In [40]:
base_dir = 'data/wav/'
csv_file_dir = 'data/target/vox1_meta.csv'
def get_utt_list(src_dir):
    l_utt = []
    for r, ds, fs in os.walk(src_dir):
        r = r.replace('\\', '/')   
        base = '/'.join(r.split('/')[-2:])+'/'
        for f in fs:
            l_utt.append(base+f[:-4]+'.wav')
    return l_utt
list_IDs = get_utt_list(base_dir)

In [41]:
# split train,  validation data
import random
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_valid = train_test_split(list_IDs, test_size=0.05, random_state=42, shuffle=True)



In [42]:
#TRAIN DATA
num_classes = 4      # The number of output classes. In this case, from 0 to 3
num_epochs = 5         # The number of times entire dataset is trained
batch_size = 100       # The size of input data took for one iteration
learning_rate = 0.001  # The speed of convergence
train_dataset = RawNetDataset(files_dir=X_train,base_dir=base_dir,csv_file_dir=csv_file_dir, train=True)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)

In [43]:
#VALIDATION DATA : for cross validation 
valid_dataset = RawNetDataset(files_dir=X_valid,base_dir=base_dir,csv_file_dir=csv_file_dir, train=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, 
                                          batch_size=len(valid_dataset), 
                                          shuffle=False)

In [44]:
class RawNet(nn.Module):
    def __init__(self):
        super(RawNet, self).__init__()

        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = nn.Conv1d(in_channels = 1,#1
			out_channels = 128,#128
			kernel_size = 3,#3
                        padding = 0,
                        stride = 3
        )

        self.bn = nn.BatchNorm1d(num_features = 128)
        self.gru = nn.GRU(input_size = 256,
			hidden_size = 1024,
                        num_layers = 1,
                        batch_first = True)

        self.gru_fc1 = nn.Linear(in_features = 1024,
                                 out_features = 1024)

        self.gru_fc2 = nn.Linear(in_features = 1024,
                                 out_features = 4)

        self.bn_before_gru = nn.BatchNorm1d(num_features = 256)


        self.conv2 = nn.Conv1d(in_channels = 128,
                               out_channels = 128,
                               kernel_size = 3,
                               padding = 1,
                               stride = 1
        )


        self.bn2 = nn.BatchNorm1d(num_features=128)
        
        self.conv3_1_1 = nn.Conv1d(in_channels = 128,
                               out_channels = 256,
                               kernel_size = 3,
                               padding = 1,
                               stride = 1
        )

        self.conv3_1 = nn.Conv1d(in_channels = 256,
                               out_channels = 256,
                               kernel_size = 3,
                               padding = 1,
                               stride = 1
        )



        self.conv3_2 = nn.Conv1d(in_channels = 256,
                               out_channels = 256,
                               kernel_size = 3,
                               padding = 1,
                               stride = 1
        )

        self.conv3_3 = nn.Conv1d(in_channels = 128,
                               out_channels = 256,
                               kernel_size = 1,
                               padding = 0,
                               stride = 1
        )


        self.bn3_1_1 = nn.BatchNorm1d(num_features=256)
        self.bn3_1 = nn.BatchNorm1d(num_features=128)
        self.bn3_2 = nn.BatchNorm1d(num_features=256)


        self.mp = nn.MaxPool1d(3)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn(out)
        out = self.lrelu(out)

        print("shape conv 1 : ", out.shape)

        #-------- Block 1 --------------
        
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.lrelu_keras(out)
        out = self.conv2(out)
        out = self.mp(out)

        out = self.bn2(out)
        out = self.lrelu_keras(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.lrelu_keras(out)
        out = self.conv2(out)
        out = self.mp(out)

        print("shape resblock 1 : ", out.shape)

        #-------- Block 2 --------------
        
        out_identite = out
        out = self.conv3_1_1(out)
        out = self.bn3_2(out)
        out = self.lrelu_keras(out)
        out = self.conv3_2(out)
        out += self.conv3_3(out_identite)
        out = self.mp(out)

        out_identite = out
        out = self.bn3_1_1(out)
        out = self.lrelu_keras(out)
        out = self.conv3_1(out)
        out = self.bn3_2(out)
        out = self.lrelu_keras(out)
        out = self.conv3_2(out)
        out += out_identite
        out = self.mp(out)

        out_identite = out
        out = self.bn3_1_1(out)
        out = self.lrelu_keras(out)
        out = self.conv3_1(out)
        out = self.bn3_2(out)
        out = self.lrelu_keras(out)
        out = self.conv3_2(out)
        out += out_identite
        out = self.mp(out)

        out_identite = out
        out = self.bn3_1_1(out)
        out = self.lrelu_keras(out)
        out = self.conv3_1(out)
        out = self.bn3_2(out)
        out = self.lrelu_keras(out)
        out = self.conv3_2(out)
        out += out_identite
        out = self.mp(out)

        print("shape resblock 2 : ", out.shape)
        

        #-------- Gru --------------

        out = self.bn_before_gru(out)
        out = self.lrelu_keras(out)
        out = out.permute(0, 2, 1)
        #(batch, filt, time) >> (batch, time, filt)

        out, _ = self.gru(out)

        out = out[:,-1,:]
        code = self.gru_fc1(out)

        code_norm = code.norm(p=2,dim=1, keepdim=True) / 10.
        code = torch.div(code, code_norm)

        print("shape GRU : ",code.shape)

        out = self.gru_fc2(code)

        print("shape output : ", out.shape)

        return out

In [45]:
#criterion = nn.NLLLoss()
rawnet = RawNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rawnet.parameters(), lr=learning_rate)

In [46]:
#TRAINING
def train_model():
    rawnet.train()
    for i, (data, labels) in enumerate(train_loader):   # Loadl a batch of audio with its (data, class)
        optimizer.zero_grad()                             # Intialize the hidden weight to all zeros
        outputs = rawnet(data)                             # Forward pass: compute the output class given a audio
        loss = criterion(outputs, labels)                 # Compute the loss: difference between the output class and the pre-given label
        loss.backward()                                   # Backward pass: compute the weight
        optimizer.step()                                  # Optimizer: update the weights of hidden nodes

In [47]:
# cross validation 
def cross_validation(model, eval_loader):
    model.eval()
    for i, (data, labels) in enumerate(eval_loader):   # Loadl a batch of audio with its (data, class)
        outputs = model(data)                             # Forward pass: compute the output class given a audio
        loss = criterion(outputs, labels)                 # Compute the loss: difference between the output class and the pre-given label
        print("loss", i, loss)
        return loss

In [48]:
meilleur_loss = 99.
for epoch in range(num_epochs):
    train_model()
    loss = cross_validation(rawnet, valid_loader)
    if loss < meilleur_loss:
        meilleur_loss = loss
        print("meilleur loss:", meilleur_loss)
        #if (i+1) % num_epochs == 0:                              # Logging
            #print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                    #%(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.item()))

shape conv 1 :  torch.Size([100, 128, 19683])
shape resblock 1 :  torch.Size([100, 128, 2187])
shape resblock 2 :  torch.Size([100, 256, 27])
shape GRU :  torch.Size([100, 1024])
shape output :  torch.Size([100, 4])
shape conv 1 :  torch.Size([100, 128, 19683])
shape resblock 1 :  torch.Size([100, 128, 2187])
shape resblock 2 :  torch.Size([100, 256, 27])
shape GRU :  torch.Size([100, 1024])
shape output :  torch.Size([100, 4])
shape conv 1 :  torch.Size([100, 128, 19683])
shape resblock 1 :  torch.Size([100, 128, 2187])
shape resblock 2 :  torch.Size([100, 256, 27])
shape GRU :  torch.Size([100, 1024])
shape output :  torch.Size([100, 4])
shape conv 1 :  torch.Size([100, 128, 19683])
shape resblock 1 :  torch.Size([100, 128, 2187])
shape resblock 2 :  torch.Size([100, 256, 27])
shape GRU :  torch.Size([100, 1024])
shape output :  torch.Size([100, 4])
shape conv 1 :  torch.Size([100, 128, 19683])
shape resblock 1 :  torch.Size([100, 128, 2187])
shape resblock 2 :  torch.Size([100, 256,

KeyboardInterrupt: 