In [113]:
import sys, os
import torch
import librosa
import numpy as np
import pandas as pd
from torch import Tensor
from scipy.io import wavfile
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torchaudio
import numpy as np
import scipy.io.wavfile
import torch.nn.functional as F


In [164]:
class RawNetDataset(Dataset):
    def __init__(self, transform=None, mode="train",files_dir=None, base_dir="",csv_file_dir="",nb_time=59049):
        self.base_dir = base_dir
        self.mode = mode
        self.transform = transform
        self.csv_file_dir = csv_file_dir
        self.files_dir = files_dir
        csv_file = pd.read_csv(csv_file_dir,sep="\t")
        self.nb_time = nb_time
        self.classes = {cls_name:i+1 for i, cls_name in enumerate(csv_file["VoxCeleb1 ID"].unique())}

    def __len__(self):
        return len(self.files_dir)
    
    def __getitem__(self, idx):
        filename = self.files_dir[idx]
        classe = filename.split('/')[1]
        X, sample_rate = torchaudio.load(self.base_dir + filename)
        #print(" shape(X): ",type(X))
        label = self.classes[classe]
        self._pre_emphasis(X)
        nb_time = X.shape[1]
        if nb_time > self.nb_time:
            start_idx = np.random.randint(low = 0,
                high = nb_time - self.nb_time)
            X = X[:, start_idx:start_idx+self.nb_time]
            #print("nb_time: ",nb_time )
            #print("self.nb_time: ",self.nb_time)
        elif nb_time < self.nb_time:
            nb_dup = int(self.nb_time / nb_time) + 1
            X = np.tile(X, (1, nb_dup))[:, :self.nb_time]
            #print("taille inférieure")
        else:
            X = X
            #print("taille égale")
        #print(" type(X): ",X.size())

        #print('------------------------------------------------')

        return X, label

    def _pre_emphasis(self, x):
        '''
        Pre-emphasis for single channel input
        '''
        return np.asarray(x[:,1:] - 0.97 * x[:, :-1], dtype=np.float32) 



In [162]:
base_dir = '../data/wav'
list_IDs = ['/id10009/7hpSiT9_gCE/00001.wav',
        '/id10009/aFttHpeaXaQ/00001.wav',
        '/id10009/AtavJVP4bCk/00001.wav',
        '/id10009/AtavJVP4bCk/00002.wav',
        '/id10009/AtavJVP4bCk/00003.wav',
        '/id10009/AtavJVP4bCk/00004.wav',
        '/id10009/AtavJVP4bCk/00005.wav',
        '/id10009/AtavJVP4bCk/00006.wav',
        '/id10009/AtavJVP4bCk/00007.wav',
        '/id10009/AtavJVP4bCk/00008.wav',
        ]
csv_file_dir = '../data/target/vox1_meta.csv'

In [158]:
def get_utt_list(src_dir):
    l_utt = []
    for r, ds, fs in os.walk(src_dir):
        r = r.replace('\\', '/')   
        base = '/'.join(r.split('/')[-2:])+'/'
        for f in fs:
            l_utt.append(base+f[:-4]+'.wav')
    return l_utt


In [159]:
get_utt_list(base_dir)

['data/wav/.DS_S.wav',
 'wav/id10009/.DS_S.wav',
 'id10009/HCGXIgKsozU/00001.wav',
 'id10009/AtavJVP4bCk/00009.wav',
 'id10009/AtavJVP4bCk/00008.wav',
 'id10009/AtavJVP4bCk/00006.wav',
 'id10009/AtavJVP4bCk/00007.wav',
 'id10009/AtavJVP4bCk/00005.wav',
 'id10009/AtavJVP4bCk/00004.wav',
 'id10009/AtavJVP4bCk/00001.wav',
 'id10009/AtavJVP4bCk/00003.wav',
 'id10009/AtavJVP4bCk/00002.wav',
 'id10009/x_HdUZuSusA/00001.wav',
 'id10009/x_HdUZuSusA/00002.wav',
 'id10009/seo9TTTEoE4/00001.wav',
 'id10009/seo9TTTEoE4/00002.wav',
 'id10009/7hpSiT9_gCE/00001.wav',
 'id10009/JrwqvWr5_VE/00009.wav',
 'id10009/JrwqvWr5_VE/00008.wav',
 'id10009/JrwqvWr5_VE/00006.wav',
 'id10009/JrwqvWr5_VE/00007.wav',
 'id10009/JrwqvWr5_VE/00005.wav',
 'id10009/JrwqvWr5_VE/00004.wav',
 'id10009/JrwqvWr5_VE/00001.wav',
 'id10009/JrwqvWr5_VE/00003.wav',
 'id10009/JrwqvWr5_VE/00002.wav',
 'id10009/FOFbkVlz-wQ/00001.wav',
 'id10009/FOFbkVlz-wQ/00002.wav',
 'id10009/JntZkGsH2Cc/00004.wav',
 'id10009/JntZkGsH2Cc/00001.wav',

In [154]:
train_dataset = RawNetDataset(files_dir=list_IDs,base_dir=base_dir,csv_file_dir=csv_file_dir)
batch_size = 128
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(train_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)
print(len(train_dataset))

10


In [117]:
test_dataset = RawNetDataset(files_dir=list_IDs,base_dir=base_dir,csv_file_dir=csv_file_dir)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

In [165]:
train_dataset = RawNetDataset(files_dir=list_IDs,base_dir=base_dir,csv_file_dir=csv_file_dir)
print(train_dataset.__getitem__(0))




id10009
(tensor([[ 0.0245,  0.0313,  0.0364,  ..., -0.0035, -0.0059, -0.0060]]), 9)


In [118]:
t=dataset.classes.shape[0]
print(t)

1251


In [119]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [120]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim) 

        # Non-linearity
        self.sigmoid = nn.Sigmoid()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  

    def forward(self, x):
        # Linear function  # LINEAR
        out = self.fc1(x)

        # Non-linearity  # NON-LINEAR
        out = self.sigmoid(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)
        return out

In [128]:
# 1. Instantiation du model
input_dim = 59049
#hidden_dim = 100
output_dim = 1211
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

In [129]:
# Instantiation de la loss function
criterion = nn.CrossEntropyLoss()

In [130]:
# optimizer
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 
print(model.parameters())
print(len(list(model.parameters())))


<generator object Module.parameters at 0x1c2b056660>
4


In [137]:
# 2. Train model
import numpy 
def train():
    iter = 0
    for epoch in range(num_epochs):
        for i, (data, labels) in enumerate(train_loader):
            # Load data with gradient accumulation capabilities
            data = data.requires_grad_()

            # Clear gradients w.r.t. parameters
            optimizer.zero_grad()

            # Forward pass to get output/logits
            outputs = model(data)
            print("-------------TYPE---------------")
            print(outputs.numpy())

            # Calculate Loss: softmax --> cross entropy loss
            loss = criterion(outputs, labels)

            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

            iter += 1

            if iter % 500 == 0:
                # Calculate Accuracy         
                correct = 0
                total = 0
                # Iterate through test dataset
                for data, labels in test_loader:
                    # Load data with gradient accumulation capabilities
                    data = data.requires_grad_()

                    # Forward pass only to get logits/output
                    outputs = model(data)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)

                    # Total number of labels
                    total += labels.size(0)

                    # Total correct predictions
                    correct += (predicted == labels).sum()

                accuracy = 100 * correct / total

                # Print Loss
                print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

In [139]:
train()

-------------TYPE---------------


RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.