# Neural Networks

In [113]:
import time
import os
import pprint
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tqdm.notebook as tq
import utils

## Creation of training / validation / test sets

In [2]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')
print("audio directory: ",AUDIO_DIR)
print("Loading tracks.csv...")
tracks = utils.load('data/fma_metadata/tracks.csv')
#get only the small subset of the dataset
small = tracks[tracks['set', 'subset'] <= 'small']
print("small dataset shape:",small.shape)

audio directory:  ./data/fma_small/
Loading tracks.csv...
small dataset shape: (8000, 52)


In [3]:
#since each track is split in clips, create a lable for each clip
def expand_labels(labels_set,expand_factor):
    # Repeat each element expand_factor times using np.repeat()
    expanded_array = np.repeat(labels_set, expand_factor)
    #convert to numpy array
    expanded_array = np.array(expanded_array)
    # return the expanded array variable
    return expanded_array

#transforms a vector of strings into a vector of integer following a dictionary
def to_integer_vector(labels_vector, unique_genres):
    #create dictionary genre-integer {'Rock':1, 'Pop', 2, ...}
    dictionary = {}
    i=1
    for genre in unique_genres:
        dictionary[genre]=i
        i+=1
    print("dictionary created:",dictionary,"\n")
    output_vector = []
    #using the dictionary, transform the label vector ['Rock','Pop',...] into a vector [1,2, ...]
    for elem in labels_vector:
        output_vector.append(dictionary[elem])        
    return np.array(output_vector) # convert to numpy array and return the vector

#transforms a vector of integers into a vector of one hot encoded label of dim (len(labels) x num_classes)
def to_one_hot(labels_vector, num_classes):
    output = []
    print("Creating one hot encoded lables...")
    #cicle through all elements to be encoded
    for elem in labels_vector:
        one_hot = [0]*num_classes # [0, 0, ... 0]
        one_hot[elem-1] = 1
        output.append(one_hot)
    return np.array(output) 

#the function which call all the other funcitons to generate the final one hot encoded label vector
def generate_one_hot_encoded_labels(data, n_clips_per_track):
    print("Number of clips per track used:",n_clips_per_track)
    labels = expand_labels(data, n_clips_per_track) #expand the labels by the number of clips per track amount
    unique_genres = np.unique(labels)
    print("There are {} unique genres:".format(len(unique_genres)),unique_genres) #get a vector of integers labels [1,2,4,...]
    integer_label_vector = to_integer_vector(labels,unique_genres) 
    labels_one_hot = to_one_hot(integer_label_vector, len(unique_genres)) #get one hot encoded vector of labels
    return labels_one_hot

In [4]:
#retrieve labels for each subset
tr_labels = small.loc[small[('set', 'split')] == 'training', ('track', 'genre_top')].values
vl_labels = small.loc[small[('set', 'split')] == 'validation', ('track', 'genre_top')].values
ts_labels = small.loc[small[('set', 'split')] == 'test', ('track', 'genre_top')].values

print('{} training tracks, {} validation tracks, {} testing tracks\n'.format(*map(len, [tr_labels, vl_labels, ts_labels])))


n_clips_per_track = 10 #number of clips per track

tr_labels_one_hot = generate_one_hot_encoded_labels(tr_labels, n_clips_per_track)
vl_labels_one_hot = generate_one_hot_encoded_labels(vl_labels, n_clips_per_track)
ts_labels_one_hot = generate_one_hot_encoded_labels(ts_labels, n_clips_per_track)

print('Training labels vector: {},\nValidation labels vector: {},\nTest labels vector: {}'.format(tr_labels_one_hot.shape, vl_labels_one_hot.shape, ts_labels_one_hot.shape))


6400 training tracks, 800 validation tracks, 800 testing tracks

Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International': 6, 'Pop': 7, 'Rock': 8} 

Creating one hot encoded lables...
Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International': 6, 'Pop': 7, 'Rock': 8} 

Creating one hot encoded lables...
Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International

# Network Architecture Definition (nnet1)

In [105]:
class NNet1(nn.Module):
    def __init__(self):
        super(NNet1, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 128, kernel_size=(4, 513))
        self.relu = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv2 = nn.Conv2d(128, 128, kernel_size=(4, 1))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(4, 1))
        self.avgpool = nn.AvgPool2d(kernel_size=(26, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(26, 1))
        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(512, 300)
        self.dense2 = nn.Linear(300, 150)
        self.dense3 = nn.Linear(150, 8)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x_avg = self.avgpool(x)
        x_max = self.maxpool(x)
        x = torch.cat([x_avg, x_max], dim=1)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.relu(x)
        x = self.dense3(x)
        x = self.softmax(x)
        return x

# Dataset Class

In [106]:
# batch_size=16 #number of samples taken at a time for the train

# Define the custom dataset class
class MyDataset(Dataset):
    def __init__(self, file_list, labels):
        self.file_list = file_list
        self.labels=labels

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        '''
        #returns a batch of items (wrong, old code)
        
        file_paths = self.file_list[idx:idx+batch_size]  # Get batch of file paths
        labels = self.labels[idx:idx+batch_size]  # Get batch of labels

        stft_vectors = []
        for file_path in file_paths:
            stft_vector = np.load(file_path).transpose(1, 0)
            print("DATASET: Loaded stft vector with shape",stft_vector.shape)
            stft_vectors.append(stft_vector)

        stft_vectors = torch.stack([torch.from_numpy(vec) for vec in stft_vectors])  # Convert to tensor
        print("DATASET: batch of samples with shape: ",stft_vectors.shape)
        labels = torch.tensor(labels)
        print("DATASET: batch of labels with shape: ",labels.shape)
        
        '''
        # returns an item (not a batch of items)
        file_path = self.file_list[idx]
        label = torch.tensor(self.labels[idx])

        stft_vector = torch.tensor(np.load(file_path).transpose(1,0)) #load from file and transpose
        
        
        return stft_vector, label



# Train function

In [117]:
def train(model, dataset, batch_size, num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    num_batches = len(data_loader)
    
    # Initialize the progress bar
    progress_bar = tq.tqdm(total=num_epochs * num_batches, unit="batch")

    for epoch in range(num_epochs):
        running_loss = 0.0
        
        # Initialize the progress bar
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")
        start_time = time.time()

        
        for batch_idx, batch in enumerate(data_loader):
            inputs,labels = batch[0],batch[1]
            #print("inputs shape:",inputs.shape,", content: ",inputs)
            #print("labels shape:",labels.shape,", content: ",labels)
            inputs = inputs.unsqueeze(1)
            
            # Extract the inputs and targets
            optimizer.zero_grad()
            
            outputs = model(inputs)
            
            #print("\noutputs type:",type(outputs),"content:",outputs)
            #print("\nlabels type:",type(labels),"content:",labels)

            loss = criterion(outputs, labels.float()) #labels need to be a vector of float, not Long
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
            # Update the progress bar description and calculate bps
            progress_bar.set_postfix({"Loss": running_loss / (batch_idx + 1)})
            bps = (batch_idx + 1) / (time.time() - start_time)
            progress_bar.set_postfix({"Loss": running_loss / (batch_idx + 1)})

            # Update the progress bar
            progress_bar.update(1)
        progress_bar.close()



        average_loss = running_loss / len(data_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

In [108]:
folder_path="data/fma_small_stft/train/"
file_list = os.listdir(folder_path)
file_paths = [os.path.join(folder_path, file_name) for file_name in file_list]
print(file_paths)
print("\nNumber of training samples:",len(file_paths),"\n")

dataset = MyDataset(file_paths, tr_labels_one_hot)

['data/fma_small_stft/train/58161_8.npy', 'data/fma_small_stft/train/91181_2.npy', 'data/fma_small_stft/train/53726_5.npy', 'data/fma_small_stft/train/123980_6.npy', 'data/fma_small_stft/train/108230_9.npy', 'data/fma_small_stft/train/114040_0.npy', 'data/fma_small_stft/train/149416_1.npy', 'data/fma_small_stft/train/54297_3.npy', 'data/fma_small_stft/train/144938_2.npy', 'data/fma_small_stft/train/91205_4.npy', 'data/fma_small_stft/train/10673_1.npy', 'data/fma_small_stft/train/48861_6.npy', 'data/fma_small_stft/train/57164_2.npy', 'data/fma_small_stft/train/140872_0.npy', 'data/fma_small_stft/train/57371_7.npy', 'data/fma_small_stft/train/149100_3.npy', 'data/fma_small_stft/train/30056_4.npy', 'data/fma_small_stft/train/62195_2.npy', 'data/fma_small_stft/train/87107_0.npy', 'data/fma_small_stft/train/109670_5.npy', 'data/fma_small_stft/train/104008_2.npy', 'data/fma_small_stft/train/54063_3.npy', 'data/fma_small_stft/train/61013_6.npy', 'data/fma_small_stft/train/119725_0.npy', 'data

In [125]:
model = NNet1()
train(model, dataset, batch_size=32, num_epochs=8, learning_rate=0.001)

  0%|          | 0/16000 [00:00<?, ?batch/s]

KeyboardInterrupt: 

# Network Architecture Definition (nnet2)