# Neural Networks

In [1]:
import time
import os
import pprint
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tqdm.notebook as tq
import utils

## Creation of training / validation / test sets

In [2]:
# Location of the raw audio, read from the environment (None when AUDIO_DIR is unset).
AUDIO_DIR = os.environ.get('AUDIO_DIR')
print("audio directory: ",AUDIO_DIR)

print("Loading tracks.csv...")
tracks = utils.load('data/fma_metadata/tracks.csv')

# Keep only the rows whose ('set', 'subset') value compares <= 'small'.
in_small_subset = tracks['set', 'subset'] <= 'small'
small = tracks[in_small_subset]
print("small dataset shape:",small.shape)

audio directory:  ./data/fma_small/
Loading tracks.csv...
small dataset shape: (8000, 52)


In [3]:
#since each track is split into clips, create a label for each clip
def expand_labels(labels_set, expand_factor):
    """Repeat every label ``expand_factor`` times, keeping the original order.

    Args:
        labels_set: Sequence of per-track labels.
        expand_factor: Number of consecutive copies to emit for each label.

    Returns:
        A NumPy array in which each input label appears ``expand_factor``
        times in a row.
    """
    repeated = np.repeat(labels_set, expand_factor)
    return np.array(repeated)

#transforms a vector of strings into a vector of integer following a dictionary
def to_integer_vector(labels_vector, unique_genres):
    """Map string labels to 1-based integer codes.

    The code assigned to a genre is its 1-based position in ``unique_genres``
    (e.g. ``{'Pop': 1, 'Rock': 2}``). The mapping is printed for reference.

    Args:
        labels_vector: Iterable of genre names to encode.
        unique_genres: Iterable of the distinct genre names, in coding order.

    Returns:
        A NumPy array with one integer code per entry of ``labels_vector``.

    Raises:
        KeyError: If a label is not present in ``unique_genres``.
    """
    dictionary = {genre: code for code, genre in enumerate(unique_genres, start=1)}
    print("dictionary created:",dictionary,"\n")
    return np.array([dictionary[label] for label in labels_vector])

#transforms a vector of integers into a vector of one hot encoded label of dim (len(labels) x num_classes)
def to_one_hot(labels_vector, num_classes):
    """One-hot encode a vector of 1-based integer labels.

    Args:
        labels_vector: Iterable of integer class codes; code ``k`` sets
            position ``k - 1`` of its row.
        num_classes: Length of every one-hot row.

    Returns:
        A NumPy array with one row of length ``num_classes`` per label.
    """
    def encode(label):
        # Codes are 1-based, list positions are 0-based.
        row = [0] * num_classes
        row[label - 1] = 1
        return row

    print("Creating one hot encoded lables...")
    return np.array([encode(label) for label in labels_vector])

#the function which calls all the other functions to generate the final one hot encoded label vector
def generate_one_hot_encoded_labels(data, n_clips_per_track):
    """Build one-hot labels for every clip from per-track genre labels.

    Each track label is repeated ``n_clips_per_track`` times, the distinct
    genres found in the result define the class order, and every clip label
    is then integer-coded and one-hot encoded against that order.

    Args:
        data: Sequence of per-track genre labels.
        n_clips_per_track: Number of clips each track was split into.

    Returns:
        A NumPy array with one one-hot row per clip and one column per
        distinct genre found in ``data``.
    """
    print("Number of clips per track used:",n_clips_per_track)
    # One label per clip instead of one per track.
    clip_labels = expand_labels(data, n_clips_per_track)
    genres = np.unique(clip_labels)
    print("There are {} unique genres:".format(len(genres)),genres)
    # String labels -> integer codes [1, 2, 4, ...] -> one-hot rows.
    return to_one_hot(to_integer_vector(clip_labels, genres), len(genres))

In [4]:
#retrieve labels for each subset
# Per-track top-level genre for each predefined split of the small subset.
split_column = ('set', 'split')
genre_column = ('track', 'genre_top')
tr_labels = small.loc[small[split_column] == 'training', genre_column].values
vl_labels = small.loc[small[split_column] == 'validation', genre_column].values
ts_labels = small.loc[small[split_column] == 'test', genre_column].values

print('{} training tracks, {} validation tracks, {} testing tracks\n'.format(*map(len, [tr_labels, vl_labels, ts_labels])))


n_clips_per_track = 10 #number of clips per track

# generate_one_hot_encoded_labels() derives its genre -> column mapping from
# the unique labels of whatever split it is given. If one split were missing a
# genre, its one-hot columns would silently mean something different from the
# other splits, so fail loudly here instead.
split_genres = [np.unique(labels) for labels in (tr_labels, vl_labels, ts_labels)]
assert all(np.array_equal(split_genres[0], genres) for genres in split_genres[1:]), \
    "training/validation/test splits do not contain the same set of genres"

tr_labels_one_hot = generate_one_hot_encoded_labels(tr_labels, n_clips_per_track)
vl_labels_one_hot = generate_one_hot_encoded_labels(vl_labels, n_clips_per_track)
ts_labels_one_hot = generate_one_hot_encoded_labels(ts_labels, n_clips_per_track)

print('Training labels vector: {},\nValidation labels vector: {},\nTest labels vector: {}'.format(tr_labels_one_hot.shape, vl_labels_one_hot.shape, ts_labels_one_hot.shape))


6400 training tracks, 800 validation tracks, 800 testing tracks

Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International': 6, 'Pop': 7, 'Rock': 8} 

Creating one hot encoded lables...
Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International': 6, 'Pop': 7, 'Rock': 8} 

Creating one hot encoded lables...
Number of clips per track used: 10
There are 8 unique genres: ['Electronic' 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental'
 'International' 'Pop' 'Rock']
dictionary created: {'Electronic': 1, 'Experimental': 2, 'Folk': 3, 'Hip-Hop': 4, 'Instrumental': 5, 'International

# Network Architecture Definition (nnet1)

In [5]:
class NNet1(nn.Module):
    """Convolutional genre classifier for single-channel spectrogram clips.

    Expects input of shape ``(batch, 1, 128, 513)``. The first convolution
    spans all 513 columns, so the width collapses to 1 and the remaining
    layers operate along the 128-row axis only. The two pooling layers with a
    kernel of 26 rows reduce the final 26-row feature map to a single row,
    which is why the input must have 128 rows.

    ``forward`` returns raw, unnormalised class scores (logits) of shape
    ``(batch, 8)``. ``nn.CrossEntropyLoss`` applies log-softmax itself, so
    feeding it already-softmaxed values would normalise twice and flatten the
    gradients. Use ``self.softmax(logits)`` when probabilities are needed.
    """

    def __init__(self):
        super(NNet1, self).__init__()

        self.conv1 = nn.Conv2d(1, 128, kernel_size=(4, 513), stride=(1,513))
        self.relu = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 1))
        # The width is already 1 after conv1, so the width stride of 513 used
        # by conv2/conv3 has no effect on the output shape.
        self.conv2 = nn.Conv2d(128, 128, kernel_size=(4, 1), stride=(1,513))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(4, 1), stride=(1,513))
        self.avgpool = nn.AvgPool2d(kernel_size=(26, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(26, 1))
        self.flatten = nn.Flatten()
        # 512 = 256 average-pooled + 256 max-pooled channels.
        self.dense1 = nn.Linear(512, 300)
        self.dense2 = nn.Linear(300, 150)
        self.dense3 = nn.Linear(150, 8)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities at inference time.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``(batch, 8)`` for input ``x``."""
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        # Summarise the remaining rows with both their mean and their maximum.
        x_avg = self.avgpool(x)
        x_max = self.maxpool(x)
        x = torch.cat([x_avg, x_max], dim=1)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.relu(x)
        # Logits only: the loss function is responsible for the softmax.
        return self.dense3(x)

# Dataset Class

In [6]:
# batch_size=16 #number of samples taken at a time for the train

# Define the custom dataset class
class MyDataset(Dataset):
    """Dataset pairing ``.npy`` feature files with labels by position.

    Item ``i`` is the array stored in ``file_list[i]`` together with
    ``labels[i]``, both converted to tensors. Files are read lazily, one per
    ``__getitem__`` call.
    """

    def __init__(self, file_list, labels):
        """Store the file paths and the labels aligned with them."""
        self.file_list, self.labels = file_list, labels

    def __len__(self):
        """Number of samples, i.e. number of feature files."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for sample ``idx``."""
        path = self.file_list[idx]
        target = torch.tensor(self.labels[idx])
        # Features are read from disk on every access.
        features = torch.tensor(np.load(path))
        return features, target



# Train function

In [7]:
def train(model, dataset, batch_size, num_epochs, learning_rate):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Every batch of inputs gets a channel axis inserted at dim 1 before being
    passed to the model. Labels are expected as one-hot rows; a prediction
    counts as correct when the label row holds a 1 at the predicted index.
    A progress bar is shown per epoch and the mean loss and accuracy are
    printed at the end of each epoch.

    Args:
        model: ``nn.Module`` to optimise; it is moved to CUDA when available.
        dataset: ``torch.utils.data.Dataset`` yielding ``(inputs, labels)``.
        batch_size: Number of samples per optimisation step.
        num_epochs: Number of passes over the dataset.
        learning_rate: Learning rate handed to Adam.

    Raises:
        ValueError: If ``dataset`` is not a ``Dataset`` instance.
    """
    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()  # make sure train-time behaviour is enabled

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_total = 0  # correctly classified samples so far this epoch
        seen_total = 0     # samples processed so far this epoch

        progress_bar = tq.tqdm(total=num_batches, unit="batch")
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

        for inputs, labels in data_loader:
            # The batch must live on the same device as the model.
            inputs = inputs.unsqueeze(1).to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            # One-hot targets must be float for CrossEntropyLoss.
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Correct when the one-hot label has a 1 at the predicted index.
            predicted = outputs.argmax(dim=1, keepdim=True)
            correct = labels.gather(1, predicted).eq(1).sum().item()

            # Use the real batch length: the last batch may be shorter than
            # batch_size, which would otherwise understate its accuracy.
            n_samples = labels.size(0)
            correct_total += correct
            seen_total += n_samples

            accuracy = 100 * correct / n_samples
            average_accuracy = 100 * correct_total / seen_total
            progress_bar.set_postfix({"Batch accuracy": accuracy, "Average accuracy": average_accuracy})
            progress_bar.update(1)

        # Guard against an empty dataset (no batches, no samples).
        average_loss = running_loss / max(num_batches, 1)
        average_accuracy = 100 * correct_total / max(seen_total, 1)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}. Accuracy: {average_accuracy}")
        progress_bar.close()


In [14]:
def verify_dataset(filepaths, expected_shape=(128, 513), progress_every=1000):
    """Check that every ``.npy`` file holds an array of ``expected_shape``.

    Args:
        filepaths: Sequence of paths to ``.npy`` files.
        expected_shape: Shape every stored array must have.
        progress_every: Print a progress line every this many files (and once
            at the end). Pass 0 or ``None`` to silence progress output.

    Returns:
        A list of ``(index, path, shape)`` tuples, one per file whose array
        shape differs from ``expected_shape``; ``index`` is the file's
        position in ``filepaths``. The list is empty when all files match.
    """
    expected_shape = tuple(expected_shape)
    total = len(filepaths)
    mismatches = []

    for index, path in enumerate(filepaths):
        shape = np.load(path).shape
        if shape != expected_shape:
            # Keep only lightweight metadata, not the array itself.
            mismatches.append((index, path, shape))
            print("error: {} has shape {}".format(path, shape))

        checked = index + 1
        if progress_every and (checked % progress_every == 0 or checked == total):
            print("checked {}/{} files".format(checked, total))

    print("{} errors found in files: {}".format(len(mismatches), [path for _, path, _ in mismatches]))
    for index, _, shape in mismatches:
        print("index: {}, shape: {}".format(index, shape))
    return mismatches
        

In [None]:
#folder_path="data/fma_small_stft/train/"
folder_path="data/fma_small_stft_transposed/train/"
# os.listdir() returns entries in arbitrary order, but MyDataset pairs files
# with labels by position, so sort to make the pairing reproducible.
# NOTE(review): this assumes sorted file names follow the track order of
# `small`, with the clips of one track adjacent — confirm against the naming
# scheme used when the clips were written.
file_list = sorted(os.listdir(folder_path))
file_paths = [os.path.join(folder_path, file_name) for file_name in file_list]
# Show a short sample instead of flooding the output with every path.
print(file_paths[:5])
print("\nNumber of training samples:",len(file_paths),"\n")
if len(file_paths) != len(tr_labels_one_hot):
    # A count mismatch means files and labels cannot line up one-to-one.
    print("WARNING: {} feature files but {} labels; files and labels are misaligned".format(
        len(file_paths), len(tr_labels_one_hot)))
verify_dataset(file_paths)
train_dataset = MyDataset(file_paths, tr_labels_one_hot)

In [None]:
# Training hyper-parameters for the NNet1 run.
# NOTE(review): learning_rate=0.1 is far above Adam's default of 1e-3 — confirm this is intended.
nnet1_hparams = dict(batch_size=256, num_epochs=10, learning_rate=0.1)

model = NNet1()
# Print the layer-by-layer summary for one (1, 128, 513) input.
summary(model, (1, 128, 513))
train(model, train_dataset, **nnet1_hparams)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 128, 125, 1]         262,784
              ReLU-2          [-1, 128, 125, 1]               0
         MaxPool2d-3           [-1, 128, 62, 1]               0
            Conv2d-4           [-1, 128, 59, 1]          65,664
              ReLU-5           [-1, 128, 59, 1]               0
         MaxPool2d-6           [-1, 128, 29, 1]               0
            Conv2d-7           [-1, 256, 26, 1]         131,328
         AvgPool2d-8            [-1, 256, 1, 1]               0
         MaxPool2d-9            [-1, 256, 1, 1]               0
          Flatten-10                  [-1, 512]               0
           Linear-11                  [-1, 300]         153,900
             ReLU-12                  [-1, 300]               0
           Linear-13                  [-1, 150]          45,150
             ReLU-14                  [

  0%|          | 0/250 [00:00<?, ?batch/s]

# Network Architecture Definition (nnet2)

In [20]:
# Define the custom model class
class NNet2(nn.Module):
    """Convolutional genre classifier with a residual connection.

    Expects input of shape ``(batch, 1, rows, 513)``. ``conv1`` is unpadded
    and spans all 513 columns, collapsing the width to 1. ``conv2`` and
    ``conv3`` use "same" padding so their output can be added to the output
    of ``conv1``. The feature map is then reduced by global average and
    global max pooling, giving 2 * 256 features for the dense layers.

    ``forward`` returns raw class scores (logits) of shape ``(batch, 8)``;
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so no softmax is
    applied here. Use ``torch.softmax(logits, dim=1)`` for probabilities.
    """

    def __init__(self):
        super(NNet2, self).__init__()
        # No padding here: with "same" padding the width would stay at 513
        # and the flattened feature size would not match dense1.
        self.conv1 = nn.Conv2d(1, 256, kernel_size=(4, 513))
        self.conv2 = nn.Conv2d(256, 256, kernel_size=(4, 1),padding="same")
        self.conv3 = nn.Conv2d(256, 256, kernel_size=(4, 1),padding="same")
        # 512 = 256 average-pooled + 256 max-pooled channels.
        self.dense1 = nn.Linear(2 * 256, 300)
        self.dense2 = nn.Linear(300, 150)
        self.dense3 = nn.Linear(150, 8)

    def forward(self, x):
        """Return class logits of shape ``(batch, 8)`` for input ``x``."""
        x = torch.relu(self.conv1(x))
        residual = x  # output of the first convolution, reused below
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        # Out-of-place add: an in-place `+=` would overwrite the ReLU output
        # that autograd needs for the backward pass.
        x = x + residual

        # Global average and max pooling over the spatial axes -> (batch, 512).
        x = torch.cat((x.mean(dim=(2, 3)), x.amax(dim=(2, 3))), dim=1)
        x = torch.relu(self.dense1(x))
        x = torch.relu(self.dense2(x))
        # Logits only: the loss function is responsible for the softmax.
        return self.dense3(x)

In [None]:
# Training hyper-parameters for the NNet2 run.
# NOTE(review): learning_rate=0.1 is far above Adam's default of 1e-3 — confirm this is intended.
nnet2_hparams = dict(batch_size=256, num_epochs=8, learning_rate=0.1)

model2 = NNet2()
# Print the layer-by-layer summary for one (1, 128, 513) input.
summary(model2, (1, 128, 513))
train(model2, train_dataset, **nnet2_hparams)

# Recurrent Neural Network

In [10]:
class AudioGenreClassifier(nn.Module):
    """Bidirectional-LSTM genre classifier with attention over time steps.

    Expects input of shape ``(batch, time, input_size)``. Every time step is
    first projected to ``hidden_size`` features, the sequence is run through
    a stacked bidirectional LSTM, the LSTM outputs are combined into a single
    vector with softmax attention weights over the time axis, and a linear
    layer maps that vector to ``num_classes`` logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the per-step projection and of each LSTM
            direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(AudioGenreClassifier, self).__init__()
        self.hidden_size = hidden_size

        # Time-distributed layer: nn.Linear acts on the last axis, i.e. on
        # every time step independently.
        self.time_distributed = nn.Linear(input_size, hidden_size)

        # Bidirectional LSTM layers. The input size must equal the projection
        # width (it was hard-coded to 256), and batch_first=True is required
        # because the data is (batch, time, features); without it the LSTM
        # would treat the batch axis as time and mix samples together.
        self.bilstm = nn.LSTM(hidden_size, hidden_size, num_layers = num_layers,
                              bidirectional=True, batch_first=True)

        # Attention mechanism: one score per time step.
        self.attention = nn.Linear(hidden_size * 2, 1)

        # Pooling layer
        self.pooling = nn.AdaptiveAvgPool1d(1)

        # Output layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        # Time-distributed layer
        x = self.time_distributed(x)

        # Bidirectional LSTM layers -> (batch, time, 2 * hidden_size)
        output, _ = self.bilstm(x)

        # Attention mechanism: normalise the scores over time (dim 1) and
        # take the weighted sum of the LSTM outputs.
        attention_weights = torch.softmax(self.attention(output), dim=1)
        attended_output = torch.sum(attention_weights * output, dim=1)

        # Pooling layer: averages over a trailing axis of length 1, so the
        # values are unchanged.
        pooled_output = self.pooling(attended_output.unsqueeze(2))

        # Reshape and pass through the output layer
        pooled_output = pooled_output.squeeze(2)
        output = self.fc(pooled_output)

        return output

In [11]:
def trainRNN(model, dataset, batch_size, num_epochs, learning_rate):
    """Train a sequence ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Batches are passed to the model as loaded, without adding a channel axis.
    Labels are expected as one-hot rows; a prediction counts as correct when
    the label row holds a 1 at the predicted index. A progress bar is shown
    per epoch and the mean loss and accuracy are printed at the end of each
    epoch.

    Args:
        model: ``nn.Module`` to optimise; it is moved to CUDA when available.
        dataset: ``torch.utils.data.Dataset`` yielding ``(inputs, labels)``.
        batch_size: Number of samples per optimisation step.
        num_epochs: Number of passes over the dataset.
        learning_rate: Learning rate handed to Adam.

    Raises:
        ValueError: If ``dataset`` is not a ``Dataset`` instance.
    """
    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device: ",device)
    model.to(device)
    model.train()  # make sure train-time behaviour is enabled

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_total = 0  # correctly classified samples so far this epoch
        seen_total = 0     # samples processed so far this epoch

        # One bar per epoch: a single shared bar would be closed at the end
        # of the first epoch and could not track the following ones.
        progress_bar = tq.tqdm(total=num_batches, unit="batch")
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

        for inputs, labels in data_loader:
            # The batch must live on the same device as the model.
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            # One-hot targets must be float for CrossEntropyLoss.
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Correct when the one-hot label has a 1 at the predicted index.
            predicted = outputs.argmax(dim=1, keepdim=True)
            correct = labels.gather(1, predicted).eq(1).sum().item()

            # Use the real batch length: the last batch may be shorter than
            # batch_size, which would otherwise understate its accuracy.
            n_samples = labels.size(0)
            correct_total += correct
            seen_total += n_samples

            accuracy = 100 * correct / n_samples
            average_accuracy = 100 * correct_total / seen_total
            progress_bar.set_postfix({"Batch accuracy": accuracy, "Average accuracy": average_accuracy})
            progress_bar.update(1)

        # Guard against an empty dataset (no batches, no samples).
        average_loss = running_loss / max(num_batches, 1)
        average_accuracy = 100 * correct_total / max(seen_total, 1)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}. Accuracy: {average_accuracy}")
        progress_bar.close()

In [13]:
# Architecture settings for the recurrent classifier: 513 features per time
# step, 8 output classes.
rnn_config = dict(input_size=513, hidden_size=256, num_layers=5, num_classes=8)
modelRNN = AudioGenreClassifier(**rnn_config)
# NOTE(review): learning_rate=0.1 is far above Adam's default of 1e-3 — confirm this is intended.
trainRNN(modelRNN, train_dataset, batch_size=32, num_epochs=8, learning_rate=0.1)

Using device:  cpu


  0%|          | 0/16000 [00:00<?, ?batch/s]

KeyboardInterrupt: 