# Neural Networks

In [71]:
import time
import os
import pprint
import torch
import torch.nn as nn
from sklearn.metrics import ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tqdm.notebook as tq
import utils
from pydub import AudioSegment
from tkinter import Tcl # file sorting by name

# Load STFT Dataset

### Dictionary creation for the classes

We want a dictionary indicating a numbeer for each genre:

{0: 'Hip-Hop', 1: 'Pop', 2: 'Folk', 3: 'Rock', 4: 'Experimental', 5: 'International', 6: 'Electronic', 7: 'Instrumental'}

### Creation of the labels vector

In [72]:
def create_single_dataset(folder_path, tracks_dataframe, genre_dictionary):    
    labels = []
   
    _, file_list = get_sorted_file_paths(folder_path)
    
    for i,file in enumerate(file_list):
        #print("considering file:",file, "({}/{})".format(i,len(file_list)))
        track_id_clip_id = file.split('.')[0]
        track_id = track_id_clip_id.split('_')[0]
        #print("track id with clip: {}, track id: {}".format(track_id_clip_id, track_id))
        genre = tracks_dataframe.loc[int(track_id)]
        #print("genre from dataframe: ", genre)
        label = genre_dictionary[genre]
        #print("label from dictionary:",label)
        labels.append(label)
    print("labels length: {}".format(len(labels)))
    return labels
    

#create the train,validation and test vectors using the files in the train/validation/test folders
def create_dataset_splitted(folder_path):
    train_folder = os.path.join(folder_path,'train') # concatenate train folder to path
    validation_folder = os.path.join(folder_path,'validation') # concatenate train folder to path
    test_folder = os.path.join(folder_path,'test') # concatenate train folder to path
    
    print("train_folder:",train_folder)
    print("validation_folder:",validation_folder)
    print("test_folder:",test_folder,"\n")
    
    AUDIO_DIR = os.environ.get('AUDIO_DIR')
    print("audio directory: ",AUDIO_DIR)
    print("Loading tracks.csv...")
    tracks = utils.load('data/fma_metadata/tracks.csv')
    
    #get only the small subset of the dataset
    small = tracks[tracks['set', 'subset'] <= 'small']
    print("small dataset shape:",small.shape)    

    small_training = small.loc[small[('set', 'split')] == 'training']['track']
    small_validation = small.loc[small[('set', 'split')] == 'validation']['track']
    small_test = small.loc[small[('set', 'split')] == 'test']['track']

    print("Track.csv: {} training samples, {} validation samples, {} test samples\n".format(len(small_training), len(small_validation), len(small_test)))

    small_training_top_genres = small_training['genre_top']
    small_validation_top_genres = small_validation['genre_top']
    small_test_top_genres = small_test['genre_top']
    
    #create dictionary of genre classes:
    unique_genres = small_training_top_genres.unique()
    unique_genres = np.array(unique_genres)
    print("there are {} unique genres".format(len(unique_genres)))
    genre_dictionary = {}
    for i,genre in enumerate(unique_genres):
        genre_dictionary[genre] = i
    print("Dictionary of genres created:",genre_dictionary)
    
    
    Y_train = create_single_dataset(train_folder, small_training_top_genres, genre_dictionary)
    Y_validation = create_single_dataset(validation_folder, small_validation_top_genres, genre_dictionary)
    Y_test = create_single_dataset(test_folder, small_test_top_genres, genre_dictionary)
    
    return Y_train, Y_validation, Y_test
 
def get_sorted_file_paths(folder_path):
    file_list = os.listdir(folder_path)
    #sort the dataset files in alphabetical order (important to associate correct labels created using track_id in track.csv)
    file_list = Tcl().call('lsort', '-dict', file_list) # sort file by name: 2_0,2_1, ... 2_9,3_0, ... 400_0,400_1, ...
    file_paths = [os.path.join(folder_path, file_name) for file_name in file_list] #join filename with folder path
    #print("There are {} in the folder: {}".format(len(file_list),file_list))
    return file_paths, file_list
    
    
folder_path="data/fma_small_stft_transposed_22050_overlapped"
Y_train, Y_validation, Y_test = create_dataset_splitted(folder_path)

train_folder: data/fma_small_stft_transposed_22050_overlapped/train
validation_folder: data/fma_small_stft_transposed_22050_overlapped/validation
test_folder: data/fma_small_stft_transposed_22050_overlapped/test 

audio directory:  ./data/fma_small/
Loading tracks.csv...
small dataset shape: (8000, 52)
Track.csv: 6400 training samples, 800 validation samples, 800 test samples

there are 8 unique genres
Dictionary of genres created: {'Hip-Hop': 0, 'Pop': 1, 'Folk': 2, 'Rock': 3, 'Experimental': 4, 'International': 5, 'Electronic': 6, 'Instrumental': 7}
labels length: 127940
labels length: 16000
labels length: 16000


# Dataset Class

Class to load the STFT from files. Each file has a (128,513) matrix containing the STFT of a 3 seconds audio clip.

In [73]:
# Define the custom class for accessing our dataset
class MyDataset(Dataset):
    def __init__(self, file_list, labels, transform=None, verbose = False):
        self.file_list = file_list
        self.labels=labels
        self.transform = transform
        self.verbose = verbose

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # returns a training sample and its label
        file_path = self.file_list[idx]
        label = torch.tensor(self.labels[idx])
        stft_vector = torch.tensor(np.load(file_path)) #load from file
        
        # Normalize your data here
        if self.transform:
            if(self.verbose==True):
                print("TRANSFORM: applying transform to tensor shape:",stft_vector.shape,"content:",stft_vector)
            stft_vector = self.transform(torch.unsqueeze(stft_vector, dim=0)) #unsqueeze needed for the torchvision normalize method
            if(self.verbose==True):
                print("TRANSFORM: after transform shape:",stft_vector.shape,"content:",stft_vector)
            stft_vector = torch.squeeze(stft_vector, dim=0)
            if(self.verbose==True):
                print("TRANSFORM: after squeeze shape:",stft_vector.shape,"content:",stft_vector)

        
        return stft_vector, label

In [74]:
folder_path="data/fma_small_stft_transposed_22050_overlapped"

train_folder = os.path.join(folder_path,'train') # concatenate train folder to path
validation_folder = os.path.join(folder_path,'validation') # concatenate train folder to path
test_folder = os.path.join(folder_path,'test') # concatenate train folder to path

train_file_paths, _ = get_sorted_file_paths(train_folder)
train_dataset = MyDataset(train_file_paths, Y_train)
print("len of train dataset: ",len(train_dataset))

validation_file_paths, _ = get_sorted_file_paths(validation_folder)
validation_dataset = MyDataset(validation_file_paths, Y_validation)
print("len of validation dataset: ",len(validation_file_paths))

test_file_paths, _ = get_sorted_file_paths(test_folder)
test_dataset = MyDataset(test_file_paths, Y_test)
print("len of test dataset: ",len(test_dataset))

len of train dataset:  127940
len of validation dataset:  16000
len of test dataset:  16000


# Data normalization
We will use Z-Score to normalize the training, validation and test set by calculating the mean and the std deviation on the training set.

In [75]:
save_filename = './data/fma_small_stft_transposed_22050_overlapped/train_mean'
std_save_filename = './data/fma_small_stft_transposed_22050_overlapped/train_std_deviation'

## Calculation of mean and standard deviation (Long)

In [10]:
batch_size=1
total_n_batches = len(train_dataset)/batch_size
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
current_sum=0

#iter all the training set by batches and calculate the sum of all the sample values (513*128 values for each sample)
for batch_idx, batch in enumerate(train_loader):
    #print("batch",batch_idx,"/",total_n_batches,"current_sum:",current_sum)
    inputs = batch[0]
    labels = batch[1]
    #print("inputs: shape:",inputs.shape,"content:",inputs)
    #print("labels:",labels)
    for sample in inputs:
        #print("sample: shape",sample.shape,"content:",sample)
        current_sum += torch.sum(sample)
        #print("current_sum:",current_sum)
print("final sum:",current_sum)



final sum: tensor(8.9296e+09)


In [11]:
mean = current_sum/(len(train_dataset)*513*128) #divide the sum for the total number of values considerated
print("mean of training set:",mean)

print("Saving the mean in file:",save_filename) 
np.save(save_filename,mean)

mean of training set: tensor(1.0629)
Saving the mean in file: ./data/fma_small_stft_transposed_22050_overlapped/train_mean


In [13]:
#now let's calculate the standard deviation (squared root of the variance)

batch_size=1
total_n_batches = len(train_dataset)/batch_size
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
current_sum_of_squares = 0

for batch_idx, batch in enumerate(train_loader):
    #print("batch",batch_idx,"/",total_n_batches,"current_sum_of_squares:",current_sum_of_squares)
    inputs = batch[0]
    labels = batch[1]
    #print("inputs: shape:",inputs.shape,"content:",inputs)
    #print("labels:",labels)
    for sample in inputs:
        #print("sample shape",sample.shape)
        for row in sample:
            #print("row shape:",row.shape)
            for elem in row:
                #print("elem: shape",elem.shape,"content:",elem)
                difference = elem - mean
                difference_squared = difference**2
                current_sum_of_squares += difference_squared
                #print("current_sum:",current_sum)
print("final sum of squares:",current_sum_of_squares)


final sum of squares: tensor(7.8299e+10)


In [None]:
import math

variance = current_sum_of_squares/((len(train_dataset) * 513 * 128)-1)
std_deviation = math.sqrt(variance)

print("variance:",variance)
print("std_deviation:",std_deviation)

print("Saving the std_deviation in file:",std_save_filename) 
np.save(std_save_filename,std_deviation)

## Load calculated mean and std deviation from file

In [76]:
loaded_mean = np.load(save_filename+'.npy')
print("loaded mean:",loaded_mean)

loaded_std = np.load(std_save_filename+'.npy')
print("loaded std:",loaded_std)

loaded mean: 1.0629134
loaded std: 3.0528938510507846


## Create the normalized dataset

In [77]:
from torchvision import transforms

batch_size = 1

transform = transforms.Compose([
    transforms.Normalize(mean= loaded_mean, std= loaded_std)
])

train_dataset = MyDataset(train_file_paths, Y_train,  transform = transform)
validation_dataset = MyDataset(validation_file_paths, Y_validation,  transform = transform)
test_dataset = MyDataset(test_file_paths, Y_test,  transform = transform)

# Network Architecture Definition (nnet1)

In [78]:
class NNet1(nn.Module):
    def __init__(self):
        super(NNet1, self).__init__()

        self.conv1 = nn.Conv2d(1, 128, kernel_size=(4,513))
        self.bn1 = nn.BatchNorm2d(128)
        self.relu = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv2 = nn.Conv2d(128, 128, kernel_size=(4, 1))
        self.bn2 = nn.BatchNorm2d(128)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(4, 1))
        self.bn3 = nn.BatchNorm2d(256)
        self.avgpool = nn.AvgPool2d(kernel_size=(26, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(26, 1))
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.2)
        self.dense1 = nn.Linear(512, 300)
        self.bn4 = nn.BatchNorm1d(300)
        self.dense2 = nn.Linear(300,150)
        self.bn5 = nn.BatchNorm1d(150)
        self.dense3 = nn.Linear(150, 8)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.conv1(x.float())
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x_avg = self.avgpool(x)
        x_max = self.maxpool(x)
        x = torch.cat([x_avg, x_max], dim=1)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dropout(x)
        x = self.bn4(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.dropout(x)
        x = self.bn5(x)
        x = self.relu(x)
        x = self.dense3(x)
        x = self.softmax(x)
        return x

In [51]:
class NNet2(nn.Module):
    def __init__(self):
        super(NNet2, self).__init__()
        self.drop=nn.Dropout(0.2)
        # STFT spectrogram input: (batch_size, 1, 128, 513)
        self.conv1 = nn.Conv2d(1, 128, kernel_size=(4, 513))
        self.batch1=nn.BatchNorm2d(128)
        self.conv2 = nn.Conv2d(128,128, kernel_size=(4, 1),padding=1)
        self.batch2=nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128, 128, kernel_size=(4, 1),padding=2)
        self.batch3=nn.BatchNorm2d(128)
        self.maxpool = nn.MaxPool2d(kernel_size=(128,1))
        self.avgpool = nn.AvgPool2d(kernel_size=(128,1))
        self.fc1 = nn.Linear(250,128)
        self.bn1=nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128,64)
        self.bn2=nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 8)  # 8 classes for genre predictions
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.conv1(x.float())
        x=self.batch1(x)
        x = torch.relu(x)
        y=x
        x = self.conv2(x)
        x=self.batch2(x)
        x = torch.relu(x)
        x = self.conv3(x)
        x=self.batch3(x)
        x = torch.relu(x)
        # Sum between the first and third conv layers
        x = x[:, :, :, 0] + y[:, :, :, 0]
        
        x = torch.relu(x)
        x_max = self.maxpool(x)
        x_avg = self.avgpool(x)
        x = torch.cat([x_avg, x_max], dim=1)
        # Flatten the tensor for fully connected layers
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        x=self.drop(x)
        x=self.bn1(x)
        x = torch.relu(x)
        x = self.fc2(x)        
        x=self.drop(x)
        x=self.bn2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        
        return x

# Hyperparameters

In [52]:
BATCH_SIZE=32
EPOCHS=10
LEARNING_RATE=0.0001

learning_rate_list = [0.0001,0.00001]
batch_size_list = [128,256,512]
reg_list=[0.001,0.0001,0.00001]

# Train function

In [53]:
def test(model, validation_dataset, Y_validation,RGB=False):
    # Stop parameters learning
    model.eval()

    validation_loader = torch.utils.data.DataLoader(validation_dataset)

    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    total_loss = 0
    #confusion_matrix = np.zeros((8, 8), dtype=int)

    with torch.no_grad():
        for sample, label in validation_loader:
            
            if RGB==False:
                sample = sample.unsqueeze(1)

            # Predict label
            output = model(sample)
            
            # Compute loss
            loss = criterion(output, label)
            total_loss += loss.item()

            max_index = torch.argmax(output).item()  # The index with maximum probability

            #confusion_matrix[label][max_index] += 1

            correct += (max_index == label)

    #cm = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
    #cm.plot()
    #print(confusion_matrix)
    accuracy = 100 * correct / len(Y_validation)
    average_loss = total_loss / len(Y_validation)

    model.train()
    return accuracy, average_loss

In [54]:
def train(model, dataset, batch_size, num_epochs, learning_rate, verbose = False, RGB=False, reg=1e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    val_loss_list=[]
    val_acc_list=[]
    train_loss_list=[]
    train_acc_list=[]
    counted_labels=[0,0,0,0,0,0,0,0]
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg)
    criterion = nn.CrossEntropyLoss()

    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)
    
    
    for epoch in range(num_epochs):
        running_loss = 0.0 
        running_accuracy = 0.0
        #initialize correctly predicted samples
        
        # Initialize the progress bar
        progress_bar = tq.tqdm(total=num_batches, unit="batch")
    
        # Initialize the progress bar description
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")
        start_time = time.time()
        
        for batch_idx, batch in enumerate(data_loader):
            
            correct = 0 # reset train accuracy each batch
            
            inputs,labels = batch[0],batch[1]
            if(verbose == True):
                print("\ninputs shape:",inputs.size(),", dtype:",inputs.dtype," content: ",inputs)
                print("min value:",torch.min(inputs))
                print("max value:",torch.max(inputs))
                print("\nlabels shape:",labels.size(),",dtype:",labels.dtype,", content: ",labels)
            if(RGB==False):
                inputs = inputs.unsqueeze(1) #add a dimension if input is to be considered just grayscale
                #if input is RGB, there are already 3 channels
            
            # Extract the inputs and targets
            optimizer.zero_grad()
            outputs = model(inputs)
            
            if(verbose == True):
                print("\noutputs size:",outputs.size(),"content:",outputs)
                print("List of labels until now:",counted_labels)

            loss = criterion(outputs, labels) #labels need to be a vector of class indexes (0-7) of dim (batch_size)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
            #calculate train accuracy
            for index, output in enumerate(outputs):
                max_index = torch.argmax(output).item() #the index with maximum probability
                counted_labels[labels[index].item()]+=1
                if(labels[index].item() == max_index):
                    correct += 1
            
                if(verbose==True):
                    print("considering output at index {}:".format(index,output))
                    print("max output index = {}",max_index)
                    if(labels[index].item() == max_index):
                        print("correct! in fact labels[index] = {}, max_index = {}".format(labels[index].item(),max_index))
                    else:
                        print("NOT correct! in fact labels[index] = {}, max_index = {}".format(labels[index].item(),max_index))

            
            accuracy = 100 * correct / batch_size
            running_accuracy += accuracy #epoch running_accuracy
            
            # Update the progress bar description and calculate bps
            #progress_bar.set_postfix({"Loss": running_loss / (batch_idx + 1)})
            average_accuracy = running_accuracy / (batch_idx + 1)
            average_loss = running_loss / (batch_idx + 1)
            progress_bar.set_postfix({"avg_loss": average_loss, "acc": accuracy, "avg_acc": average_accuracy})

            # Update the progress bar
            progress_bar.update(1)
            # Evaluate the model on the validation dataset
        
        #calculate train loss and accuracy
        average_loss = running_loss / len(data_loader)
        average_accuracy = running_accuracy / len(data_loader)
        train_loss_list.append(average_loss)
        train_acc_list.append(average_accuracy)
        
        #calculate validation loss and accuracy
        val_acc, val_loss = test(model, validation_dataset, Y_validation,RGB=RGB)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)
        
        
        print(f"Epoch [{epoch+1}/{num_epochs}],Train Loss: {average_loss:.4f}. Train Accuracy: {average_accuracy} Val Loss: {val_loss} Val Accuracy: {val_acc}")
        progress_bar.close()
    return train_loss_list, train_acc_list, val_loss_list, val_acc_list

# Network Architecture Definition (nnet1 + BN)

In [55]:
class NNet1_Small(nn.Module):
    def __init__(self):
        super(NNet1_Small, self).__init__()

        self.conv1 = nn.Conv2d(1, 64, kernel_size=(2, 513))
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()

        self.maxpool1 = nn.MaxPool2d(kernel_size=(4, 1))
        self.conv2 = nn.Conv2d(64,128, kernel_size=(2, 1))
        self.bn2 = nn.BatchNorm2d(128)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(4, 1))
        self.conv3 = nn.Conv2d(128,64, kernel_size=(4, 1))
        self.bn3 = nn.BatchNorm2d(64)
        self.avgpool = nn.AvgPool2d(kernel_size=(2, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(2, 1))
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.3)
        self.dense1 = nn.Linear(256, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.dense2 = nn.Linear(128,64)
        self.bn5 = nn.BatchNorm1d(64)
        self.dense3 = nn.Linear(64, 8)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.conv1(x.float())
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x_avg = self.avgpool(x)
        x_max = self.maxpool(x)
        x = torch.cat([x_avg, x_max], dim=1)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dropout(x)
        x = self.bn4(x)
        x = self.relu(x)
        #x = self.dense2(x)
        #x = self.dropout(x)
        #x = self.bn5(x)
        #x = self.relu(x)
        x = self.dense3(x)
        x = self.softmax(x)
        return x

In [14]:
model_NNet1 = NNet1()
summary(model_NNet1, (1, 128, 513))

RuntimeError: Calculated padded input size per channel: (128 x 513). Kernel size: (513 x 4). Kernel size can't be greater than actual input size

In [None]:
#TODO: Add batch_size optimization loop
save_directory="./results/NNet1_Small/"

lr_list= [ 0.0001]
r_list=[0.0001]

for i in lr_list:
    for j in r_list:
        if(j!=1e-5 and j!=1e-6):
            filename=save_directory+"lr_"+"0"+str(i).split(".")[1]+"_reg_"+"0"+str(j).split(".")[1]
        elif(j==1e-5):
            filename=save_directory+"lr_"+"0"+str(i).split(".")[1]+"_reg_"+"00001"
        else:
            filename=save_directory+"lr_"+"0"+str(i).split(".")[1]+"_reg_"+"000001"
        print(filename)
        model = NNet1_Small()
        train_loss_list, train_acc_list, val_loss_list, val_acc_list =train(model, train_dataset, batch_size=128, num_epochs=10, learning_rate=i, reg=j)
        print("Trained with learning rate=",i," and with regularization term=",j)
        print("Loss:",val_loss_list)
        print("Accuracy:",val_acc_list)
        save_values=[val_loss_list,val_acc_list]
        np.savetxt(filename,save_values)

## Raw audio dataset

In [64]:
class MyDatasetRaw(Dataset):
    def __init__(self, file_list, labels, transform=None, verbose=False):
        self.file_list = file_list
        self.labels=labels
        self.transform = transform
        self.verbose=verbose

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = torch.tensor(self.labels[idx])
        raw_vector = np.load(file_path).astype(np.int16) # Ensure int16 data type
        if(self.verbose==True):
            print("raw vector shape:",raw_vector.shape)
        raw_vector = torch.tensor(raw_vector)
        
        # Normalize your data here
        if self.transform:
            
            #convert to float64 tensor
            raw_vector = raw_vector.double()
            if(self.verbose==True):
                print("TRANSFORM: applying transform to tensor shape:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.unsqueeze(raw_vector, dim=0)
            #print("TRANSFORM: after first unsqueeze:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.unsqueeze(raw_vector, dim=0) #unsqueeze two times (needed for torchvision normalize method)
            #print("TRANSFORM: after second unsqueeze:",raw_vector.shape,"content:",raw_vector)
            raw_vector = self.transform(raw_vector) #normalize the sample
            if(self.verbose==True):
                print("TRANSFORM: after transform shape:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.squeeze(raw_vector, dim=0)
            raw_vector = torch.squeeze(raw_vector, dim=0)
            if(self.verbose==True):
                print("TRANSFORM: after double squeeze shape:",raw_vector.shape,"content:",raw_vector)
        
        return raw_vector, label        

# Normalization of raw audio

We calculate mean and std deviation

In [65]:
mean_save_filename_raw = './data/fma_small_raw_array_22050_overlapped/train_mean'
std_save_filename_raw = './data/fma_small_raw_array_22050_overlapped/train_std_deviation'

# Calculation of mean raw (Long)

In [None]:
train_dataset = MyDatasetRaw(file_paths_train, Y_train)
batch_size=1
total_n_batches = len(train_dataset)/batch_size
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
current_sum=0

#iter all the training set by batches and calculate the sum of all the sample values (513*128 values for each sample)
for batch_idx, batch in enumerate(train_loader):
    if(batch_idx%1000==0):
        print("batch",batch_idx,"/",total_n_batches,"(",round((batch_idx/len(train_dataset)*100)),"%), current_sum:",current_sum)
    
    inputs = batch[0]
    labels = batch[1]
    #print("inputs: shape:",inputs.shape,"content:",inputs)
    #print("labels:",labels)
    for sample in inputs:
        #print("sample: shape",sample.shape,"content:",sample)
        current_sum += torch.sum(sample)
       
        #print("type of current_sum:",current_sum.dtype)
        #print("current_sum:",current_sum)
print("final sum:",current_sum)

In [None]:
print("current_sum",current_sum)
mean_raw = current_sum/(len(train_dataset)*66150) #divide the sum for the total number of values considerated
print("mean of training set:",mean_raw)

print("Saving the mean in file:",mean_save_filename_raw) 
np.save(mean_save_filename_raw,mean_raw)

# Calculation of std deviation raw (Long)

In [None]:
#now let's calculate the standard deviation (squared root of the variance)

batch_size=1
total_n_batches = len(train_dataset)/batch_size
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
current_sum_of_squares = 0

for batch_idx, batch in enumerate(train_loader):
    if(batch_idx%1000==0):
        print("batch",batch_idx,"/",total_n_batches,round((batch_idx/len(train_dataset)*100)),"%, current_sum_of_squares:",current_sum_of_squares)
    inputs = batch[0]
    labels = batch[1]
    #print("inputs: shape:",inputs.shape,"content:\n",inputs)
    #print("labels:",labels)
    for elem in inputs:
        elem = elem.double() #convert to float64 for precise calculations
        #print("elem: shape",elem.shape,"content:\n",elem)
        difference = elem - mean_raw
        #print("difference: shape:",difference.shape,"content:\n", difference)
        difference_squared = difference**2
        #print("difference_squared: shape:",difference_squared.shape,"content:\n", difference_squared)
        current_sum_of_squares += torch.sum(difference_squared)
        #print("current_sum_of_squares:",current_sum_of_squares)
print("final sum of squares:",current_sum_of_squares)

In [None]:
import math

variance_raw = current_sum_of_squares/((len(train_dataset)*66150)-1)
std_deviation_raw = math.sqrt(variance)

print("variance raw:",variance)
print("std_deviation_raw:",std_deviation_raw)

print("Saving the std_deviation_raw in file:",std_save_filename_raw) 
np.save(std_save_filename_raw,std_deviation_raw)

# Load mean and std from file (fast)

In [66]:
loaded_mean_raw = np.load(mean_save_filename_raw+'.npy')
print("loaded mean:",loaded_mean_raw)

loaded_std_raw = np.load(std_save_filename_raw+'.npy')
print("loaded std:",loaded_std_raw)

loaded mean: -16.984083
loaded std: 1032.9510216986293


## Create dataset for raw audio

The labels are the same as the STFT dataset (in the same order).

In [67]:
folder_path="data/fma_small_raw_array_22050_overlapped"

train_folder = os.path.join(folder_path,'train') # concatenate train folder to path
validation_folder = os.path.join(folder_path,'validation') # concatenate train folder to path
test_folder = os.path.join(folder_path,'test') # concatenate train folder to path

train_file_paths, _ = get_sorted_file_paths(train_folder)
train_dataset = MyDatasetRaw(train_file_paths, Y_train)
print("len of train dataset: ",len(train_dataset))

validation_file_paths, _ = get_sorted_file_paths(validation_folder)
validation_dataset = MyDatasetRaw(validation_file_paths, Y_validation)
print("len of validation dataset: ",len(validation_file_paths))

test_file_paths, _ = get_sorted_file_paths(test_folder)
test_dataset = MyDatasetRaw(test_file_paths, Y_test)
print("len of test dataset: ",len(test_dataset))

len of train dataset:  127940
len of validation dataset:  16000
len of test dataset:  16000


# Neural Network Architecture for raw audio

We implemented a lightweight CNN to classify the samples with their raw audio.

In [68]:
class NNet_Raw(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super(NNet_Raw, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=16)
        self.conv2 = nn.Conv1d(32, 8, kernel_size=16)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=32)
        self.maxpool1 = nn.MaxPool1d(kernel_size=8)
        self.batchnorm1 = nn.BatchNorm1d(32)
        self.batchnorm2 = nn.BatchNorm1d(8)
        self.batchnorm3 = nn.BatchNorm1d(24)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(248, 24)
        self.fc3 = nn.Linear(24, 8)
        self.softmax=nn.Softmax(dim=1)
        
    def forward(self, x):
        x=self.maxpool1(x.float())
        x = self.conv1(x)
        x = self.relu(x)
        x = self.batchnorm1(x)
        x=self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.batchnorm2(x)     
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [22]:
MyModel=NNet_Raw()
summary(MyModel, [1,66150])
print(MyModel)

TypeError: rand() argument after * must be an iterable, not int

In [58]:
train_loss_list, train_acc_list, val_loss_list, val_acc_list =train(MyModel, train_dataset, batch_size=128, num_epochs=10, learning_rate=0.01, verbose=False)

NameError: name 'MyModel' is not defined

# Results Loader

In [41]:
def find_best_value(values,loss):
    if loss==True:
        max_v=100
    else:
        max_v=0
    index=-1
    for i in range(len(values)):
        if loss==True:
            if values[i]<max_v:
                max_v=values[i]
                index=i+1
        else:
            if values[i]>max_v:
                max_v=values[i]
                index=i+1
    return max_v,index
            

In [61]:
res_directory="./results/"
models=os.listdir(res_directory) 
for i in models:
    best_loss=100
    best_acc=0
    best_loss_ep=0
    best_acc_ep=0
    model_folder=res_directory+i
    trials=os.listdir(model_folder)
    if(len(trials)==0):
        continue
    best_trials=[(best_loss,best_loss_ep),(best_acc,best_acc_ep)]
    best_trials_names=['','']
    for j in trials:
        res=np.loadtxt(model_folder+"/"+j)
        loss,epoch_l=find_best_value(res[0],True)
        accuracy,epoch_a=find_best_value(res[1],False)
        if(loss<best_loss):
            best_trials_names[0]=j
            best_loss=loss
            best_loss_ep=epoch_l
            best_trials[0]=(best_loss,best_loss_ep)
        if(accuracy>best_acc):
            best_trials_names[1]=j
            best_acc=accuracy
            best_acc_ep=epoch_a
            best_trials[1]=(best_acc,best_acc_ep)
            
    print("Model:",i)
    print("Best Model for accuracy:",best_trials_names[1])
    print("Value:",best_trials[1][0],"Epoch:",best_trials[1][1])
    print("Best Model for loss:",best_trials_names[0])
    print("Value:",best_trials[0][0],"Epoch:",best_trials[0][1])
    print("")
    
print(models)

Model: NNet_Raw
Best Model for accuracy: lr_0001_reg_00001
Value: 41.58124923706055 Epoch: 9
Best Model for loss: lr_0001_reg_00001
Value: 1.8506742839217185 Epoch: 9

Model: NNet1
Best Model for accuracy: lr_00001_reg_0001
Value: 50.9375 Epoch: 2
Best Model for loss: lr_0001_reg_000001
Value: 1.7595391278117896 Epoch: 4

Model: NNet1_Small
Best Model for accuracy: lr_00001_reg_00001
Value: 50.51250076293945 Epoch: 9
Best Model for loss: lr_0001_reg_000001
Value: 1.7661717012748122 Epoch: 10

Model: Ensemble
Best Model for accuracy: lr_00001_reg_000001
Value: 51.54375076293945 Epoch: 4
Best Model for loss: lr_00001_reg_000001
Value: 1.7636730014681816 Epoch: 4

Model: NNet2
Best Model for accuracy: lr_00001_reg_000001
Value: 38.79375076293945 Epoch: 9
Best Model for loss: lr_00001_reg_000001
Value: 1.8803115216344595 Epoch: 9

Model: Ensemble_No_Weights
Best Model for accuracy: lr_0001_reg_000001
Value: 49.03125 Epoch: 7
Best Model for loss: lr_0001_reg_000001
Value: 1.854172814361751 

# NN on echonest features 

As you can see below, unfortunately we cannot perform a train using echonest features (danceability, tempo, acousticness, ...) because there are only ►1300 tracks with echonest features which are not NaN.

In [26]:
print("opening csvs...")

tracks = utils.load('data/fma_metadata/tracks.csv')

echonest = utils.load('data/fma_metadata/echonest.csv')
echonest = echonest['echonest', 'audio_features']
small = tracks[tracks['set', 'subset'] <= 'small']

print("small dataset shape:",small.shape)
print("echonest csv shape (only audio features):",echonest.shape)

opening csvs...
small dataset shape: (8000, 52)
echonest csv shape (only audio features): (13129, 8)


In [27]:
#select small dataset from echonest csv

track_ids = small.index.values.tolist()
print("Track ids shape:",len(track_ids),"content:",track_ids[:10],"...")
echonest_small = pd.DataFrame(echonest,index=track_ids)

ipd.display(echonest_small)

Track ids shape: 8000 content: [2, 5, 10, 140, 141, 148, 182, 190, 193, 194] ...


Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661
5,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661
10,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590
140,0.376312,0.734079,0.265685,0.669581,0.085995,0.039068,107.952,0.609991
141,0.963657,0.435933,0.075632,0.345493,0.105686,0.026658,33.477,0.163950
...,...,...,...,...,...,...,...,...
154308,,,,,,,,
154309,,,,,,,,
154413,,,,,,,,
154414,,,,,,,,


In [28]:
X_train_echonest = echonest_small.to_numpy(dtype=np.float16)
print("X_train_echonest: shape:",X_train_echonest.shape)

nan_rows = np.argwhere(np.isnan(X_train_echonest).all(axis=1))

print("There are",len(nan_rows),"rows containing NaN only as data")
#nan_rows = nan_rows.squeeze()
#for i in nan_rows:
#    print("row:",i,":",X_train_echonest[i])

X_train_echonest: shape: (8000, 8)
There are 6706 rows containing NaN only as data


# ResNet
We try to use transfer learning using a ResNet18 by torch

In [34]:
import torchvision


class MyModel_ResNet18(nn.Module):
    def __init__(self, pretrained=True):
        super(MyModel_ResNet18, self).__init__()
        pretrained_model = torchvision.models.resnet18(pretrained=pretrained)
        layers=list(pretrained_model.children())[:-3]
        for param in pretrained_model.parameters():
            param.requires_grad = False
        #print(layers)
        self.features = nn.Sequential(*layers)
        
        
        
        self.pool= nn.AdaptiveAvgPool2d(1)
        self.flatten=nn.Flatten() 
        self.dropout=nn.Dropout(0.2)
        self.fc1 = nn.Linear(256, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3= nn.Linear(64, 8)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        
        x = self.features(x.float())
        x = self.pool(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x
    





In [35]:
    
model = MyModel_ResNet18()
summary(model, (3,128,513))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 64, 64, 257]           9,408
       BatchNorm2d-2          [-1, 64, 64, 257]             128
              ReLU-3          [-1, 64, 64, 257]               0
         MaxPool2d-4          [-1, 64, 32, 129]               0
            Conv2d-5          [-1, 64, 32, 129]          36,864
       BatchNorm2d-6          [-1, 64, 32, 129]             128
              ReLU-7          [-1, 64, 32, 129]               0
            Conv2d-8          [-1, 64, 32, 129]          36,864
       BatchNorm2d-9          [-1, 64, 32, 129]             128
             ReLU-10          [-1, 64, 32, 129]               0
       BasicBlock-11          [-1, 64, 32, 129]               0
           Conv2d-12          [-1, 64, 32, 129]          36,864
      BatchNorm2d-13          [-1, 64, 32, 129]             128
             ReLU-14          [-1, 64, 

## RGB Dataset
Resnet18 expects RGB input images, so our dataset need to be converted from a one to a three-channel. We'll do dat by copying three times the same image in the three channels.

In [36]:
# Define the custom class for accessing our dataset
class MyDatasetRGB(Dataset):
    def __init__(self, file_list, labels, transform=None, verbose = False):
        self.file_list = file_list
        self.labels=labels
        self.transform = transform
        self.verbose = verbose

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # returns a training sample and its label
        file_path = self.file_list[idx]
        label = torch.tensor(self.labels[idx])
        stft_vector = torch.tensor(np.load(file_path)) #load from file
        
        # Normalize your data here
        if self.transform:
            if(self.verbose==True):
                print("TRANSFORM: applying transform to tensor shape:",stft_vector.shape,"content:",stft_vector)
            stft_vector = self.transform(torch.unsqueeze(stft_vector, dim=0)) #unsqueeze needed for the torchvision normalize method
            if(self.verbose==True):
                print("TRANSFORM: after transform shape:",stft_vector.shape,"content:",stft_vector)
            stft_vector = torch.squeeze(stft_vector, dim=0)
            if(self.verbose==True):
                print("TRANSFORM: after squeeze shape:",stft_vector.shape,"content:",stft_vector)
                
        #do ResNet18 normalization:
        #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        #copy the channel 3 times (need to unsqueeze to create a new dimension first)
        #print("DATASET*  sample shape is:",stft_vector.shape,"content:",stft_vector)
        stft_vector = stft_vector.unsqueeze(0).repeat(3,1,1)
        stft_vector = stft_vector.to(torch.float32) #float32 needed for ResNet18 model (downcast from float64)
        #print("DATASET* sample shape after repeat is:",stft_vector.shape,"content:",stft_vector)
        #print("stft_vector dtype:",stft_vector.dtype)

        
        return stft_vector, label

In [37]:

transform_ResNet18 = transforms.Compose([
    transforms.Normalize(mean= [loaded_mean,loaded_mean,loaded_mean], std=[loaded_std,loaded_std,loaded_std]) #our values
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) #ResNet18 specific values
])

train_dataset = MyDatasetRGB(train_file_paths, Y_train,  transform = transform)
validation_dataset = MyDatasetRGB(validation_file_paths, Y_validation,  transform = transform)
test_dataset = MyDatasetRGB(test_file_paths, Y_test,  transform = transform)

train_data_loader = DataLoader(train_dataset, batch_size = 10, shuffle=True)



'\nfor batch_idx, batch  in enumerate(train_data_loader):\n    print ("batch index:",batch_idx)\n    inputs = batch[0]\n    labels = batch[1]\n    \n    for idx, sample in enumerate(inputs):\n        label = labels[idx]\n        print("inputs: shape:",inputs.shape)\n        print("sample: shape:",sample.shape)\n'

In [38]:
#train the model

#todo: convert input tensor from float64 to double

#todo normalize again using resnet18 suggested mean and std
#TODO: FIX TEST FUNCTION USING RGB (maybe already works)
#A member not well specificated didn't read the line above this one and completely wasted 6 golden hours getting 
#the error just 5 minutes after he went out from his house.


train_loss_list, train_acc_list, val_loss_list, val_acc_list =train(model, train_dataset, batch_size=128, num_epochs=5, learning_rate=0.001,verbose=False, RGB=True)

  0%|          | 0/1000 [00:00<?, ?batch/s]

Epoch [1/5],Train Loss: 1.7927. Train Accuracy: 48.46484375 Val Loss: 1.7948411099910737 Val Accuracy: tensor([47.2500])


  0%|          | 0/1000 [00:00<?, ?batch/s]

Epoch [2/5],Train Loss: 1.7411. Train Accuracy: 53.0828125 Val Loss: 1.783997517593205 Val Accuracy: tensor([48.2250])


  0%|          | 0/1000 [00:00<?, ?batch/s]

Epoch [3/5],Train Loss: 1.7272. Train Accuracy: 54.36015625 Val Loss: 1.7856278691217304 Val Accuracy: tensor([48.1125])


  0%|          | 0/1000 [00:00<?, ?batch/s]

Epoch [4/5],Train Loss: 1.7195. Train Accuracy: 55.05859375 Val Loss: 1.791919251486659 Val Accuracy: tensor([47.6188])


  0%|          | 0/1000 [00:00<?, ?batch/s]

Epoch [5/5],Train Loss: 1.7120. Train Accuracy: 55.890625 Val Loss: 1.7769502711519598 Val Accuracy: tensor([49.1063])


In [69]:
#TODO: Add batch_size optimization loop
save_directory="./best_models/"

learning_rate_list = [0.001]
reg_list=[0.0001]
epochs=9

for i in learning_rate_list:
    for j in reg_list:
        filename=save_directory+"results/NNet_Raw"        
        print(filename)
        model = NNet_Raw()
        train_loss_list, train_acc_list, val_loss_list, val_acc_list =train(model, train_dataset, batch_size=64, num_epochs=epochs, learning_rate=i, reg=j)
        print("Trained with learning rate=",i," and with regularization term=",j)
        torch.save(model.state_dict(), save_directory+"best_models/NNet_Raw")

./best_models/results/NNet_Raw


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [1/9],Train Loss: 1.9081. Train Accuracy: 36.209375 Val Loss: 1.9033585867509246 Val Accuracy: tensor([36.6875])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [2/9],Train Loss: 1.8704. Train Accuracy: 39.68046875 Val Loss: 1.8821116178780795 Val Accuracy: tensor([38.4313])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [3/9],Train Loss: 1.8518. Train Accuracy: 41.653125 Val Loss: 1.8654355867728591 Val Accuracy: tensor([40.1188])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [4/9],Train Loss: 1.8400. Train Accuracy: 42.8296875 Val Loss: 1.8804640002697706 Val Accuracy: tensor([38.5812])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [5/9],Train Loss: 1.8357. Train Accuracy: 43.2171875 Val Loss: 1.8809217572957277 Val Accuracy: tensor([38.5687])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [6/9],Train Loss: 1.8293. Train Accuracy: 43.86484375 Val Loss: 1.845809885494411 Val Accuracy: tensor([42.0812])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [7/9],Train Loss: 1.8249. Train Accuracy: 44.26328125 Val Loss: 1.844606993086636 Val Accuracy: tensor([42.4313])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [8/9],Train Loss: 1.8219. Train Accuracy: 44.684375 Val Loss: 1.8478830751031636 Val Accuracy: tensor([41.6562])


  0%|          | 0/2000 [00:00<?, ?batch/s]

Epoch [9/9],Train Loss: 1.8189. Train Accuracy: 44.88984375 Val Loss: 1.8483450606167315 Val Accuracy: tensor([41.8750])
Trained with learning rate= 0.001  and with regularization term= 0.0001


In [79]:
model_path="./best_models/best_models/"
model_name="NNet1"

test_model=NNet1()
test_model.load_state_dict(torch.load(model_path+model_name), strict=False)

print(test(test_model,train_dataset, Y_train))



(tensor([62.6583]), 1.6450505791631773)
