In [2]:
import time
import os
import pprint
import torch
import torch.nn as nn
from sklearn.metrics import ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tqdm.notebook as tq
import utils
from pydub import AudioSegment
from tkinter import Tcl # file sorting by name

In [3]:
def create_single_dataset(folder_path, tracks_dataframe, genre_dictionary):
    """Return the integer class label of every clip file found in `folder_path`.

    Files are visited in the order produced by `get_sorted_file_paths`, so the
    returned list lines up index-by-index with the paths that function returns
    for the same folder.

    Args:
        folder_path: directory whose entries are named "<track_id>_<clip_id>.<ext>".
        tracks_dataframe: object indexed by integer track id via `.loc`; each
            lookup must yield a key of `genre_dictionary`
            (presumably a Series of top-genre names -- verify against caller).
        genre_dictionary: mapping from genre to integer class index.

    Returns:
        list with one class index per file.
    """
    def _label_for(file_name):
        # "<track_id>_<clip_id>.<ext>" -> "<track_id>_<clip_id>" -> "<track_id>"
        stem = file_name.split('.')[0]
        track_key = int(stem.split('_')[0])
        return genre_dictionary[tracks_dataframe.loc[track_key]]

    # Only the bare file names are needed here, not the joined paths.
    sorted_names = get_sorted_file_paths(folder_path)[1]
    labels = [_label_for(name) for name in sorted_names]

    print("labels length: {}".format(len(labels)))
    return labels
    

#create the train,validation and test vectors using the files in the train/validation/test folders
def create_dataset_splitted(folder_path):
    """Build the label lists for the train, validation and test clip folders.

    The genre of every track is read from 'data/fma_metadata/tracks.csv'
    (through `utils.load`), restricted to the 'small' subset, and mapped to an
    integer class index. The index of a genre is its position among the unique
    top genres of the training split.

    Args:
        folder_path: root directory containing the 'train', 'validation' and
            'test' sub-folders with the clip files.

    Returns:
        (Y_train, Y_validation, Y_test): three lists of class indices, each
        ordered like the sorted file list of the corresponding sub-folder.
    """
    # One sub-folder per split under the given root.
    train_dir, validation_dir, test_dir = (
        os.path.join(folder_path, split_name)
        for split_name in ('train', 'validation', 'test')
    )
    print("train_folder:",train_dir)
    print("validation_folder:",validation_dir)
    print("test_folder:",test_dir,"\n")

    # Only reported for information; the value is not used below.
    audio_dir = os.environ.get('AUDIO_DIR')
    print("audio directory: ",audio_dir)

    print("Loading tracks.csv...")
    metadata = utils.load('data/fma_metadata/tracks.csv')

    # Keep only the rows of the small subset.
    # NOTE(review): the `<=` comparison against a string presumably relies on
    # ('set', 'subset') being an ordered categorical -- confirm in utils.load.
    small_subset = metadata[metadata['set', 'subset'] <= 'small']
    print("small dataset shape:",small_subset.shape)

    split_column = small_subset[('set', 'split')]

    def _track_info(split_name):
        # 'track' columns of the rows assigned to the given split
        return small_subset.loc[split_column == split_name]['track']

    training_tracks = _track_info('training')
    validation_tracks = _track_info('validation')
    test_tracks = _track_info('test')

    print("Track.csv: {} training samples, {} validation samples, {} test samples\n".format(len(training_tracks), len(validation_tracks), len(test_tracks)))

    training_genres = training_tracks['genre_top']
    validation_genres = validation_tracks['genre_top']
    test_genres = test_tracks['genre_top']

    # Class index = position of the genre among the unique training genres.
    genre_names = np.array(training_genres.unique())
    print("there are {} unique genres".format(len(genre_names)))
    genre_dictionary = {genre: position for position, genre in enumerate(genre_names)}
    print("Dictionary of genres created:",genre_dictionary)

    # The same dictionary is shared by the three splits.
    Y_train = create_single_dataset(train_dir, training_genres, genre_dictionary)
    Y_validation = create_single_dataset(validation_dir, validation_genres, genre_dictionary)
    Y_test = create_single_dataset(test_dir, test_genres, genre_dictionary)

    return Y_train, Y_validation, Y_test
 
def get_sorted_file_paths(folder_path):
    """List a folder and return its entries in Tcl dictionary order.

    The ordering matters: labels are generated from the sorted file names, so
    every consumer must see the files in exactly this order
    (2_0, 2_1, ... 2_9, 3_0, ... 400_0, 400_1, ...).

    Args:
        folder_path: directory to list.

    Returns:
        (file_paths, file_list): the entries joined with `folder_path`, and the
        bare entry names as returned by the Tcl `lsort` call.
    """
    unsorted_names = os.listdir(folder_path)
    # `lsort -dict` compares embedded numbers by value, so "2_0" precedes "10_0".
    file_list = Tcl().call('lsort', '-dict', unsorted_names)

    file_paths = []
    for entry_name in file_list:
        file_paths.append(os.path.join(folder_path, entry_name))
    return file_paths, file_list
    
    
# Root folder of the STFT clips; create_dataset_splitted looks for the
# 'train', 'validation' and 'test' sub-folders inside it.
folder_path="data/fma_small_stft_transposed_22050_overlapped"
# Label lists for the three splits, ordered like the sorted file names of each sub-folder.
Y_train, Y_validation, Y_test = create_dataset_splitted(folder_path)

train_folder: data/fma_small_stft_transposed_22050_overlapped/train
validation_folder: data/fma_small_stft_transposed_22050_overlapped/validation
test_folder: data/fma_small_stft_transposed_22050_overlapped/test 

audio directory:  ./data/fma_small/
Loading tracks.csv...
small dataset shape: (8000, 52)
Track.csv: 6400 training samples, 800 validation samples, 800 test samples

there are 8 unique genres
Dictionary of genres created: {'Hip-Hop': 0, 'Pop': 1, 'Folk': 2, 'Rock': 3, 'Experimental': 4, 'International': 5, 'Electronic': 6, 'Instrumental': 7}
labels length: 127940
labels length: 16000
labels length: 16000


In [6]:
# Define the custom class for accessing our dataset
class MyDataset(Dataset):
    """Dataset that pairs two `.npy` feature files with one label per sample.

    Sample `idx` is made of the array stored at `stft_file_list[idx]`, the array
    stored at `raw_file_list[idx]` and `labels[idx]`; the three sequences are
    therefore expected to be aligned index-by-index.
    """

    def __init__(self, stft_file_list,raw_file_list, labels, transform=None, verbose = False):
        """Store the aligned path lists and labels.

        Args:
            stft_file_list: paths of the first feature file of each sample.
            raw_file_list: paths of the second feature file of each sample.
            labels: one label per sample.
            transform: stored on the instance; not applied by this class.
            verbose: stored on the instance; not read by this class.
        """
        self.stft_file_list, self.raw_file_list = stft_file_list, raw_file_list
        self.labels = labels
        self.transform, self.verbose = transform, verbose

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load sample `idx` from disk.

        Returns:
            ([stft_tensor, raw_tensor], label_tensor)
        """
        stft_path, raw_path = self.stft_file_list[idx], self.raw_file_list[idx]
        label = torch.tensor(self.labels[idx])
        # Both arrays are read from file on every access; nothing is cached.
        features = [torch.tensor(np.load(path)) for path in (stft_path, raw_path)]
        return features, label

In [9]:
# Roots of the two feature sets; each one holds train/validation/test sub-folders.
stft_folder_path = "data/fma_small_stft_transposed_22050_overlapped"
raw_folder_path = "data/fma_small_raw_array_22050_overlapped"

# Per-split sub-folders of the STFT root ...
stft_train_folder, stft_validation_folder, stft_test_folder = (
    os.path.join(stft_folder_path, split_name)
    for split_name in ('train', 'validation', 'test')
)
# ... and of the raw-array root.
raw_train_folder, raw_validation_folder, raw_test_folder = (
    os.path.join(raw_folder_path, split_name)
    for split_name in ('train', 'validation', 'test')
)

# Training split: both path lists come from the same sort, so index i of the
# STFT list, the raw list and Y_train is meant to describe the same clip.
stft_train_file_paths, _ = get_sorted_file_paths(stft_train_folder)
raw_train_file_paths, _ = get_sorted_file_paths(raw_train_folder)
train_dataset = MyDataset(stft_train_file_paths, raw_train_file_paths, Y_train)
print("len of train dataset: ",len(train_dataset))

# Validation split.
stft_validation_file_paths, _ = get_sorted_file_paths(stft_validation_folder)
raw_validation_file_paths, _ = get_sorted_file_paths(raw_validation_folder)
validation_dataset = MyDataset(stft_validation_file_paths, raw_validation_file_paths, Y_validation)
print("len of validation dataset: ",len(validation_dataset))

# Test split.
stft_test_file_paths, _ = get_sorted_file_paths(stft_test_folder)
raw_test_file_paths, _ = get_sorted_file_paths(raw_test_folder)
test_dataset = MyDataset(stft_test_file_paths, raw_test_file_paths, Y_test)
print("len of test dataset: ",len(test_dataset))

len of train dataset:  127940
len of validation dataset:  16000
len of test dataset:  16000


In [10]:
def test(model, validation_dataset, Y_validation):
    # Stop parameters learning
    model.eval()

    validation_loader = torch.utils.data.DataLoader(validation_dataset)

    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    total_loss = 0
    #confusion_matrix = np.zeros((8, 8), dtype=int)

    with torch.no_grad():
        for sample, label in validation_loader:
            
            sample = sample.unsqueeze(1)

            # Predict label
            output = model(sample)
            
            # Compute loss
            loss = criterion(output, label)
            total_loss += loss.item()

            max_index = torch.argmax(output).item()  # The index with maximum probability

            #confusion_matrix[label][max_index] += 1

            correct += (max_index == label)

    #cm = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
    #cm.plot()
    #print(confusion_matrix)
    accuracy = 100 * correct / len(Y_validation)
    average_loss = total_loss / len(Y_validation)

    model.train()
    return accuracy, average_loss

In [11]:
def train(model, dataset, batch_size, num_epochs, learning_rate, verbose = False, RGB=False, reg=1e-5,
          val_dataset=None, val_labels=None):
    """Train `model` with Adam and cross-entropy, validating after every epoch.

    Args:
        model: network to train; it is moved to CUDA when available.
        dataset: `torch.utils.data.Dataset` yielding `(inputs, label)`, where
            `inputs` is a tensor or a list/tuple of tensors.
        batch_size: mini-batch size (batches are reshuffled every epoch).
        num_epochs: number of passes over `dataset`.
        learning_rate: Adam learning rate.
        verbose: print per-batch / per-sample diagnostics.
        RGB: when False a channel axis is inserted at dim 1 of every input
            tensor; when True the inputs are passed through unchanged.
        reg: Adam `weight_decay`.
        val_dataset: dataset used for validation; defaults to the notebook
            global `validation_dataset`.
        val_labels: labels passed to `test`; defaults to the notebook global
            `Y_validation`.

    Returns:
        (train_loss_list, train_acc_list, val_loss_list, val_acc_list), one
        entry per epoch. Training loss is the mean of the batch losses;
        training accuracy is the percentage of correctly classified samples.

    Raises:
        ValueError: if `dataset` is not a `torch.utils.data.Dataset`.
    """
    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    # Fall back to the validation split defined at notebook level.
    if val_dataset is None:
        val_dataset = validation_dataset
    if val_labels is None:
        val_labels = Y_validation

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    val_loss_list, val_acc_list = [], []
    train_loss_list, train_acc_list = [], []
    # Per-class count of the samples seen so far (verbose diagnostics only);
    # grown on demand if a label index exceeds the initial 8 classes.
    counted_labels = [0] * 8

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg)
    criterion = nn.CrossEntropyLoss()

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)

    def _parts(batch_inputs):
        # Uniform view over single-tensor and multi-tensor inputs.
        if isinstance(batch_inputs, (list, tuple)):
            return list(batch_inputs)
        return [batch_inputs]

    def _prepare(batch_inputs):
        # Add the channel axis (grayscale case) and move the data to the
        # device the model lives on.
        prepared = [
            (part if RGB else part.unsqueeze(1)).to(device)
            for part in _parts(batch_inputs)
        ]
        if isinstance(batch_inputs, (list, tuple)):
            return prepared
        return prepared[0]

    for epoch in range(num_epochs):
        # Make sure dropout / batch-norm are in training mode for this epoch.
        model.train()
        running_loss = 0.0
        epoch_correct = 0
        epoch_seen = 0

        with tq.tqdm(total=num_batches, unit="batch") as progress_bar:
            progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

            for batch_idx, batch in enumerate(data_loader):
                inputs, labels = batch[0], batch[1]
                labels = labels.to(device)

                if verbose:
                    for part in _parts(inputs):
                        print("\ninputs shape:",part.size(),", dtype:",part.dtype," content: ",part)
                        print("min value:",torch.min(part))
                        print("max value:",torch.max(part))
                    print("\nlabels shape:",labels.size(),",dtype:",labels.dtype,", content: ",labels)

                inputs = _prepare(inputs)

                optimizer.zero_grad()
                outputs = model(inputs)

                if verbose:
                    print("\noutputs size:",outputs.size(),"content:",outputs)
                    print("List of labels until now:",counted_labels)

                # labels need to be a vector of class indexes of dim (batch_size)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

                # Batch accuracy, computed on the whole batch at once.
                predictions = torch.argmax(outputs, dim=1)
                batch_len = labels.size(0)
                correct = (predictions == labels).sum().item()

                batch_counts = torch.bincount(labels, minlength=len(counted_labels)).tolist()
                counted_labels.extend([0] * (len(batch_counts) - len(counted_labels)))
                for class_index, class_count in enumerate(batch_counts):
                    counted_labels[class_index] += class_count

                if verbose:
                    for index, output in enumerate(outputs):
                        max_index = predictions[index].item()
                        true_index = labels[index].item()
                        print("considering output at index {}: {}".format(index, output))
                        print("max output index = {}".format(max_index))
                        if true_index == max_index:
                            print("correct! in fact labels[index] = {}, max_index = {}".format(true_index, max_index))
                        else:
                            print("NOT correct! in fact labels[index] = {}, max_index = {}".format(true_index, max_index))

                # Divide by the real batch length: the last batch of an epoch
                # can be shorter than `batch_size`.
                epoch_correct += correct
                epoch_seen += batch_len
                accuracy = 100 * correct / batch_len
                average_accuracy = 100 * epoch_correct / epoch_seen
                average_loss = running_loss / (batch_idx + 1)
                progress_bar.set_postfix({"avg_loss": average_loss, "acc": accuracy, "avg_acc": average_accuracy})
                progress_bar.update(1)

        # Epoch-level training statistics.
        average_loss = running_loss / num_batches
        average_accuracy = 100 * epoch_correct / epoch_seen
        train_loss_list.append(average_loss)
        train_acc_list.append(average_accuracy)

        # Validation statistics.
        val_acc, val_loss = test(model, val_dataset, val_labels)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)

        print(f"Epoch [{epoch+1}/{num_epochs}],Train Loss: {average_loss:.4f}. Train Accuracy: {average_accuracy} Val Loss: {val_loss} Val Accuracy: {val_acc}")
    return train_loss_list, train_acc_list, val_loss_list, val_acc_list

In [12]:
class NNet1_Small(nn.Module):
    """Convolutional classifier producing 8 class probabilities per sample.

    The first convolution spans 513 columns with a (2, 513) kernel; the later
    layers only convolve and pool along the other axis. `dense1` expects 256
    flattened features, which is what the (1, 128, 513) input used in the
    notebook's `summary` call produces.

    NOTE(review): the output is already soft-maxed, while the notebook trains
    with nn.CrossEntropyLoss, which applies its own log-softmax -- confirm
    this double normalisation is intended.
    """

    def __init__(self):
        super().__init__()

        # Convolutional trunk
        self.conv1 = nn.Conv2d(1, 64, kernel_size=(2, 513))
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()

        self.maxpool1 = nn.MaxPool2d(kernel_size=(4, 1))
        self.conv2 = nn.Conv2d(64,128, kernel_size=(2, 1))
        self.bn2 = nn.BatchNorm2d(128)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(4, 1))
        self.conv3 = nn.Conv2d(128,64, kernel_size=(4, 1))
        self.bn3 = nn.BatchNorm2d(64)

        # Two parallel poolings whose results are concatenated in forward()
        self.avgpool = nn.AvgPool2d(kernel_size=(2, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(2, 1))

        # Classifier head
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.3)
        self.dense1 = nn.Linear(256, 64)
        self.bn4 = nn.BatchNorm1d(64)
        # dense2 / bn5 are registered (and therefore part of the state dict)
        # but are not used by forward().
        self.dense2 = nn.Linear(128,64)
        self.bn5 = nn.BatchNorm1d(64)
        self.dense3 = nn.Linear(64, 8)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the (N, 8) class probabilities for a batch `x`."""
        # Three convolution stages; the last one has no ReLU and no max-pool.
        x = self.maxpool1(self.relu(self.bn1(self.conv1(x.float()))))
        x = self.maxpool2(self.relu(self.bn2(self.conv2(x))))
        x = self.bn3(self.conv3(x))

        # Average- and max-pooled views stacked along the channel axis.
        pooled = torch.cat([self.avgpool(x), self.maxpool(x)], dim=1)

        hidden = self.dense1(self.flatten(pooled))
        hidden = self.relu(self.bn4(self.dropout(hidden)))
        return self.softmax(self.dense3(hidden))

In [13]:
class NNet_Raw(nn.Module):
    """1-D convolutional classifier producing 8 class probabilities per sample.

    `fc1` expects 248 flattened features (8 channels x 31 positions), which is
    what a (N, 1, 66150) input -- the size used in the notebook's `summary`
    call -- produces after the pooling / convolution stack.

    NOTE(review): the output is already soft-maxed, while the notebook trains
    with nn.CrossEntropyLoss, which applies its own log-softmax -- confirm
    this double normalisation is intended.

    Args:
        dropout_rate: probability used by the dropout layer before the output
            layer. The argument used to be ignored in favour of a hard-coded
            0.2; it is now honoured, and its default is 0.2 so that
            `NNet_Raw()` builds the same network as before.
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=16)
        self.conv2 = nn.Conv1d(32, 8, kernel_size=16)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=32)
        self.maxpool1 = nn.MaxPool1d(kernel_size=8)
        self.batchnorm1 = nn.BatchNorm1d(32)
        self.batchnorm2 = nn.BatchNorm1d(8)
        self.batchnorm3 = nn.BatchNorm1d(24)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(248, 24)
        self.fc3 = nn.Linear(24, 8)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the (N, 8) class probabilities for a batch `x`."""
        # Down-sample the signal by max-pooling before the first convolution.
        x = self.maxpool1(x.float())
        x = self.conv1(x)
        x = self.relu(x)
        x = self.batchnorm1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.batchnorm2(x)
        x = self.maxpool(x)
        # Flatten (channels, positions) into one feature vector per sample.
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [15]:
class Ensemble(nn.Module):
    """Combine the raw-audio and STFT classifiers through a small dense head.

    Each sub-network outputs 8 class probabilities; the two vectors are
    concatenated (STFT first) into 16 features and mapped to 8 probabilities.

    Args:
        raw_weights_path: state-dict file loaded into the `NNet_Raw`
            sub-network, or None to keep its random initialisation.
        stft_weights_path: state-dict file loaded into the `NNet1_Small`
            sub-network, or None to keep its random initialisation.

    Both defaults are the paths that were previously hard-coded. The files are
    loaded with `strict=False`, so keys that do not match are skipped silently.
    """

    def __init__(self, raw_weights_path="./best_models/models/NNet_Raw",
                 stft_weights_path="./best_models/models/NNet1_Small"):
        super().__init__()

        # map_location="cpu" lets a checkpoint saved from a GPU be loaded on a
        # machine without one; the whole model can be moved to a device later.
        self.raw_net = NNet_Raw()
        if raw_weights_path is not None:
            self.raw_net.load_state_dict(
                torch.load(raw_weights_path, map_location="cpu"), strict=False)
        self.stft_net = NNet1_Small()
        if stft_weights_path is not None:
            self.stft_net.load_state_dict(
                torch.load(stft_weights_path, map_location="cpu"), strict=False)

        self.dense1 = nn.Linear(16,16)
        self.bn = nn.BatchNorm1d(16)
        self.dense2 = nn.Linear(16, 8)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_value, raw=None):
        """Return the (N, 8) ensemble probabilities.

        Args:
            input_value: either the pair `[stft, raw]`, or the STFT batch alone
                when `raw` is given separately.
            raw: optional raw-audio batch. Accepting it as a second positional
                argument allows `model(stft, raw)`, the form used by helpers
                that unpack multiple inputs (`model(*inputs)`).
        """
        if raw is None:
            stft, raw = input_value[0], input_value[1]
        else:
            stft = input_value

        x_raw = self.raw_net(raw)
        x_stft = self.stft_net(stft)

        x = torch.cat([x_stft, x_raw], dim=1)
        # NOTE(review): the same BatchNorm1d instance is applied before and
        # after dense1, so both positions share parameters and running
        # statistics -- confirm this is intended.
        x = self.bn(x)
        x = self.dense1(x)
        x = self.bn(x)
        x = self.dense2(x)
        x = self.softmax(x)

        return x

In [16]:
# Building the ensemble loads the two pretrained sub-network checkpoints from
# ./best_models/models/; the cell fails with FileNotFoundError when they are absent.
model = Ensemble()
# One input size per ensemble input (presumably STFT first, then raw audio, matching forward()).
summary(model, [(1, 128, 513),(1,66150)])

FileNotFoundError: [Errno 2] No such file or directory: './best_models/models/NNet_Raw'