In [1]:
import time
import os
import pprint
import torch
import torch.nn as nn
from sklearn.metrics import ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tqdm.notebook as tq
import utils
from pydub import AudioSegment
from tkinter import Tcl # file sorting by name

In [2]:
def create_single_dataset(folder_path, tracks_dataframe, genre_dictionary):    
    labels = []
   
    _, file_list = get_sorted_file_paths(folder_path)
    
    for i,file in enumerate(file_list):
        #print("considering file:",file, "({}/{})".format(i,len(file_list)))
        track_id_clip_id = file.split('.')[0]
        track_id = track_id_clip_id.split('_')[0]
        #print("track id with clip: {}, track id: {}".format(track_id_clip_id, track_id))
        genre = tracks_dataframe.loc[int(track_id)]
        #print("genre from dataframe: ", genre)
        label = genre_dictionary[genre]
        #print("label from dictionary:",label)
        labels.append(label)
    print("labels length: {}".format(len(labels)))
    return labels
    

#create the train,validation and test vectors using the files in the train/validation/test folders
def create_dataset_splitted(folder_path):
    train_folder = os.path.join(folder_path,'train') # concatenate train folder to path
    validation_folder = os.path.join(folder_path,'validation') # concatenate train folder to path
    test_folder = os.path.join(folder_path,'test') # concatenate train folder to path
    
    print("train_folder:",train_folder)
    print("validation_folder:",validation_folder)
    print("test_folder:",test_folder,"\n")
    
    AUDIO_DIR = os.environ.get('AUDIO_DIR')
    print("audio directory: ",AUDIO_DIR)
    print("Loading tracks.csv...")
    tracks = utils.load('data/fma_metadata/tracks.csv')
    
    #get only the small subset of the dataset
    small = tracks[tracks['set', 'subset'] <= 'small']
    print("small dataset shape:",small.shape)    

    small_training = small.loc[small[('set', 'split')] == 'training']['track']
    small_validation = small.loc[small[('set', 'split')] == 'validation']['track']
    small_test = small.loc[small[('set', 'split')] == 'test']['track']

    print("Track.csv: {} training samples, {} validation samples, {} test samples\n".format(len(small_training), len(small_validation), len(small_test)))

    small_training_top_genres = small_training['genre_top']
    small_validation_top_genres = small_validation['genre_top']
    small_test_top_genres = small_test['genre_top']
    
    #create dictionary of genre classes:
    unique_genres = small_training_top_genres.unique()
    unique_genres = np.array(unique_genres)
    print("there are {} unique genres".format(len(unique_genres)))
    genre_dictionary = {}
    for i,genre in enumerate(unique_genres):
        genre_dictionary[genre] = i
    print("Dictionary of genres created:",genre_dictionary)
    
    
    Y_train = create_single_dataset(train_folder, small_training_top_genres, genre_dictionary)
    Y_validation = create_single_dataset(validation_folder, small_validation_top_genres, genre_dictionary)
    Y_test = create_single_dataset(test_folder, small_test_top_genres, genre_dictionary)
    
    return Y_train[0::20], Y_validation[0::20], Y_test[0::20]
 
def get_sorted_file_paths(folder_path):
    file_list = os.listdir(folder_path)
    #sort the dataset files in alphabetical order (important to associate correct labels created using track_id in track.csv)
    file_list = Tcl().call('lsort', '-dict', file_list) # sort file by name: 2_0,2_1, ... 2_9,3_0, ... 400_0,400_1, ...
    file_paths = [os.path.join(folder_path, file_name) for file_name in file_list] #join filename with folder path
    #print("There are {} in the folder: {}".format(len(file_list),file_list))
    return file_paths, file_list
    
    
folder_path="data/fma_small_stft_transposed_22050_overlapped"
Y_train, Y_validation, Y_test = create_dataset_splitted(folder_path)

train_folder: data/fma_small_stft_transposed_22050_overlapped/train
validation_folder: data/fma_small_stft_transposed_22050_overlapped/validation
test_folder: data/fma_small_stft_transposed_22050_overlapped/test 

audio directory:  ./data/fma_small/
Loading tracks.csv...
small dataset shape: (8000, 52)
Track.csv: 6400 training samples, 800 validation samples, 800 test samples

there are 8 unique genres
Dictionary of genres created: {'Hip-Hop': 0, 'Pop': 1, 'Folk': 2, 'Rock': 3, 'Experimental': 4, 'International': 5, 'Electronic': 6, 'Instrumental': 7}
labels length: 127940
labels length: 16000
labels length: 16000


In [3]:
def reduce_files(file_list):
    files=[]
    sample=[]
    count=0
    #print(file_list[0])
    for file in file_list[0]:
        count+=1
        sample.append(file)
        if count>0 and count%20==0:
            #print("ciao")
            files.append(sample)
            sample=[]
        
            
        
    #print(files)
    return files

In [4]:
class MyDatasetRaw(Dataset):
    def __init__(self, file_list, labels, transform=None, verbose=False):
        self.file_list = file_list
        #print(file_list)
        print("Element in this set:",len(file_list))
        self.labels=labels
        self.transform = transform
        self.verbose=verbose
        #print(file_list)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = torch.tensor(self.labels[idx])
        
        #print(self.file_list)
        
        raw_vector=[]
        
        for file in file_path:
            raw_vector.append(np.load(file))
        
        raw_vector=torch.tensor(np.array(raw_vector))
        #print("raw vector shape:",raw_vector.shape)
        
        # Calculate the indices to keep (even indices)
        indices_to_keep = torch.arange(0, raw_vector.size(0), 2)

        # Use the calculated indices to select the elements to keep
        raw_vector= raw_vector.index_select(0, indices_to_keep)
        
            
        
        '''
        # Normalize your data here
        if self.transform:
            
            #convert to float64 tensor
            raw_vector = raw_vector.double()
            if(self.verbose==True):
                print("TRANSFORM: applying transform to tensor shape:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.unsqueeze(raw_vector, dim=0)
            #print("TRANSFORM: after first unsqueeze:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.unsqueeze(raw_vector, dim=0) #unsqueeze two times (needed for torchvision normalize method)
            #print("TRANSFORM: after second unsqueeze:",raw_vector.shape,"content:",raw_vector)
            raw_vector = self.transform(raw_vector) #normalize the sample
            if(self.verbose==True):
                print("TRANSFORM: after transform shape:",raw_vector.shape,"content:",raw_vector)
            raw_vector = torch.squeeze(raw_vector, dim=0)
            raw_vector = torch.squeeze(raw_vector, dim=0)
            if(self.verbose==True):
                print("TRANSFORM: after double squeeze shape:",raw_vector.shape,"content:",raw_vector)
            '''
        
        return raw_vector, label  

In [5]:
folder_path="./data/fma_small_raw_array_22050_overlapped"

train_folder = os.path.join(folder_path,'train') # concatenate train folder to path
validation_folder = os.path.join(folder_path,'validation') # concatenate train folder to path
test_folder = os.path.join(folder_path,'test') # concatenate train folder to path

train_file_paths, _ = get_sorted_file_paths(train_folder)
train_dataset = MyDatasetRaw(reduce_files(get_sorted_file_paths(train_folder)), Y_train)
print("len of train dataset: ",len(train_dataset))

validation_file_paths, _ = get_sorted_file_paths(validation_folder)
validation_dataset = MyDatasetRaw(reduce_files(get_sorted_file_paths(validation_folder)), Y_validation)
print("len of validation dataset: ",len(validation_dataset))

test_file_paths, _ = get_sorted_file_paths(test_folder)
test_dataset = MyDatasetRaw(reduce_files(get_sorted_file_paths(test_folder)), Y_test)
print("len of test dataset: ",len(test_dataset))

Element in this set: 6397
len of train dataset:  6397
Element in this set: 800
len of validation dataset:  800
Element in this set: 800
len of test dataset:  800


In [6]:
def test(model, validation_dataset, Y_validation):
    # Stop parameters learning
    model.eval()

    validation_loader = torch.utils.data.DataLoader(validation_dataset)

    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    total_loss = 0
    confusion_matrix = np.zeros((8, 8), dtype=int)

    with torch.no_grad():
        for sample, label in validation_loader:
            
            sample = sample.unsqueeze(1)

            # Predict label
            output = model(sample)
            
            # Compute loss
            loss = criterion(output, label)
            total_loss += loss.item()

            max_index = torch.argmax(output).item()  # The index with maximum probability

            confusion_matrix[label][max_index] += 1

            correct += (max_index == label)

    #cm = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
    #cm.plot()
    print(confusion_matrix)
    accuracy = 100 * correct / len(Y_validation)
    average_loss = total_loss / len(Y_validation)

    model.train()
    return accuracy, average_loss

In [7]:
def train(model, dataset, batch_size, num_epochs, learning_rate, verbose = False, RGB=False, reg=1e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    val_loss_list=[]
    val_acc_list=[]
    train_loss_list=[]
    train_acc_list=[]
    counted_labels=[0,0,0,0,0,0,0,0]
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg)
    criterion = nn.CrossEntropyLoss()

    if not isinstance(dataset, Dataset):
        raise ValueError("The dataset parameter should be an instance of torch.utils.data.Dataset.")

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)
    
    
    for epoch in range(num_epochs):
        running_loss = 0.0 
        running_accuracy = 0.0
        #initialize correctly predicted samples
        
        # Initialize the progress bar
        progress_bar = tq.tqdm(total=num_batches, unit="batch")
    
        # Initialize the progress bar description
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")
        start_time = time.time()
        
        for batch_idx, batch in enumerate(data_loader):
            
            correct = 0 # reset train accuracy each batch
            
            inputs,labels = batch[0],batch[1]
            if(verbose == True):
                print("\ninputs shape:",inputs.size(),", dtype:",inputs.dtype," content: ",inputs)
                print("min value:",torch.min(inputs))
                print("max value:",torch.max(inputs))
                print("\nlabels shape:",labels.size(),",dtype:",labels.dtype,", content: ",labels)
            if(RGB==False):
                inputs = inputs.unsqueeze(1) #add a dimension if input is to be considered just grayscale
                #if input is RGB, there are already 3 channels
            
            # Extract the inputs and targets
            optimizer.zero_grad()
            outputs = model(inputs)
            
            if(verbose == True):
                print("\noutputs size:",outputs.size(),"content:",outputs)
                print("List of labels until now:",counted_labels)

            loss = criterion(outputs, labels) #labels need to be a vector of class indexes (0-7) of dim (batch_size)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
            #calculate train accuracy
            for index, output in enumerate(outputs):
                max_index = torch.argmax(output).item() #the index with maximum probability
                counted_labels[labels[index].item()]+=1
                if(labels[index].item() == max_index):
                    correct += 1
            
                if(verbose==True):
                    print("considering output at index {}:".format(index,output))
                    print("max output index = {}",max_index)
                    if(labels[index].item() == max_index):
                        print("correct! in fact labels[index] = {}, max_index = {}".format(labels[index].item(),max_index))
                    else:
                        print("NOT correct! in fact labels[index] = {}, max_index = {}".format(labels[index].item(),max_index))

            
            accuracy = 100 * correct / batch_size
            running_accuracy += accuracy #epoch running_accuracy
            
            # Update the progress bar description and calculate bps
            #progress_bar.set_postfix({"Loss": running_loss / (batch_idx + 1)})
            average_accuracy = running_accuracy / (batch_idx + 1)
            average_loss = running_loss / (batch_idx + 1)
            progress_bar.set_postfix({"avg_loss": average_loss, "acc": accuracy, "avg_acc": average_accuracy})

            # Update the progress bar
            progress_bar.update(1)
            # Evaluate the model on the validation dataset
        
        #calculate train loss and accuracy
        average_loss = running_loss / len(data_loader)
        average_accuracy = running_accuracy / len(data_loader)
        train_loss_list.append(average_loss)
        train_acc_list.append(average_accuracy)
        
        #calculate validation loss and accuracy
        val_acc, val_loss = test(model, validation_dataset, Y_validation)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)
        
        
        print(f"Epoch [{epoch+1}/{num_epochs}],Train Loss: {average_loss:.4f}. Train Accuracy: {average_accuracy} Val Loss: {val_loss} Val Accuracy: {val_acc}")
        progress_bar.close()
    return train_loss_list, train_acc_list, val_loss_list, val_acc_list

In [8]:
class MyLSTMNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(MyLSTMNetwork, self).__init__()

        self.bn1=nn.BatchNorm1d(300)
        self.bn2=nn.BatchNorm1d(128)
        self.bn3=nn.BatchNorm1d(32)
        
        self.maxpool=nn.MaxPool1d(kernel_size=30)
        self.maxpool1=nn.MaxPool1d(kernel_size=128)
        self.minpool=nn.AvgPool1d(kernel_size=30)
        self.relu=nn.ReLU()
        # LSTM layer for sequence encoding
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,bidirectional=True)
        self.lstm2 = nn.LSTM(2*hidden_size, 200, num_layers, batch_first=True)
        # Feedforward layers for classification
        self.fc1 = nn.Linear(2000, 300)
        self.fc2 = nn.Linear(300, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32,num_classes)
        self.dropout=nn.Dropout(0.2)
        self.softmax=nn.Softmax(dim=1)

    def forward(self, x):
        # LSTM encoding
        #print(x.shape)
        batch=x.shape[0]
        x=x.squeeze(dim=1)
        x=x.float()
        #x=self.batchnorm(x)
        #input_data = x.view(128, 20,66150)
        out=self.maxpool(x)
        x=self.minpool(x)
        out = torch.cat([out, x], dim=2)
        
        out, _ = self.lstm1(out)
        out, _ = self.lstm2(out)
        
        # Get the encoding of the last sequence element (last vector)
        #last_output = out[:, -1, :]
        concatenated_output = out.contiguous().view(batch, -1)
        out=torch.relu(concatenated_output)
        # Feedforward layers for classification
        out = self.fc1(out)
        out=self.dropout(out)
        out =self.bn1(out)
        out = torch.relu(out)
        out = self.fc2(out)
        out=self.dropout(out)
        out =self.bn2(out)
        out = torch.relu(out)
        out = self.fc3(out)
        out=self.dropout(out)
        out =self.bn3(out)
        out = torch.relu(out)
        out = self.fc4(out)
        out=self.softmax(out)
        
        return out

# Define the hyperparameters
input_size = 4410  # Input vector size
hidden_size = 400   # LSTM hidden size
num_layers = 3     # Number of LSTM layers
num_classes = 8  # Number of output classes (adjust as needed)

# Create an instance of the model
model = MyLSTMNetwork(input_size, hidden_size, num_layers, num_classes)


In [9]:
train(model, train_dataset, batch_size=128, num_epochs=40, learning_rate=0.0001, reg=0.0001)

  0%|          | 0/50 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [14]:
class MyRNNNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(MyRNNNetwork, self).__init__()
        
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(20)
        self.bn = nn.BatchNorm2d(1)
        
        #self.maxpool = nn.MaxPool1d(kernel_size=5)
        #self.minpool = nn.AvgPool1d(kernel_size=5)
        
        # RNN layer for sequence encoding (replace LSTM with RNN)
        self.rnn1 = nn.RNN(input_size, hidden_size, num_layers, nonlinearity ='relu',batch_first=True, dropout=0.2)
        self.rnn2= nn.RNN(hidden_size, 50, num_layers, nonlinearity ='relu', batch_first=True, dropout=0.2)
        
        # RNN layer for sequence encoding (replace LSTM with RNN)
        #self.rnn1 = nn.GRU(input_size, hidden_size, num_layers,batch_first=True)
        #self.rnn2= nn.GRU(hidden_size, 50, num_layers, batch_first=True)
        
        
        # Feedforward layers for classification
        self.fc1 = nn.Linear(500, 100)
        self.fc2 = nn.Linear(100, 20)
        self.fc3 = nn.Linear(20, num_classes)
        self.dropout = nn.Dropout(0.4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # LSTM encoding
        batch = x.shape[0]
        #x=self.bn(x.float())
        x = x.squeeze(dim=1)
        
        #out = self.maxpool(x)
        #x = self.minpool(x)
        #out = torch.cat([out, x], dim=2)
        
        out, _ = self.rnn1(x.float())  # Use RNN instead of LSTM
        out, _ = self.rnn2(out)
        
        concatenated_output = out.contiguous().view(batch, -1)
        out = torch.relu(concatenated_output)
        
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.bn1(out)
        out = torch.relu(out)
        out = self.fc2(out)
        out=self.dropout(out)
        out = self.bn2(out)
        out = torch.relu(out)
        out = self.fc3(out)
        out = self.softmax(out)
        
        return out

# Define the hyperparameters
input_size = 66150  # Input vector size
hidden_size = 300   # LSTM hidden size
num_layers = 2    # Number of LSTM layers
num_classes = 8  # Number of output classes (adjust as needed)

# Create an instance of the model
model = MyRNNNetwork(input_size, hidden_size, num_layers, num_classes)


In [None]:
#summary(model,(1,20,66150))
train(model, train_dataset, batch_size=32, num_epochs=40, learning_rate=0.001, reg=0.001)

  0%|          | 0/200 [00:00<?, ?batch/s]

[[ 0  0 48  0  0  0 52  0]
 [ 1  0 80  0  0  0 19  0]
 [ 0  0 96  0  0  0  4  0]
 [ 2  0 75  1  0  0 22  0]
 [ 0  0 93  0  0  0  7  0]
 [ 1  0 91  0  0  0  8  0]
 [ 3  0 72  0  0  0 25  0]
 [ 0  0 93  0  0  0  7  0]]
Epoch [1/40],Train Loss: 2.0586. Train Accuracy: 16.640625 Val Loss: 2.066371323019266 Val Accuracy: tensor([15.2500])


  0%|          | 0/200 [00:00<?, ?batch/s]

[[35  0 42  0  0  0 23  0]
 [11  0 70  0  0  0 12  7]
 [ 2  0 94  0  0  0  0  4]
 [16  0 74  0  0  0 10  0]
 [ 5  0 71  0  0  0  4 20]
 [ 4  0 74  0  0  0  4 18]
 [17  0 68  0  0  0 14  1]
 [ 5  0 90  0  0  0  4  1]]
Epoch [2/40],Train Loss: 2.0474. Train Accuracy: 18.453125 Val Loss: 2.0543525072932245 Val Accuracy: tensor([18.])


  0%|          | 0/200 [00:00<?, ?batch/s]

[[14  0 49  0  0  0 37  0]
 [ 8  0 75  0  0  0 17  0]
 [ 2  0 97  0  0  0  1  0]
 [ 8  0 73  0  0  0 19  0]
 [ 4  0 91  0  0  0  5  0]
 [ 7  0 89  0  0  0  4  0]
 [10  0 70  0  0  0 20  0]
 [ 4  0 92  0  0  0  4  0]]
Epoch [3/40],Train Loss: 2.0429. Train Accuracy: 18.625 Val Loss: 2.0520949923992156 Val Accuracy: tensor([16.3750])


  0%|          | 0/200 [00:00<?, ?batch/s]

[[49  0 31  0  0  0 20  0]
 [21  0 70  0  0  0  9  0]
 [ 5  0 90  0  0  0  5  0]
 [39  0 56  0  0  0  5  0]
 [11  0 87  0  0  0  2  0]
 [10  0 86  0  0  0  4  0]
 [27  0 58  0  0  0 15  0]
 [14  0 85  0  0  0  1  0]]
Epoch [4/40],Train Loss: 2.0339. Train Accuracy: 20.015625 Val Loss: 2.0449326513707637 Val Accuracy: tensor([19.2500])


  0%|          | 0/200 [00:00<?, ?batch/s]

[[56  0 28  0  0  0 16  0]
 [19  0 71  0  0  0 10  0]
 [ 5  0 95  0  0  0  0  0]
 [46  0 49  0  0  0  5  0]
 [13  0 86  0  0  0  1  0]
 [17  0 80  0  0  0  3  0]
 [31  0 54  0  0  0 15  0]
 [15  0 85  0  0  0  0  0]]
Epoch [5/40],Train Loss: 2.0310. Train Accuracy: 20.46875 Val Loss: 2.04007319688797 Val Accuracy: tensor([20.7500])


  0%|          | 0/200 [00:00<?, ?batch/s]

[[53  0 31  0  0  0 16  0]
 [18  0 73  0  0  0  9  0]
 [ 6  0 94  0  0  0  0  0]
 [26  0 61  0  0  0 13  0]
 [10  0 87  0  0  0  3  0]
 [ 8  0 82  0  5  0  5  0]
 [27  0 56  0  0  0 17  0]
 [10  0 88  0  0  0  2  0]]
Epoch [6/40],Train Loss: 2.0278. Train Accuracy: 21.21875 Val Loss: 2.0401698476076127 Val Accuracy: tensor([20.5000])


  0%|          | 0/200 [00:00<?, ?batch/s]