In [1]:
!wget -O fma_small.zip https://os.unil.cloud.switch.ch/fma/fma_small.zip
!wget https://os.unil.cloud.switch.ch/fma/fma_metadata.zip

--2023-04-29 10:49:56--  https://os.unil.cloud.switch.ch/fma/fma_small.zip
Resolving os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)... 86.119.28.16
Connecting to os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)|86.119.28.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7679594875 (7.2G) [application/zip]
Saving to: ‘fma_small.zip’


2023-04-29 10:59:30 (12.8 MB/s) - ‘fma_small.zip’ saved [7679594875/7679594875]

--2023-04-29 10:59:31--  https://os.unil.cloud.switch.ch/fma/fma_metadata.zip
Resolving os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)... 86.119.28.16
Connecting to os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)|86.119.28.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358412441 (342M) [application/zip]
Saving to: ‘fma_metadata.zip’


2023-04-29 10:59:58 (13.1 MB/s) - ‘fma_metadata.zip’ saved [358412441/358412441]



In [3]:
import zipfile
from tqdm import tqdm

with zipfile.ZipFile('fma_small.zip', 'r') as zip_ref:
    # Get the total number of files to be extracted
    total_files = len(zip_ref.infolist())
    with tqdm(total=total_files, desc='Extracting files') as pbar:
        for file in zip_ref.infolist():
            zip_ref.extract(file)
            pbar.update()

with zipfile.ZipFile('fma_metadata.zip', 'r') as zip_ref:
    # Get the total number of files to be extracted
    total_files = len(zip_ref.infolist())
    with tqdm(total=total_files, desc='Extracting files') as pbar:
        for file in zip_ref.infolist():
            zip_ref.extract(file)
            pbar.update()

Extracting files: 100%|██████████████████████████████████████████████████████████████████████████████| 8002/8002 [14:30<00:00,  9.19it/s]
Extracting files: 100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:49<00:00,  9.13s/it]


In [4]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# Load metadata
tracks = pd.read_csv('fma_metadata/tracks.csv', index_col=0, header=[0, 1])

# Filter top genres
genres = ['Hip-Hop', 'Rock', 'Pop', 'Folk', 'Experimental', 'Electronic', 'Instrumental', 'International']
tracks_top_genres = tracks[tracks[('track', 'genre_top')].isin(genres)]

# Create directories for each genre
os.makedirs('fma_filtered', exist_ok=True)
for genre in genres:
    os.makedirs(f'fma_filtered/{genre}', exist_ok=True)

# Number of songs per genre to copy
x = 10

# Counter for songs copied per genre
counter = {genre: 0 for genre in genres}

# Source directory
src_dir = 'fma_small'

# Iterate over subdirectories in fma_small
for subdir in os.listdir(src_dir):
    # Check if it's a directory
    if os.path.isdir(os.path.join(src_dir, subdir)):
        # Iterate over songs in each subdirectory
        for song in os.listdir(os.path.join(src_dir, subdir)):
            # Check if song exists in metadata
            song_id = int(song.split('.')[0])
            if song_id in tracks_top_genres.index:
                # Get the genre of the song
                genre = tracks_top_genres.loc[song_id][('track', 'genre_top')]
                # Check if we've already copied enough songs of this genre
                if counter[genre] < x:
                    # Copy the song to the corresponding genre folder
                    shutil.copy(os.path.join(src_dir, subdir, song), f'fma_filtered/{genre}/{str(counter[genre]).zfill(4)}_{genre}.mp3')
                    # Update the counter
                    counter[genre] += 1

# Print the counter to check how many songs were copied for each genre
print(counter)

{'Hip-Hop': 10, 'Rock': 10, 'Pop': 10, 'Folk': 10, 'Experimental': 10, 'Electronic': 10, 'Instrumental': 10, 'International': 10}


In [5]:
import os
import shutil
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.ndimage import zoom

# Create directories for spectrograms
os.makedirs('fma_spectrograms', exist_ok=True)
for genre in genres:
    os.makedirs(f'fma_spectrograms/{genre}', exist_ok=True)

# Convert audio files to spectrograms and save them in the 'fma_spectrograms' directory
for genre in genres:
    for song in os.listdir(f'fma_filtered/{genre}'):
        # Load the audio file
        y, sr = librosa.load(f'fma_filtered/{genre}/{song}')
        
        # Compute the spectrogram
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

        # Resize the spectrogram
        resize_shape = (224, 224)
        D_resized = zoom(D, (resize_shape[0]/D.shape[0], resize_shape[1]/D.shape[1]))

        # Plot and save the spectrogram
        plt.figure()
        plt.imshow(D_resized, aspect='auto', cmap='viridis')

        # Remove axes
        plt.axis('off')

        plt.savefig(f'fma_spectrograms/{genre}/{song.split(".")[0]}.png', bbox_inches='tight', pad_inches = 0)
        plt.close()

In [19]:
#!git clone https://github.com/microsoft/unilm.git

In [76]:
import os
import torch
import librosa
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
from unilm.beats.BEATs import BEATs, BEATsConfig
from unilm.beats.Tokenizers import Tokenizers, TokenizersConfig

In [4]:
# Load the audio files and genre labels
genres = ['Hip-Hop', 'Rock', 'Pop', 'Folk', 'Experimental', 'Electronic', 'Instrumental', 'International']
audio_files = [] 
genre_labels = []
for genre in genres:
    for song in os.listdir(f'../fma_filtered/{genre}'):
        audio_files.append(f'../fma_filtered/{genre}/{song}')
        genre_labels.append(genre)

In [5]:
def load_audio(file_path, sr=16000):
    y, _ = librosa.load(file_path, sr=sr)
    return y

In [73]:
# Tokenize the audio files using the pre-trained tokenizer
tokenizer_checkpoint = torch.load('Tokenizer_iter3_plus_AS2M.pt')
tokenizer_cfg = TokenizersConfig(tokenizer_checkpoint['cfg'])
tokenizer = Tokenizers(tokenizer_cfg)
tokenizer.load_state_dict(tokenizer_checkpoint['model'])
tokenizer.eval()

total_audio_files = len(audio_files)
tokenized_audio = []

with tqdm(total=total_audio_files, desc='Tokenizing audio files') as pbar:
    for file in audio_files:
        tokenized = tokenizer.extract_labels(torch.tensor(load_audio(file)).unsqueeze(0), 
                                             padding_mask=torch.zeros(1, len(load_audio(file))).bool())
        tokenized_audio.append(tokenized)
        pbar.update()

Tokenizing audio files: 100%|███████████████████| 80/80 [11:01<00:00,  8.27s/it]


In [90]:
tokenized_audio_tensor = torch.stack(tokenized_audio)
tokenized_audio = tokenized_audio_tensor.numpy()

In [108]:
# Prepare the dataset for training
class AudioDataset(Dataset):
    def __init__(self, audio_data, labels):
        self.audio_data = audio_data
        self.labels = labels

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        return self.audio_data[idx], self.labels[idx]

le = LabelEncoder()
genre_labels_enc = le.fit_transform(genre_labels)
    
X_train, X_test, y_train, y_test = train_test_split(tokenized_audio, genre_labels_enc, test_size=0.1, random_state=42,
                                                   stratify=genre_labels)

train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [109]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = SVC()
classifier.fit(X_train, y_train)

In [110]:
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Mean-per-class accuracy on train set: {accuracy_train * 100:.2f}%")
print(f"Mean-per-class accuracy on test set: {accuracy_test * 100:.2f}%")

Mean-per-class accuracy on train set: 100.00%
Mean-per-class accuracy on test set: 37.50%


In [17]:
# Fine-tune the BEATs model for genre classification
beats_checkpoint = torch.load('BEATs_iter3_plus_AS2M.pt')
beats_cfg = BEATsConfig(beats_checkpoint['cfg'])
beats_model = BEATs(beats_cfg)
beats_model.load_state_dict(beats_checkpoint['model'])
beats_model.eval()

# Update the final classification layer to match the number of genre classes
num_classes = len(set(genre_labels))
if beats_model.predictor is not None:
    beats_model.predictor = torch.nn.Linear(beats_model.predictor.in_features, num_classes)
else:
    # Assumes the encoder_embed_dim is the input feature size for the predictor
    input_features = beats_model.cfg.encoder_embed_dim
    beats_model.predictor = torch.nn.Linear(input_features, num_classes)

In [40]:
# Fine-tune the model
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#beats_model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(beats_model.parameters(), lr=1e-4)

num_epochs = 10
for epoch in range(num_epochs):
    beats_model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        #data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
#         #pad = torch.zeros(1, len(data)).bool()
#         pad = nn.functional.pad(data, (0, 0, 7, 7))
#         output, _ = beats_model.extract_features(data, padding_mask=pad)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()
        # create padding mask
        batch_size, n_channels, n_frames = data.shape
        max_frames = max([x.shape[0] for x in data])
        pad = (0, 0, 0, max_frames - n_frames)
        padding_mask = torch.ones_like(data)
        padding_mask = torch.nn.functional.pad(padding_mask, pad, mode='constant', value=0)
        data = torch.nn.functional.pad(data, pad, mode='constant', value=0)
        
        optimizer.zero_grad()
        output, _ = beats_model.extract_features(data, padding_mask)
        output = output.float()  # Convert output tensor to float
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    beats_model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            #data, target = data.to(device), target.to(device)
            # create padding mask
            batch_size, n_channels, n_frames = data.shape
            max_frames = max([x.shape[0] for x in data])
            pad = (0, 0, 0, max_frames - n_frames)
            padding_mask = torch.ones_like(data)
            padding_mask = torch.nn.functional.pad(padding_mask, pad, mode='constant', value=0)
            data = torch.nn.functional.pad(data, pad, mode='constant', value=0)
            
            output, _ = beats_model.extract_features(data, padding_mask)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}, Test Accuracy: {correct / total * 100:.2f}%')

ValueError: not enough values to unpack (expected 3, got 2)