## Install and Imports

In [None]:
import os
# Debugging aid: ask CUDA to run kernel launches synchronously so that a
# device-side error is reported at the call that triggered it. Set here, in the
# first cell, so it is in place before any CUDA work starts.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
from tqdm import tqdm
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import Wav2Vec2ForCTC, AutoFeatureExtractor, Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification, AdamW

In [None]:
# Mount Google Drive to get a persistent directory structure, then switch the
# working directory to the project folder so the relative paths used in the
# following cells (e.g. 'fma_filtered/...') resolve inside it.
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/deep_learning/final_proj')

Mounted at /content/drive


## Data processing

In [None]:
# Collect every audio file together with the genre of the folder it lives in.
# `audio_files[i]` and `genre_labels[i]` always describe the same track.
genres = ['Hip-Hop', 'Rock', 'Pop', 'Folk', 'Experimental', 'Electronic', 'Instrumental', 'International']
audio_files = []
genre_labels = []
for genre in genres:
    # os.listdir returns entries in arbitrary order; sort them so the file order
    # (and everything derived from it) is the same on every run.
    for song in sorted(os.listdir(f'fma_filtered/{genre}')):
        audio_files.append(f'fma_filtered/{genre}/{song}')
        genre_labels.append(genre)

In [None]:
import os
# Report how many entries the 'preprocessed_tensors' directory currently holds.
counter = len(os.listdir('preprocessed_tensors'))
print(counter)

7973


In [None]:
# Total number of audio files gathered across all genre folders.
len(audio_files)

7977

In [None]:
def preprocess_audio_file(audio_file, feature_extractor, max_length, target_sr=16000):
    """Load one audio file and convert it into model input values.

    The audio is down-mixed to mono, resampled to ``target_sr`` when its native
    rate differs, truncated to at most ``max_length`` seconds and then passed
    through ``feature_extractor``. Clips shorter than ``max_length`` are kept at
    their own length (no padding is applied here).

    Args:
        audio_file: Path of the audio file, read with ``torchaudio.load``.
        feature_extractor: Callable invoked as
            ``feature_extractor(array, sampling_rate=..., return_tensors="pt")``
            whose result exposes an ``input_values`` tensor.
        max_length: Maximum clip duration in seconds (int or float).
        target_sr: Sampling rate in Hz that the waveform is converted to.

    Returns:
        ``features.input_values`` after ``squeeze(0)``.

    Raises:
        Any exception raised by ``torchaudio.load`` for unreadable files.
    """
    waveform, sr = torchaudio.load(audio_file)
    waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono by averaging the channels

    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)

    # The sample budget must be an integer: a fractional ``max_length``
    # (e.g. 7.5 s) would otherwise yield a float slice bound and raise TypeError.
    max_samples = int(max_length * target_sr)
    if waveform.size(1) > max_samples:
        waveform = waveform[:, :max_samples]

    features = feature_extractor(waveform.numpy(), sampling_rate=target_sr, return_tensors="pt")
    input_values = features.input_values.squeeze(0)

    return input_values

# Preprocess every audio file once and cache the result as a tensor on disk.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
max_length = 30  # seconds of audio kept per track
save_dir = 'preprocessed_tensors_full'

os.makedirs(save_dir, exist_ok=True)

# Genre of every track that has a tensor in `save_dir`, in `audio_files` order.
genre_labels_new = []

with tqdm(total=len(audio_files), desc='Saving processed files') as pbar:
    for audio_file, genre_label in zip(audio_files, genre_labels):
        tensor_filename = os.path.join(save_dir, os.path.basename(audio_file) + '.pt')
        try:
            if not os.path.exists(tensor_filename):
                input_values = preprocess_audio_file(audio_file, feature_extractor, max_length)
                torch.save(input_values, tensor_filename)
            # Record the label only once the tensor is known to be on disk. This
            # also covers tensors cached by an earlier (possibly interrupted) run;
            # before, re-running the cell skipped those files and left them
            # without a label.
            genre_labels_new.append(genre_label)
        except Exception as e:
            # Best effort: unreadable or corrupt files are reported and skipped.
            print(f"Error preprocessing file {audio_file}: {e}")
        pbar.update()

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Saving processed files:   2%|▏         | 174/7977 [01:06<19:55,  6.53it/s]

Error preprocessing file fma_filtered/Hip-Hop/0176_Hip-Hop.mp3: Failed to process a packet. (Invalid data found when processing input). 
Error preprocessing file fma_filtered/Hip-Hop/0177_Hip-Hop.mp3: Failed to process a packet. (Invalid data found when processing input). 


Saving processed files:   3%|▎         | 212/7977 [01:11<12:21, 10.48it/s]

Error preprocessing file fma_filtered/Hip-Hop/0212_Hip-Hop.mp3: Failed to process a packet. (Invalid data found when processing input). 


Saving processed files:  19%|█▉        | 1537/7977 [15:23<2:20:20,  1.31s/it]

Error preprocessing file fma_filtered/Rock/0541_Rock.mp3: Failed to open the input "fma_filtered/Rock/0541_Rock.mp3" (Invalid argument).


Saving processed files: 100%|██████████| 7977/7977 [2:48:34<00:00,  1.27s/it]


In [None]:
class AudioDataset(torch.utils.data.Dataset):
    """Dataset of preprocessed audio tensors stored on disk, paired with genre names.

    ``tensor_files[i]`` is a path readable by ``torch.load`` and
    ``genre_labels[i]`` is the genre name that belongs to it.
    """

    def __init__(self, tensor_files, genre_labels):
        self.tensor_files, self.genre_labels = tensor_files, genre_labels

    def __len__(self):
        """Number of samples, i.e. the number of tensor files."""
        sample_count = len(self.tensor_files)
        return sample_count

    def __getitem__(self, idx):
        """Return ``(input_values, label)`` for the sample at position ``idx``.

        ``label`` is the position of the sample's genre name within the
        module-level ``genres`` list.
        """
        stored = torch.load(self.tensor_files[idx])
        squeezed = stored.squeeze(0)  # drop a leading size-1 dimension if present
        genre_name = self.genre_labels[idx]
        return squeezed, genres.index(genre_name)

In [None]:
# Persist the collected genre labels as a NumPy string array.
labels_array = np.array(genre_labels_new, dtype=str)
np.save('genre_labels_new.npy', labels_array, allow_pickle=True)

In [None]:
# load labels
# After this cell `genre_labels_new` is a NumPy array rather than a Python list.
# NOTE(review): the array is written with dtype=str, so allow_pickle=True is
# presumably unnecessary when reading it back — confirm and consider disabling it.
genre_labels_new = np.load('genre_labels_new.npy', allow_pickle=True)

In [None]:
# Pair every cached tensor with the genre of the audio file it was made from.
# Labels are looked up by file name rather than taken positionally from a
# separately built list: os.listdir returns entries in arbitrary order, so a
# positional pairing attaches the wrong genre to tensors.
save_dir = "preprocessed_tensors_full"
genre_by_tensor_name = {
    os.path.basename(audio_file) + '.pt': genre
    for audio_file, genre in zip(audio_files, genre_labels)
}

tensor_files = []
tensor_labels = []
for file in sorted(os.listdir(save_dir)):  # sorted => same sample order on every run
    if file not in genre_by_tensor_name:
        # Without a matching audio file the genre is unknown; do not guess.
        print(f"Skipping {save_dir}/{file}: no matching audio file, genre unknown")
        continue
    tensor_files.append(f'{save_dir}/{file}')
    tensor_labels.append(genre_by_tensor_name[file])

full_dataset = AudioDataset(tensor_files, tensor_labels)

In [None]:
# save dataset
# This stores the AudioDataset object itself, which holds only the tensor file
# paths and the genre labels; the audio tensors stay in their own files.
# NOTE(review): presumably serialised via pickle — confirm.
torch.save(full_dataset, "full_dataset.pt")

In [None]:
# load dataset
# NOTE(review): restoring this object presumably requires the AudioDataset class
# to be defined in the session first, and newer PyTorch releases may need
# weights_only=False to load non-tensor objects — confirm.
full_dataset = torch.load("full_dataset.pt")

In [None]:
# Split the data into training (60%), validation (20%) and testing (20%).
SPLIT_SEED = 42  # use the same value in every model notebook so all models share one split

# Define the indices
indices = list(range(len(full_dataset)))
# Shuffle with a dedicated, seeded generator so the split is reproducible after a
# kernel restart and neither depends on nor disturbs NumPy's global random state.
# (Reproducibility also requires the dataset's sample order to be stable.)
np.random.default_rng(SPLIT_SEED).shuffle(indices)

train_split = int(np.floor(0.6 * len(full_dataset)))
valid_split = int(np.floor(0.8 * len(full_dataset)))

train_indices = indices[:train_split]
valid_indices = indices[train_split:valid_split]
test_indices = indices[valid_split:]

# Create Samplers (each one draws only from its own subset of indices)
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(valid_indices)
test_sampler = SubsetRandomSampler(test_indices)

batch_size = 8

# Create DataLoaders: all three wrap the same dataset and differ only in sampler
dataloaders = {
    'train': DataLoader(full_dataset, batch_size=batch_size, sampler=train_sampler),
    'valid': DataLoader(full_dataset, batch_size=batch_size, sampler=valid_sampler),
    'test': DataLoader(full_dataset, batch_size=batch_size, sampler=test_sampler)
}

In [None]:
# Peek at the first tensor path to check the naming scheme of the cached files.
tensor_files[0]

'preprocessed_tensors_full/0999_Instrumental.mp3.pt'

In [None]:
# Shape of the input tensor at position 0 of the dataset wrapped by the train loader.
# Indexing `.dataset` directly bypasses the sampler, so this is not necessarily a
# training sample.
dataloaders['train'].dataset[0][0].shape

torch.Size([479626])