In [71]:
!pip -qqq install transformers datasets nnAudio

In [72]:
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
from torch import nn
import torchaudio.transforms as T
from datasets import Dataset, Audio, concatenate_datasets, Split
import os

In [73]:
# mount drive and set path to dataset
from google.colab import drive
drive.mount('/content/drive')
data_dir = "/content/drive/Shareddrives/DeepLearningProject/minibabyslakh"
# make sure 
os.listdir(data_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['train', 'test']

In [74]:
# loading our model weights
model = AutoModel.from_pretrained("m-a-p/MERT-v0", trust_remote_code=True)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0",trust_remote_code=True)

In [75]:
# # load demo audio and set processor
# dataset = Dataset.load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
# dataset = dataset.sort("id")
# sampling_rate = dataset.features["audio"].sampling_rate

In [88]:
# Function to load the audio files from the directory structure
def get_data_files(directory):
    bass_files = []
    residual_files = []
    tracks = []
    for track_dir in os.listdir(directory):
        track_path = os.path.join(directory, track_dir)
        if os.path.isdir(track_path):
            bass_audio_dir = os.path.join(track_path, 'bass')
            # bass_file = os.path.join(bass_audio_dir, 'bass.wav')
            # residual_file = os.path.join(bass_audio_dir, 'residuals.wav')
            if os.path.isdir(bass_audio_dir):
                for file in os.listdir(bass_audio_dir):
                    if file.startswith('bass') and file.endswith('.wav'):
                        bass_file = os.path.join(bass_audio_dir, file)
                        bass_files.append(bass_file)
                        residual_file = os.path.join(bass_audio_dir, 'residuals' + file[4:])
                        residual_files.append(residual_file)
                        tracks.append(track_dir)
        
    return {"bass": bass_files, "residuals": residual_files, "track": tracks}

In [89]:
# Get the audio filenames from the dataset directory
train_files = get_data_files(os.path.join(data_dir, "train"))
test_files = get_data_files(os.path.join(data_dir, "test"))
# validation_data = load_audio_files(os.path.join(data_dir, "validation"))
train_files

{'bass': ['/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00002/bass/bass.wav',
  '/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00001/bass/bass.wav',
  '/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00003/bass/bass.wav'],
 'residuals': ['/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00002/bass/residuals.wav',
  '/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00001/bass/residuals.wav',
  '/content/drive/Shareddrives/DeepLearningProject/minibabyslakh/train/Track00003/bass/residuals.wav'],
 'track': ['Track00002', 'Track00001', 'Track00003']}

In [100]:
# Create the dataset objects
train_dataset = Dataset.from_dict(train_files, split="train") \
                    .cast_column("bass", Audio()) \
                    .cast_column("residuals", Audio()) \
                    .sort("track")
test_dataset = Dataset.from_dict(test_files, split="test") \
                    .cast_column("bass", Audio()) \
                    .cast_column("residuals", Audio()) \
                    .sort("track")
combined_dataset = concatenate_datasets([train_dataset, test_dataset])

train_dataset

Dataset({
    features: ['bass', 'residuals', 'track'],
    num_rows: 3
})

In [94]:
sampling_rate = train_dataset["residuals"][0]['sampling_rate']
resample_rate = processor.sampling_rate
# make sure the sample_rate aligned
if resample_rate != sampling_rate:
    print(f'setting rate from {sampling_rate} to {resample_rate}')
    resampler = T.Resample(sampling_rate, resample_rate)
else:
    resampler = None

In [95]:
# audio file is decoded on the fly
if resampler is None:
    input_audio = train_dataset[0]["residuals"]["array"]
else:
  input_audio = resampler(torch.from_numpy(train_dataset[0]["residuals"]["array"]))

In [96]:
# The whole audio file is too big to run in colab
input_audio = input_audio[0:93680]

In [97]:
inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

In [98]:
# take a look at the output shape, there are 13 layers of representation
# each layer performs differently in different downstream tasks, you should choose empirically
all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
print(all_layer_hidden_states.shape) # [13 layer, Time steps, 768 feature_dim]

# for utterance level classification tasks, you can simply reduce the representation in time
time_reduced_hidden_states = all_layer_hidden_states.mean(-2)
print(time_reduced_hidden_states.shape) # [13, 768]

# you can even use a learnable weighted average representation
aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
weighted_avg_hidden_states = aggregator(time_reduced_hidden_states.unsqueeze(0)).squeeze()
print(weighted_avg_hidden_states.shape) # [768]

torch.Size([13, 292, 768])
torch.Size([13, 768])
torch.Size([768])


In [99]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.0606,  0.1180,  0.0528,  ..., -0.0023,  0.0381,  0.1199],
         [-0.0716,  0.1062,  0.0683,  ..., -0.0027,  0.0267,  0.1263],
         [-0.0757,  0.1017,  0.0629,  ..., -0.0036,  0.0363,  0.1262],
         ...,
         [-0.0952,  0.1915,  0.0160,  ...,  0.0156,  0.0945,  0.2356],
         [ 0.0911,  0.1736,  0.0869,  ...,  0.0308, -0.0166,  0.3092],
         [ 0.0073,  0.3786,  0.0152,  ..., -0.0048, -0.2137,  0.3118]]]), hidden_states=(tensor([[[-0.1507, -0.4730,  0.0677,  ...,  0.2829, -0.1228, -0.1929],
         [-0.1839, -0.4138,  0.0656,  ...,  0.2732, -0.1123, -0.1129],
         [-0.2044, -0.3500,  0.0509,  ...,  0.2640, -0.1125, -0.0760],
         ...,
         [ 0.5956, -0.1389, -0.1069,  ...,  0.3431, -0.3568,  0.1605],
         [ 0.2148,  0.0763, -0.1094,  ...,  0.2505, -0.2648,  0.1472],
         [-0.0154,  0.3685, -0.1005,  ...,  0.2217, -0.1888, -0.0043]]]), tensor([[[-0.0886, -0.2003,  0.1322,  ...,  0.0872, -0.4373, -0.2