In [1]:
# Load the data
# Loads the "all" configuration of the "marsyas/gtzan" dataset. The result is
# indexed by split name further down (gtzan["train"]).
from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", "all")

In [2]:
# Split the data into train/test (90% / 10%) with a fixed seed so the split is
# reproducible. The result is stored back into `gtzan`, so guard against
# re-running this cell: without the check, a second run would split the
# already-reduced train set again and silently shrink the training data.
if "test" not in gtzan:
    gtzan = gtzan["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [3]:
# Inspect the first training example: its field names and their Python types
example = gtzan["train"][0]  # keys: file (path), audio (dict), genre
field_names = list(example)
field_types = [type(example[name]) for name in field_names]
print("We have", len(example), "features:", "\n", field_names, "\n", field_types)

We have 3 features: 
 ['file', 'audio', 'genre'] 
 [<class 'str'>, <class 'dict'>, <class 'int'>]


In [4]:
# Drill into the nested 'audio' dict of the example
audio = example['audio']  # keys: path, array, sampling_rate
audio_keys = list(audio)
audio_types = [type(audio[key]) for key in audio_keys]
print("We have", len(audio), "features:", "\n", audio_keys, "\n", audio_types)

We have 3 features: 
 ['path', 'array', 'sampling_rate'] 
 [<class 'str'>, <class 'numpy.ndarray'>, <class 'int'>]


In [18]:
# The audio['path'] feature duplicates the example['file'] path:
# compare the two strings directly to confirm they are equal.
example['file'] == audio['path']

True

In [19]:
# We have three features (truly) describing audio:
# the waveform array, sampling rate and the genre label

# NOTE: the triple-quoted text below is a bare string expression, not a comment.
# As the last expression of the cell it is echoed back as the cell's output.
'''
Even sampling rate is more a condition than a feature.
So we really have the waveform array and the label.
It seems like so little to go on. 

--- does their hubert model outperform cosine knn?
--- what's the dimensionality of each array?
'''

'\nEven sampling rate is more a condition than a feature.\nSo we really have the waveform array and the label.\nIt seems like so little to go on. \n\n'

In [None]:
# Next?
# We could view the waveforms and see what patterns emerge for each genre when we overlay them
# We could also find this pattern mathematically:
    # Does the hubert model outperform cosine similarity?

In [5]:
# Translate the integer genre id of the first training example into its name
genre_feature = gtzan["train"].features["genre"]
id2label_fn = genre_feature.int2str
id2label_fn(gtzan["train"][0]["genre"])

'pop'

In [None]:
# Load the feature extractor that belongs to the pretrained checkpoint
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
# Options forwarded to the extractor: normalize inputs and return an attention mask
extractor_kwargs = dict(do_normalize=True, return_attention_mask=True)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_kwargs)

In [None]:
# What is the model's sampling rate?
# Read it from the feature extractor; the value is reused below when the
# dataset's audio column is resampled.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

In [None]:
# Re-cast the audio column so clips are decoded at the model's sampling rate
from datasets import Audio

resampled_audio = Audio(sampling_rate=sampling_rate)
gtzan = gtzan.cast_column("audio", resampled_audio)

In [None]:
# Check our resampling work
# The decoded audio of the first training example is expected to report the
# same rate that was passed to the cast (`sampling_rate`).
gtzan["train"][0]['audio']['sampling_rate']

In [None]:
# Run the feature extractor on a single record and compare the waveform
# statistics before and after
import numpy as np

sample = gtzan["train"][0]["audio"]
waveform = sample["array"]

raw_mean, raw_var = np.mean(waveform), np.var(waveform)
print(f"Mean: {raw_mean:.3}, Variance: {raw_var:.3}")

inputs = feature_extractor(waveform, sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

extracted_values = inputs["input_values"]
print(
    f"Mean: {np.mean(extracted_values):.3}, Variance: {np.var(extracted_values):.3}"
)

In [None]:
# Define the batch preprocessing function that is mapped over the dataset
max_duration = 30.0  # upper bound on clip length, in seconds


def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: batch whose "audio" entry is a sequence of dicts, each
            holding its waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch. Waveforms are truncated
        to at most ``sampling_rate * max_duration`` samples and an attention
        mask is requested.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    target_rate = feature_extractor.sampling_rate
    # Convert the duration cap (seconds) into a sample count for the extractor
    sample_cap = int(target_rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=sample_cap,
        truncation=True,
        return_attention_mask=True,
    )

In [None]:
# Apply the preprocessing to the dataset, 100 records per batch, dropping the
# raw "audio" and "file" columns from the result
map_options = {
    "remove_columns": ["audio", "file"],
    "batched": True,
    "batch_size": 100,
    "num_proc": 1,
}
gtzan_encoded = gtzan.map(preprocess_function, **map_options)
gtzan_encoded

In [None]:
# Rename the genre column for the Trainer class
# NOTE(review): not idempotent. After the first run the "genre" column no
# longer exists, so re-running this cell presumably fails; confirm, and prefer
# Restart & Run All over re-executing it in place.
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

In [None]:
# Build the id -> name and name -> id lookup tables (ids are kept as strings)
label_names = gtzan_encoded["train"].features["label"].names
id2label = {}
for class_index in range(len(label_names)):
    id2label[str(class_index)] = id2label_fn(class_index)
label2id = {name: key for key, name in id2label.items()}

id2label["7"]

In [None]:
# Instantiate the audio classifier from the pretrained checkpoint, configured
# with the label mappings built from the dataset
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

label_config = {
    "num_labels": num_labels,
    "label2id": label2id,
    "id2label": id2label,
}
model = AutoModelForAudioClassification.from_pretrained(model_id, **label_config)

In [None]:
# Obtain HF token to push model checkpoints to hub
# Authentication is needed here because the training arguments below set
# push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Fine-tune the model for genre classification
!pip install --force-reinstall -v "accelerate>=0.20.1"
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True,
)