## Install and Imports

In [1]:
import os
# Ask CUDA to launch kernels synchronously so a device-side error surfaces at the call
# that caused it. This is set in the first cell, before torch is imported further down.
# NOTE(review): this is a debugging aid and is expected to slow GPU work -- consider
# removing it for real training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are unpinned; the recorded run resolved transformers to 4.29.2.
# Consider pinning versions and using `%pip` so the install targets the kernel's environment.
!pip install transformers
!pip install datasets
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

In [3]:
import os
from tqdm import tqdm
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import datasets
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import Wav2Vec2ForCTC, AutoFeatureExtractor, Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification, AdamW, AutoConfig, Wav2Vec2Processor

In [4]:
# Mount google drive to use a persistent directory structure
import os
from google.colab import drive
# Mount Google Drive under /content/drive (force_remount=True is passed on every run).
drive.mount('/content/drive', force_remount=True)
# Every relative path used later (fma_filtered/, preprocessed_tensors_full/, metrics/,
# train.csv, test.csv, the Trainer output_dir) is resolved against this project folder.
os.chdir('/content/drive/MyDrive/deep_learning/final_proj')

Mounted at /content/drive


## Data loading and processing

In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Genres to classify; each one is a sub-folder of `data_path` holding that genre's clips
genres = ['Hip-Hop', 'Rock', 'Pop', 'Folk', 'Experimental', 'Electronic', 'Instrumental', 'International']
data_path = 'fma_filtered'  # Path to your data folder
max_files_per_genre = 400   # Upper bound on the number of clips taken from each genre folder

# One record per audio clip: its path (model input) and its genre name (label)
data = []

for genre in genres:
    genre_folder = os.path.join(data_path, genre)

    # os.listdir() returns entries in arbitrary order, so sort before truncating;
    # otherwise the selected subset (and therefore the split below) can change between runs.
    audio_files = sorted(os.listdir(genre_folder))[:max_files_per_genre]

    data.extend(
        {'input_values': os.path.join(genre_folder, audio_file), 'labels': genre}
        for audio_file in audio_files
    )

# Convert the records to a DataFrame
df = pd.DataFrame(data)

# Stratified 80/20 split with a fixed seed so every genre keeps the same share in both parts
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df['labels'])

# Save both parts as CSV manifests in the working directory
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Load the manifests with the Hugging Face datasets library; test.csv becomes the
# 'validation' split. The previous `cache_dir=False` was dropped: it is not a path, and the
# recorded run shows the default cache directory being used anyway.
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'validation': 'test.csv'})

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-af35a94804cc8375/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-af35a94804cc8375/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
def preprocess_function(examples):
    """Placeholder preprocessing hook: hands each example back untouched."""
    return examples


# Pull both splits out of the loaded dataset and pass each through the hook
train_dataset = dataset['train'].map(preprocess_function)
eval_dataset = dataset['validation'].map(preprocess_function)



In [9]:
# Name of the column carrying the model input and of the column carrying the target
input_column, output_column = "input_values", "labels"

In [10]:
# Collect the distinct genre labels present in the training split; sorting them
# keeps the label -> id order stable from run to run.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes:\n {label_list}")

A classification problem with 8 classes:
 ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']


In [11]:
model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
pooling_mode = "mean"

# Build a classification config for the checkpoint, carrying both directions of the
# label <-> id mapping derived from the sorted label list.
# NOTE(review): `config` is not referenced by the later cells visible here.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# Attach the pooling strategy as an extra attribute on the config object
config.pooling_mode = pooling_mode

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [12]:
# Load the feature-extractor settings published with `model_name_or_path`.
# NOTE(review): the model fine-tuned later is loaded from `model_checkpoint`
# ("facebook/wav2vec2-base"), which is a different checkpoint -- confirm both expect the
# same input settings.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Sampling rate reported by the extractor
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

The target sampling rate: 16000


In [13]:
# Display the training split's columns and row count
train_dataset

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 640
})

In [14]:

def label_to_id(label, label_list):
    """Translate a label into its position within ``label_list``.

    Args:
        label: The label value to look up.
        label_list: Sequence of known labels; a label's id is its index here.

    Returns:
        The index of ``label`` in ``label_list``, or -1 when it is not present.
        When ``label_list`` is empty the label is returned unchanged.
    """
    if len(label_list) == 0:
        return label
    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label: signal it with -1 instead of raising
        return -1

from datasets import Dataset

def load_preprocessed(example, max_samples=479626):
    """Swap an example's audio path for its cached waveform tensor and encode its label.

    The tensor is looked up at ``preprocessed_tensors_full/<file name>.pt``, where
    ``<file name>`` is the last "/"-separated component of ``example['input_values']``.

    Args:
        example: Mapping with 'input_values' (audio path) and 'labels' (genre name).
        max_samples: Number of leading samples to keep from the loaded tensor. Defaults
            to 479626, the value that used to be hard-coded here (presumably about 30 s
            of 16 kHz audio -- TODO confirm against the preprocessing step).

    Returns:
        The same ``example`` mapping, modified in place. When the cached tensor exists,
        'input_values' holds the truncated tensor, 'labels' the integer id produced by
        ``label_to_id`` and 'file_exists' is True. Otherwise only 'file_exists' = False
        is added and the other fields are left untouched.
    """
    speech_path = os.path.join("preprocessed_tensors_full/", example['input_values'].split("/")[-1] + '.pt')
    if not os.path.exists(speech_path):
        example['file_exists'] = False
        return example

    # NOTE(review): torch.load unpickles the file; only point it at tensors written by this project.
    loaded = torch.load(speech_path)
    # as_tensor() reuses an existing tensor instead of copy-constructing it, which avoids the
    # "To copy construct from a tensor ..." UserWarning that torch.tensor(tensor) raised;
    # detach() keeps the result free of autograd history, as the old copy was.
    speech = torch.as_tensor(loaded).detach().squeeze(0)[:max_samples]

    example['input_values'] = speech
    example['labels'] = label_to_id(example['labels'], label_list)
    example['file_exists'] = True
    return example

In [15]:
def check_file_exists(example):
    """Report whether the cached tensor for ``example`` is present on disk.

    The expected location is ``preprocessed_tensors_full/<file name>.pt``, where
    ``<file name>`` is the last "/"-separated component of ``example['input_values']``.

    Returns:
        True when that path exists, False otherwise.
    """
    file_name = example['input_values'].rsplit("/", 1)[-1]
    tensor_path = os.path.join("preprocessed_tensors_full/", f"{file_name}.pt")
    return os.path.exists(tensor_path)

# Row positions in the training split whose cached tensor is present on disk
train_indices = []
for position, row in enumerate(train_dataset):
    if check_file_exists(row):
        train_indices.append(position)

# Same scan for the evaluation split
eval_indices = []
for position, row in enumerate(eval_dataset):
    if check_file_exists(row):
        eval_indices.append(position)

# Keep only the rows found above
train_dataset = train_dataset.select(train_indices)
eval_dataset = eval_dataset.select(eval_indices)

In [16]:
# Earlier attempt, kept for reference (it also tried to drop the 'file_exists' column):
# train_dataset = train_dataset.map(load_preprocessed, remove_columns=["file_exists"])
# eval_dataset = eval_dataset.map(load_preprocessed, remove_columns=["file_exists"])

# Apply load_preprocessed to the rows kept by the previous cell: each audio path is replaced
# by its cached tensor and each genre name by its integer id. The 'file_exists' flag written
# by load_preprocessed remains as an extra column afterwards.
train_dataset = train_dataset.map(load_preprocessed)
eval_dataset = eval_dataset.map(load_preprocessed)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

  example['input_values'] = torch.tensor(speech)  # ensure that 'speech' is a tensor


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [17]:
# Have both splits hand back torch tensors for the two model-facing columns
# (set_format is called for its effect on the dataset; nothing is reassigned).
for split in (train_dataset, eval_dataset):
    split.set_format(type='torch', columns=['input_values', 'labels'])

In [18]:
# train_dataset.save_to_disk("train_dataset.hf")
# eval_dataset.save_to_disk("eval_dataset.hf")

In [19]:
# Display the training split again to check its columns after the tensors were loaded
train_dataset

Dataset({
    features: ['input_values', 'labels', 'file_exists'],
    num_rows: 640
})

In [20]:
# Spot-check one row: the label should now be an integer class id rather than a genre name
train_dataset[120]['labels']

tensor(5)

## Define Model / Train Set-Up

In [21]:
from datasets import load_dataset, load_metric
# Load the accuracy metric.
# NOTE(review): this call emitted a warning in the recorded run; `load_metric` is deprecated
# in `datasets` in favour of the separate `evaluate` library -- plan a migration.
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [22]:
# Pretrained checkpoint that is fine-tuned for genre classification below
model_checkpoint = "facebook/wav2vec2-base"
#model_checkpoint = "facebook/wav2vec2-large"
# NOTE(review): `batch_size` is not referenced by the cells visible here; TrainingArguments
# sets its own per-device batch sizes.
batch_size = 32

In [23]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Dropout-related keyword arguments handed to from_pretrained alongside the label mapping
dropout_overrides = dict(
    hidden_dropout=0.1,
    activation_dropout=0.1,
    attention_dropout=0.1,
    feat_proj_dropout=0.1,
    feat_quantizer_dropout=0.0,
    final_dropout=0.1,
)

# Load the pretrained checkpoint with a classification head sized for the genre labels
num_labels = len(label_list)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(num_labels))),
    id2label=dict(enumerate(label_list)),
    **dropout_overrides,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_hid.weight', 'project_q.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.weight', 'classifi

In [24]:
# class CustomTrainer(Trainer):
#     def log(self, logs: Dict[str, Any], iterator: Optional[tqdm] = None) -> None:
#         if self.state.is_local_process_zero:
#             self._total_flos += self.state.total_flos
#         logs["total_flos"] = self._total_flos

#         if self.epoch_logging is not None:
#             logs["epoch"] = self.epoch_logging

#         # Add training accuracy to logs
#         if 'loss' in logs:
#             predictions, labels = self.predict(self.train_dataset)
#             predictions = np.argmax(predictions, axis=1)
#             train_acc = np.sum(predictions == labels) / labels.shape[0]
#             logs['train_accuracy'] = train_acc

#         self._log(logs)

In [26]:
# Short checkpoint name (text after the last "/").
# NOTE(review): `model_name` is not used by the cells visible here.
model_name = model_checkpoint.split("/")[-1]

# Training configuration. Each optimizer step covers
# per_device_train_batch_size * gradient_accumulation_steps = 6 * 4 = 24 examples per device.
# Evaluation and checkpoint saving both happen once per epoch.
# NOTE(review): no checkpoint limit is set, so a 20-epoch run presumably leaves one
# checkpoint per epoch under output_dir -- check available Drive space.
args = TrainingArguments(
    output_dir="wav2vec2-base-ek-d",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=6,
    num_train_epochs=20,
    warmup_ratio=0.1,
    logging_steps=10,
    # Reload the best checkpoint when training ends, ranked by the "accuracy" metric
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [27]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the integer class id of each example).

    Returns:
        A dict with the single key "accuracy": the fraction of examples whose
        highest-scoring class equals the reference label, as a Python float.
    """
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = np.asarray(eval_pred.label_ids).reshape(-1)
    # Computed directly with NumPy so the function no longer depends on a global metric
    # object obtained through the deprecated datasets.load_metric API.
    return {"accuracy": float(np.mean(predictions == references))}

In [28]:
# from transformers import TrainerCallback
# from copy import deepcopy

# class CustomCallback(TrainerCallback):
    
#     def __init__(self, trainer) -> None:
#         super().__init__()
#         self._trainer = trainer
    
#     def on_epoch_end(self, args, state, control, **kwargs):
#         if control.should_evaluate:
#             control_copy = deepcopy(control)
#             self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
#             return control_copy

In [29]:
# Wire the model, training arguments, both data splits and the metric function into a Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # NOTE(review): the feature extractor is passed in the `tokenizer` slot -- presumably so
    # the Trainer uses it to pad batches and saves it with checkpoints; confirm.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [30]:
# trainer.add_callback(CustomCallback(trainer)) 

## Train and Evaluate

In [None]:
# Run fine-tuning with the Trainer configured above; the table below is its per-epoch log
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,2.076,2.051767,0.3125
1,1.9713,1.917079,0.34375
2,1.8085,1.77727,0.4375
4,1.7708,1.727639,0.40625
4,1.6078,1.66665,0.45
5,1.5335,1.696872,0.39375
6,1.4327,1.602646,0.48125
8,1.4111,1.554166,0.5
8,1.2219,1.507935,0.51875
9,1.1211,1.437137,0.54375


In [None]:
# checkpoint = torch.load("wav2vec2-base-ek-2/checkpoint-1440")
# model.load_state_dict(checkpoint['state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer'])
#     return model, optimizer, checkpoint['epoch']

In [None]:
# Save the model held by the trainer into the same folder that was given as output_dir
trainer.save_model("wav2vec2-base-ek-d")

In [None]:
# Evaluate on the evaluation split.
# NOTE(review): the next cell calls trainer.evaluate() again and is the one that saves the
# result; `evaluation_metrics` is not used by the cells visible here.
evaluation_metrics = trainer.evaluate()

In [None]:
import json
from sklearn.metrics import confusion_matrix
import numpy as np

# Score the model currently held by the trainer on the evaluation split
eval_metrics = trainer.evaluate()

# The output folder may not exist yet in a fresh project directory; create it rather than
# failing with FileNotFoundError when the file is opened.
os.makedirs("metrics", exist_ok=True)

# Save evaluation metrics as JSON
with open("metrics/evaluation_metrics_base_d.json", "w") as outfile:
    json.dump(eval_metrics, outfile)

In [None]:
# Run inference on the evaluation split and reduce each row of scores to its top class
prediction_output = trainer.predict(eval_dataset)
predictions = np.argmax(prediction_output.predictions, axis=1)
# Reference class ids, read from the dataset's label column
labels = eval_dataset["labels"]

In [None]:
# Make sure the output folder exists before writing into it, so this cell does not fail
# with FileNotFoundError in a fresh project directory.
os.makedirs("metrics", exist_ok=True)

# Print the per-class precision/recall/F1 report and keep a copy on disk
report = classification_report(labels, predictions)
print(report)
with open('metrics/classification_report_base_d.txt', 'w') as f:
    f.write(report)

# Compute the confusion matrix
cm = confusion_matrix(labels, predictions)
# Save it as CSV. The entries are integer counts, so fmt="%d" writes them as plain integers
# instead of savetxt's default scientific notation.
np.savetxt("metrics/confusion_matrix_base_d.csv", cm, delimiter=",", fmt="%d")