In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

In [2]:
import os
from tqdm import tqdm
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import datasets
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoFeatureExtractor, ASTForAudioClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/final_proj')

Mounted at /content/drive


In [4]:
# Define the genres and their corresponding folders
genres = ['Hip-Hop', 'Rock', 'Pop', 'Folk', 'Experimental', 'Electronic', 'Instrumental', 'International']
data_path = 'fma_filtered'  # Path to your data folder

# Create a list to store the data
data = []

# Loop over each genre folder
for genre in genres:
    genre_folder = os.path.join(data_path, genre)
    
    # Get the audio files in the genre folder
    audio_files = os.listdir(genre_folder)
    
    # Sample 100 files from each genre
    audio_files = audio_files[:100]
    
    # Add the data to the list
    for audio_file in audio_files:
        audio_path = os.path.join(genre_folder, audio_file)
        data.append({
            'input_values': audio_path,
            'labels': genre
        })

# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df['labels'])

# Save the train and test sets as CSV files
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Load the dataset using the Hugging Face datasets library
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'validation': 'test.csv'}, cache_dir=False)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-af3a53a3e5906a7d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-af3a53a3e5906a7d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Get the train and validation datasets
train_dataset = dataset['train']
eval_dataset = dataset['validation']

In [6]:
# We need to specify the input and output column
input_column = "input_values"
output_column = "labels"

In [7]:
# we need to distinguish the unique labels in our SER dataset
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes:\n {label_list}")

A classification problem with 8 classes:
 ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']


In [8]:
def label_to_id(label, label_list):
    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1
    return label

from datasets import Dataset

# eesha's

def load_preprocessed(example):
    speech_path = os.path.join("preprocessed_tensors_full/", example['input_values'].split("/")[-1] + '.pt')
    if os.path.exists(speech_path):
        speech = torch.load(speech_path).squeeze(0)[:479626] # max 
        example['input_values'] = torch.tensor(speech)  # ensure that 'speech' is a tensor
        example['labels'] = label_to_id(example['labels'], label_list)
        example['file_exists'] = True
    else:
        example['file_exists'] = False
    return example

In [9]:
train_dataset.set_format(type='torch', columns=['input_values', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_values', 'labels'])

In [10]:
def check_file_exists(example):
    speech_path = os.path.join("preprocessed_tensors_full/", example['input_values'].split("/")[-1] + '.pt')
    return os.path.exists(speech_path)

# Find the indices of examples that you want to keep
train_indices = [i for i, example in enumerate(train_dataset) if check_file_exists(example)]
eval_indices = [i for i, example in enumerate(eval_dataset) if check_file_exists(example)]

# Select only the examples with those indices
train_dataset = train_dataset.select(train_indices)
eval_dataset = eval_dataset.select(eval_indices)

In [11]:
# Now apply your map function only on the selected examples
train_dataset = train_dataset.map(load_preprocessed)
eval_dataset = eval_dataset.map(load_preprocessed)

Map:   0%|          | 0/589 [00:00<?, ? examples/s]

  example['input_values'] = torch.tensor(speech)  # ensure that 'speech' is a tensor


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [12]:
train_dataset = train_dataset.remove_columns("file_exists")
eval_dataset = eval_dataset.remove_columns("file_exists")

In [13]:
from transformers import AutoConfig, ASTConfig, ASTModel, ASTFeatureExtractor

model_name_or_path = 'MIT/ast-finetuned-audioset-10-10-0.4593'

feature_extractor = ASTFeatureExtractor.from_pretrained(model_name_or_path)
target_sampling_rate = feature_extractor.sampling_rate

Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [14]:
def preprocess_ast(examples):
  # audio_arrays = [x for x in examples["input_values"]]
  audio_arrays = []
  for x in examples['input_values']:
    np_x = x.numpy() # without this the fn will throw sequence in array typeerror
    audio_arrays.append(np_x)
  inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=target_sampling_rate, 
        truncation=True, 
    )
  return inputs

In [15]:
encoded_train_dataset = train_dataset.map(preprocess_ast, batched=True)
encoded_eval_dataset = eval_dataset.map(preprocess_ast, batched=True)

Map:   0%|          | 0/589 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [16]:
from datasets import load_dataset, load_metric
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [17]:
model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
batch_size = 32

In [19]:
from transformers import ASTForAudioClassification
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# # this does not work. throws dim error for input
# ex = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# num_labels = len(label_list)
ex_2 = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    ignore_mismatched_sizes=True
    # hidden_dropout = 0.1,
    # activation_dropout = 0.1,
    # attention_dropout = 0.1,
    # feat_proj_dropout = 0.0,
    # feat_quantizer_dropout = 0.0,
    # final_dropout = 0.1
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="ast-jl-experiment-2",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5, 
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01)

In [20]:
import numpy as np

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [34]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

trainer = Trainer(
    ex_2, #
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_eval_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [41]:

torch.cuda.empty_cache()

In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.8263,1.633572,0.463415
1,1.1374,1.189197,0.574913
2,0.7523,1.160932,0.609756
4,0.5614,1.1261,0.644599
4,0.235,1.150486,0.644599
5,0.1679,1.185237,0.641115
6,0.0662,1.274876,0.651568
8,0.0253,1.327362,0.662021
8,0.0066,1.433402,0.651568


In [98]:
ex_4 = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    ignore_mismatched_sizes=True
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
ex_4.config

ASTConfig {
  "_name_or_path": "MIT/ast-finetuned-audioset-10-10-0.4593",
  "architectures": [
    "ASTForAudioClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "frequency_stride": 10,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "Electronic",
    "1": "Experimental",
    "2": "Folk",
    "3": "Hip-Hop",
    "4": "Instrumental",
    "5": "International",
    "6": "Pop",
    "7": "Rock"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Electronic": 0,
    "Experimental": 1,
    "Folk": 2,
    "Hip-Hop": 3,
    "Instrumental": 4,
    "International": 5,
    "Pop": 6,
    "Rock": 7
  },
  "layer_norm_eps": 1e-12,
  "max_length": 1024,
  "model_type": "audio-spectrogram-transformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_mel_bins": 128,
  "patch_size": 16,
  "qkv_bias": true,
  "time_stride": 10,
  "torch_dtype": "float32",
  "transformers_version": "4.29.2"
}

In [94]:
args = TrainingArguments(
    output_dir="ast-jl-experiment-5",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5, 
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy")

In [95]:
trainer_5 = Trainer(
    ex_4, #
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_eval_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [84]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [92]:
torch.cuda.empty_cache()

In [96]:
trainer_5.train()


Epoch,Training Loss,Validation Loss,Accuracy
0,1.9672,2.061303,0.236111
1,1.5859,2.030078,0.361111
2,1.3721,1.761064,0.423611
4,1.0973,1.601682,0.451389
4,1.0537,1.802351,0.458333
5,0.8305,1.835195,0.479167
6,0.8501,1.653123,0.506944
8,0.5116,2.060816,0.493056
8,0.3999,2.037217,0.493056
