In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("chrisfilo/urbansound8k")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/urbansound8k


In [5]:
!pip install transformers[torch] datasets[audio] audiomentations

Collecting audiomentations
  Downloading audiomentations-0.38.0-py3-none-any.whl.metadata (12 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.38.0-py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.6/82.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.3.1-cp310-c

In [6]:
import pandas as pd
import numpy as np

from datasets import Dataset, Audio, ClassLabel, Features

df = pd.read_csv('/kaggle/input/urbansound8k/UrbanSound8K.csv')

my_classes = df['class'].unique().tolist()

# назначаем для каждого класса соответствующий id
map_class_to_id = {j: i for i, j in enumerate(my_classes)}
map_class_to_id

# Define class labels
class_labels = ClassLabel(names=my_classes)


# df['filenames'] = '/content/for_training/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)
df['filenames'] = '/kaggle/input/urbansound8k/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)


train_df = df[df['fold'] < 8]

targets = train_df['class'].apply(lambda name: map_class_to_id[name]).to_list()

# Define features with audio and label columns
features = Features({
    "audio": Audio(),  # Define the audio feature
    "labels": class_labels  # Assign the class labels
})

# Construct the dataset from a dictionary
dataset = Dataset.from_dict({
    "audio": train_df['filenames'].to_list(),
    "labels": targets,  # Corresponding labels for the audio files
}, features=features)

dataset

Dataset({
    features: ['audio', 'labels'],
    num_rows: 6273
})

In [7]:
# cast target and audio column
dataset = dataset.cast_column("labels", class_labels)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

num_labels = len(my_classes)

Casting the dataset:   0%|          | 0/6273 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification, ASTFeatureExtractor
from datasets import load_dataset
import torch

# Define the pretrained model and instantiate the feature extractor
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)
model_input_name = feature_extractor.model_input_names[0]
SAMPLING_RATE = feature_extractor.sampling_rate

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [9]:
def preprocess_audio(batch):
    wavs = [audio["array"] for audio in batch["input_values"]]
    # inputs are spectrograms as torch.tensors now
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt")

    output_batch = {model_input_name: inputs.get(model_input_name), "labels": list(batch["labels"])}
    return output_batch

# Apply the transformation to the dataset
dataset = dataset.rename_column("audio", "input_values")  # rename audio column
dataset.set_transform(preprocess_audio, output_all_columns=False)

In [10]:
# split training data
if "test" not in dataset:
    dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0, stratify_by_column="labels")

In [11]:
# import evaluate
from transformers import ASTConfig, ASTForAudioClassification, TrainingArguments, Trainer

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)
config.num_labels = len(map_class_to_id)
config.label2id = map_class_to_id
config.id2label = {v: k for k, v in map_class_to_id.items()}

# Initialize the model with the updated configuration
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config, ignore_mismatched_sizes=True)
model.init_weights()

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

# Configure training run with TrainingArguments class
training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="tensorboard",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=10,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size per device
    eval_strategy="epoch",  # Evaluation strategy
    save_strategy="epoch",
    eval_steps=1,
    save_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_strategy="steps",
    logging_steps=20,
)

In [13]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")
f1 = evaluate.load("f1")

AVERAGE = "macro" if config.num_labels > 2 else "binary"

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    predictions = np.argmax(logits, axis=1)
    metrics = accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
    metrics.update(precision.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(recall.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(f1.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    return metrics

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [17]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_cuda = model.to(device)
# dataset = dataset.to(device)

In [18]:
from transformers import Trainer

# Setup the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,  # Use the metrics function from above
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2704,0.147624,0.958566,0.958967,0.959548,0.95866
2,0.2081,0.15961,0.967331,0.971566,0.967198,0.968992
3,0.2083,0.107253,0.979283,0.981859,0.978254,0.979887
4,0.1108,0.143441,0.971315,0.971345,0.972149,0.971595
5,0.0002,0.149317,0.976892,0.979798,0.978749,0.979055
6,0.0037,0.161505,0.972112,0.974521,0.972893,0.973368
7,0.0002,0.154296,0.979283,0.981638,0.979323,0.980387
8,0.0,0.152175,0.98008,0.982466,0.980731,0.981462
9,0.0,0.141493,0.978486,0.980915,0.978654,0.979683
10,0.0,0.141596,0.978486,0.980895,0.978625,0.979673


TrainOutput(global_step=6280, training_loss=0.0745629820690981, metrics={'train_runtime': 9500.126, 'train_samples_per_second': 5.282, 'train_steps_per_second': 0.661, 'total_flos': 3.4015856832282624e+18, 'train_loss': 0.0745629820690981, 'epoch': 10.0})

In [20]:
trainer.evaluate()

{'eval_loss': 0.1521751880645752,
 'eval_accuracy': 0.9800796812749004,
 'eval_precision': 0.9824660849256295,
 'eval_recall': 0.9807312742559834,
 'eval_f1': 0.9814618938095425,
 'eval_runtime': 68.2559,
 'eval_samples_per_second': 18.387,
 'eval_steps_per_second': 2.3,
 'epoch': 10.0}

In [22]:
trainer.save_model("/kaggle/working/finetuned_ast_2024_12_08")

### Inference

In [None]:
model_inference = ASTForAudioClassification.from_pretrained('/kaggle/working/finetuned_ast_2024_12_08')

In [47]:
from IPython.display import Audio

for_test = df.sample()

wav_file_name = for_test['filenames'].iloc[0]
classs_true = for_test['class'].iloc[0]

print(wav_file_name, classs_true)

Audio(wav_file_name)

/kaggle/input/urbansound8k/fold7/168846-5-0-2.wav engine_idling


In [48]:
import librosa
from transformers import ASTFeatureExtractor

feature_extractor = ASTFeatureExtractor()

audio, sr = librosa.load(wav_file_name, sr=16000, mono=True)

# аналогично можно с помощью librosa
# librosa.feature.melspectrogram(y=audio, sr=sr), но надо ещё обернуть в соответствующую структуру
inputs_example = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

# inputs_exapmle = librosa.feature.melspectrogram(y=audio, sr=sr)

with torch.no_grad():
    outputs = model_inference(**inputs_example)

predicted_class_ids = torch.argmax(outputs.logits, dim=-1).item()
predicted_label = model_inference.config.id2label[predicted_class_ids]

predicted_label

'engine_idling'