### Add label

In [None]:
#| eval: false

from datasets import load_dataset, Features, Value, Audio, ClassLabel
import json

# Target schema applied when mapping the CSV rows: the clip path as a string,
# the clip itself decoded as 16 kHz audio, and a binary class label whose
# ids follow the order of `names` (0 = "not found", 1 = "found").
feats = Features(
    {
        "path": Value("string"),
        "audio": Audio(sampling_rate=16_000),
        "label": ClassLabel(names=["not found", "found"]),
    }
)
def _generate_examples(example, tag):
        example['label'] = 1 if example['label'] in tag else 0
        example['audio'] = example['path']
        return example

# Tag metadata: a JSON object whose values each carry a 'tags' entry.
with open('tags_data.json', 'r') as f:
    data = json.load(f)

# One CSV of audio slices per split.
data_files = {
    'train': 'dataset/slices_train.csv',
    'test': 'dataset/slices_test.csv',
    'val': 'dataset/slices_val.csv',
}

# Keys of the tag metadata whose 'tags' entry contains 'chow mein';
# rows labelled with one of these keys become the positive class.
tags_pool = [name for name, meta in data.items() if 'chow mein' in meta['tags']]

# Load the splits, drop the bookkeeping columns, binarise the label and
# attach the audio column, then expose the clip path under the name 'file'.
dataset = (
    load_dataset("csv", data_files=data_files)
    .remove_columns(column_names=['Unnamed: 0', 'split'])
    .map(_generate_examples, fn_kwargs={'tag': tags_pool}, features=feats)
    .rename_column('path', 'file')
)

# Label id <-> name lookups handed to the model config later on.
id2label = {0: 'not found', 1: 'found'}
label2id = {name: idx for idx, name in id2label.items()}

Using custom data configuration default-c58ed15a5d5a3dac
Reusing dataset csv (/home/jovyan/.cache/huggingface/datasets/csv/default-c58ed15a5d5a3dac/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/36993 [00:00<?, ?ex/s]

  0%|          | 0/4648 [00:00<?, ?ex/s]

  0%|          | 0/4586 [00:00<?, ?ex/s]

In [None]:
#| eval: false

# Peek at the first training record to check the mapped schema
# (clip path under 'file', decoded 'audio', integer 'label').
dataset['train'][0]

{'file': '/home/jovyan/.cache/panda/audio_slices/water_0_1655689126-SIP-A90CCE12F2CF-000041b2-chunk3.wav',
 'audio': {'path': '/home/jovyan/.cache/panda/audio_slices/water_0_1655689126-SIP-A90CCE12F2CF-000041b2-chunk3.wav',
  'array': array([ 6.1035156e-05,  3.3569336e-04, -4.2724609e-04, ...,
         -4.8522949e-03,  1.3031006e-02,  2.9037476e-02], dtype=float32),
  'sampling_rate': 16000},
 'label': 0}

In [None]:
#| eval: false

# Class balance of the training split, keyed by label id
# (0 = 'not found', 1 = 'found').
dataset['train'].to_pandas().label.value_counts()

0    35787
1     1206
Name: label, dtype: int64

In [None]:
#| eval: false

def _filter_by_duration(example, duration):
    return len(example['audio']['array']) < duration * example['audio']['sampling_rate']

# Keep only clips strictly shorter than 1 second. Note that `dataset` is
# rebound here, so later cells see the filtered data.
dataset = dataset.filter(_filter_by_duration, fn_kwargs={'duration': 1})


  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
#| eval: false

# Ratio of negative (label 0) to positive (label 1) training examples,
# i.e. how much rarer the positive class is.
class_counts = dataset['train'].to_pandas().label.value_counts()
# Look the counts up by label id rather than by position: value_counts()
# orders rows by frequency, so positional access would silently invert the
# ratio if positives ever outnumbered negatives.
weight_positive_class = class_counts.loc[0] / class_counts.loc[1]
# NOTE(review): no visible cell reads this value; the loss weights in
# CustomTrainer are hard-coded. Confirm whether that is intended.
print(weight_positive_class)

30.22027972027972


In [None]:
#| eval: false

from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric

# Pretrained checkpoint used for both the feature extractor and the classifier.
model_checkpoint = "facebook/wav2vec2-base"
# Per-device batch size, used for training and for evaluation.
batch_size = 32
# Maximum clip length in seconds; preprocess_function truncates to this.
max_duration = 1

# Feature extractor matching the checkpoint; it also supplies the sampling
# rate used during preprocessing.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Run the module-level feature extractor over a batch of clips.

    Args:
        examples: Batch whose "audio" column is a sequence of dicts, each
            holding the samples under "array".

    Returns:
        Whatever `feature_extractor` returns for the batch, with every clip
        truncated to at most `max_duration` seconds at the extractor's
        sampling rate.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
    )

# Convert the clips to model inputs batch by batch. The raw "audio" and
# "file" columns are dropped afterwards, leaving the label plus whatever
# the feature extractor returned.
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio", "file"], batched=True)

# Two classes: 'not found' and 'found'.
num_labels = len(id2label)

# Load the pretrained checkpoint as an audio classifier and record the
# label id <-> name mappings in its config.
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# Last path component of the checkpoint id ("wav2vec2-base"); used to name
# the output directory.
model_name = model_checkpoint.split("/")[-1]

# Training configuration for the fine-tuning run.
args = TrainingArguments(
    # Output directory for checkpoints ("wav2vec2-base-finetuned-ks").
    f"{model_name}-finetuned-ks",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    # 4 accumulation steps x batch_size 32 = 128 examples per optimizer step
    # on a single device.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    warmup_ratio=0.1,
    logging_steps=10,
    # Reload the checkpoint with the best validation "accuracy" when
    # training ends.
    # NOTE(review): the classes are heavily imbalanced, so accuracy may be a
    # weak selection criterion; confirm this is the intended metric.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/preprocessor_config.json from cache at /home/jovyan/.cache/huggingface/transformers/d4583dd9e59eb6295f8fe8b18833ae54d963a122d69aa1df7ecce6caafe18c8f.bc3155ca0bae3a39fc37fc6d64829c6a765f46480894658bb21c08db6155358d
loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temp

  0%|          | 0/36 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity":

In [None]:
#| eval: false

# Accuracy metric object used by compute_metrics during evaluation.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package; confirm against
# the pinned version before upgrading.
metric = load_metric("accuracy")

In [None]:
#| eval: false

import numpy as np
import torch
from torch import nn

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, reduced
            with an argmax over axis 1) and `label_ids` (reference labels).

    Returns:
        The result of `metric.compute` for the predicted class ids against
        the reference labels.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)


class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the logits."""

    # Per-class loss weights, indexed by label id (0 = 'not found', 1 = 'found').
    class_weights = (1.0, 1.35)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model as keyword arguments;
                must contain "labels".
            return_outputs: When true, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: Accepted and ignored, so the override keeps
                working with Trainer versions that pass this keyword.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple when
            `return_outputs` is true.
        """
        labels = inputs.get("labels")
        # Forward pass; labels stay in `inputs`, as in the original code.
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the same device as the logits instead of
        # calling .cuda(), so this also works on CPU and on non-default GPUs.
        weights = torch.tensor(self.class_weights, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        logits_view = logits.view(-1, self.model.config.num_labels)
        loss = loss_fct(logits_view, labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Trainer using the weighted loss: trains on the 'train' split and
# evaluates on 'val'. The feature extractor is passed as `tokenizer`;
# the training log shows it being saved alongside each checkpoint.
trainer = CustomTrainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)


In [None]:
#| eval: false

# Run fine-tuning with the arguments configured above; the cell displays
# the returned TrainOutput summary.
trainer.train()

***** Running training *****
  Num examples = 35708
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 4185


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4112,0.306926,0.882207
2,0.2834,0.227008,0.90473
3,0.2428,0.193498,0.914414
4,0.221,0.16687,0.923874
5,0.2278,0.159021,0.926802
6,0.1976,0.158407,0.931081
7,0.2077,0.141965,0.934685
8,0.1817,0.148205,0.929054
9,0.1728,0.138138,0.936036
10,0.1868,0.141322,0.933333


***** Running Evaluation *****
  Num examples = 4440
  Batch size = 32
Saving model checkpoint to wav2vec2-base-finetuned-ks/checkpoint-279
Configuration saved in wav2vec2-base-finetuned-ks/checkpoint-279/config.json
Model weights saved in wav2vec2-base-finetuned-ks/checkpoint-279/pytorch_model.bin
Feature extractor saved in wav2vec2-base-finetuned-ks/checkpoint-279/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 4440
  Batch size = 32
Saving model checkpoint to wav2vec2-base-finetuned-ks/checkpoint-558
Configuration saved in wav2vec2-base-finetuned-ks/checkpoint-558/config.json
Model weights saved in wav2vec2-base-finetuned-ks/checkpoint-558/pytorch_model.bin
Feature extractor saved in wav2vec2-base-finetuned-ks/checkpoint-558/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 4440
  Batch size = 32
Saving model checkpoint to wav2vec2-base-finetuned-ks/checkpoint-837
Configuration saved in wav2vec2-base-finetuned-ks/checkpoint-837/config.

TrainOutput(global_step=4185, training_loss=0.21789302935594584, metrics={'train_runtime': 8628.3807, 'train_samples_per_second': 62.077, 'train_steps_per_second': 0.485, 'total_flos': 4.3021878417689375e+18, 'train_loss': 0.21789302935594584, 'epoch': 15.0})

In [None]:
#| eval: false

# Clean up cache files belonging to `dataset`. The displayed dict
# presumably reports how many files were removed per split (confirm against
# the `datasets` documentation).
dataset.cleanup_cache_files()

{'train': 5, 'test': 1, 'val': 0, 'validation': 0}

In [None]:
#| eval: false

# Save the model currently held by the trainer. Because
# load_best_model_at_end=True, this is the best checkpoint by validation
# accuracy. The path is derived from the configured output directory
# rather than repeating it as a literal.
trainer.save_model(f"{args.output_dir}/best_checkpoint")

Saving model checkpoint to wav2vec2-base-finetuned-ks/best_checkpoint
Configuration saved in wav2vec2-base-finetuned-ks/best_checkpoint/config.json
Model weights saved in wav2vec2-base-finetuned-ks/best_checkpoint/pytorch_model.bin
Feature extractor saved in wav2vec2-base-finetuned-ks/best_checkpoint/preprocessor_config.json


In [None]:
#| eval: false

# Run the trained model over the held-out test split and display the raw
# prediction output (logits, reference labels, metrics).
inputs = encoded_dataset['test']

with torch.no_grad():
    result = trainer.predict(test_dataset=inputs)
result

***** Running Prediction *****
  Num examples = 4479
  Batch size = 32


PredictionOutput(predictions=array([[ 1.2470611, -1.2712642],
       [ 3.5061896, -3.6580167],
       [ 3.2142575, -3.3506398],
       ...,
       [ 3.4247284, -3.5656137],
       [ 3.067631 , -3.2232652],
       [ 3.5443592, -3.709301 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.1558578908443451, 'test_accuracy': 0.9278856887698147, 'test_runtime': 28.6291, 'test_samples_per_second': 156.449, 'test_steps_per_second': 4.89})

## Chow mein

In [None]:
#| eval: false

# Break test accuracy down by class. Overall accuracy alone hides how well
# the rare positive class is recognised.
true_labels = np.asarray(inputs['label'])  # read the label column once
predicted_labels = np.argmax(result.predictions, axis=-1)
is_correct = predicted_labels == true_labels

positive_hits = list(is_correct[true_labels == 1])
correct_positive = sum(positive_hits)
correct_negative = is_correct[true_labels == 0].sum()
n_positive = int(true_labels.sum())
n_negative = len(true_labels) - n_positive


def _ratio(hits, total):
    """Return hits / total, or NaN when there are no examples to score."""
    return hits / total if total else float('nan')


print(f"Overall accuracy: {_ratio(correct_positive + correct_negative, n_positive + n_negative)}")
print(f"Positive accuracy: {_ratio(correct_positive, n_positive)}")
print(f"Negative accuracy: {_ratio(correct_negative, n_negative)}")

Overall accuracy: 0.9752176825184193
Positive accutacy: 0.7633587786259542
Negative accuracy: 0.9816007359705612


In [None]:
#| eval: false

import shutil
from pathlib import Path

# Copy every test clip that is truly positive and was predicted positive
# into "found/" for manual inspection.
found_dir = Path("found")
# shutil.copy does not create missing directories, so create it up front.
found_dir.mkdir(parents=True, exist_ok=True)

positive_hits_files = [
    f
    for p, t, f in zip(predicted_labels, dataset['test']['label'], dataset['test']['file'])
    if t == 1 and p == t
]
for f in positive_hits_files:
    shutil.copy(f, found_dir / Path(f).name)

In [None]:
#| eval: false

import shutil
from pathlib import Path

# Copy every test clip that is truly positive but was predicted negative
# into "not found/" for manual inspection.
missed_dir = Path("not found")
# shutil.copy does not create missing directories, so create it up front.
missed_dir.mkdir(parents=True, exist_ok=True)

positive_miss_files = [
    f
    for p, t, f in zip(predicted_labels, dataset['test']['label'], dataset['test']['file'])
    if t == 1 and p != t
]
for f in positive_miss_files:
    shutil.copy(f, missed_dir / Path(f).name)