# Setup

First, we need to import required libraries and functions.

In [None]:
!pip install tensorflow

In [2]:
import tensorflow as tf

2023-07-04 12:50:17.213337: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-04 12:50:17.215730: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 12:50:17.266746: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 12:50:17.267531: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
tf.keras.utils.set_random_seed(42)

In [4]:
# Render plots directly below the code cell that creates them
%matplotlib inline
import json  # for working with json files
import sys  # Python system library needed to load custom functions
import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science
import torch  # library to work with PyTorch tensors and to figure out if we have a GPU available
import os     # for changing the directory

from tensorflow import keras
from tensorflow.keras import layers

from datasets import load_dataset, Audio  # required tools to create, load and process our audio dataset
from transformers import ASTFeatureExtractor, ASTForAudioClassification, TrainingArguments, Trainer  # required classes to perform the model training

sys.path.append('../src')  # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from gdsc_utils import download_directory, PROJECT_DIR # function to download the needed files from the official GDSC s3 bucket and our root directory
from config import DEFAULT_BUCKET  # S3 bucket with the GDSC data
from preprocessing import calculate_stats, preprocess_audio_arrays  # functions to calculate dataset statistics and preprocess the dataset with ASTFeatureExtractor
from gdsc_eval import make_predictions, compute_metrics  # functions to create predictions and evaluate them
os.chdir(PROJECT_DIR) # changing our directory to root

In [None]:
#os.chdir('..')

In [5]:
# Maximum duration of an input audio clip fed to the Wav2Vec 2.0 model
# (in seconds, since it is multiplied by a per-second sampling rate below).
MAX_DURATION = 2
# Sampling rate: the number of audio samples recorded every second.
# NOTE(review): preprocess_function passes feature_extractor.sampling_rate to the
# feature extractor rather than this constant -- confirm that the clips' actual
# sampling rate matches what the checkpoint expects.
SAMPLING_RATE = 22050
BATCH_SIZE = 16  # Batch size for training and evaluating our model.
NUM_CLASSES = 66  # Number of classes in our dataset (66 in our case).
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio, in samples.
# Number of Wav2Vec 2.0 output frames per input clip; the encoder emits roughly
# one frame per 20 ms of audio.
# NOTE(review): presumably the frame count produced for MAX_SEQ_LENGTH samples --
# re-check this value whenever MAX_DURATION or SAMPLING_RATE changes.
MAX_FRAMES = 137
MAX_EPOCHS = 4  # Maximum number of training epochs.
MODEL_CHECKPOINT = "facebook/wav2vec2-base"  # Name of the pretrained model on the Hugging Face Model Hub.

## Downloading data

In [None]:
download_directory('data/', 'dataWav2Vec', DEFAULT_BUCKET) 

## Creating the datasets

In [6]:
print(os.getcwd())

/root/data/experiments


In [7]:
# paths for the train and validation datasets
train_path = 'dataWav2Vec/data/train'

In [8]:
f"{train_path}/metadata.csv"

'dataWav2Vec/data/train/metadata.csv'

In [9]:
train_meta_df = pd.read_csv(f"{train_path}/metadata.csv")

In [10]:
speech_commands_v1 = load_dataset("audiofolder", data_dir=train_path)

Resolving data files:   0%|          | 0/1753 [00:00<?, ?it/s]

Found cached dataset audiofolder (/root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
speech_commands_v1

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1752
    })
})

In [12]:
speech_commands_v1 = speech_commands_v1.class_encode_column("label")

Loading cached processed dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-6e1d530764242002.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-bfcd8af5980748f1.arrow


In [13]:
speech_commands_v1 = speech_commands_v1["train"].train_test_split(
    train_size=0.8, test_size=0.2, stratify_by_column="label"
)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-4c8cda4bf41931c4.arrow and /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-4d77d141314252cf.arrow


In [14]:
# speech_commands_v1 = speech_commands_v1.filter(
#     lambda x: x["label"]
#     != (
#         speech_commands_v1["train"].features["label"].names.index("_unknown_")
#         and speech_commands_v1["train"].features["label"].names.index("_silence_")
#     )
# )

In [15]:
# Keep only as many rows per split as fill whole batches of BATCH_SIZE;
# the trailing remainder of each split is dropped.
for split_name in ("train", "test"):
    split = speech_commands_v1[split_name]
    usable_rows = (len(split) // BATCH_SIZE) * BATCH_SIZE
    speech_commands_v1[split_name] = split.select(list(range(usable_rows)))

print(speech_commands_v1)

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1392
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 336
    })
})


In [16]:
# Class names in the order of their encoded integer ids.
labels = speech_commands_v1["train"].features["label"].names
# Two-way lookup between class name and its encoded id (ids are stored as strings).
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

print(id2label)

{'0': '0', '1': '1', '2': '10', '3': '11', '4': '12', '5': '13', '6': '14', '7': '15', '8': '16', '9': '17', '10': '18', '11': '19', '12': '2', '13': '20', '14': '21', '15': '22', '16': '23', '17': '24', '18': '25', '19': '26', '20': '27', '21': '28', '22': '29', '23': '3', '24': '30', '25': '31', '26': '32', '27': '33', '28': '34', '29': '35', '30': '36', '31': '37', '32': '38', '33': '39', '34': '4', '35': '40', '36': '41', '37': '42', '38': '43', '39': '44', '40': '45', '41': '46', '42': '47', '43': '48', '44': '49', '45': '5', '46': '50', '47': '51', '48': '52', '49': '53', '50': '54', '51': '55', '52': '56', '53': '57', '54': '58', '55': '59', '56': '6', '57': '60', '58': '61', '59': '62', '60': '63', '61': '64', '62': '65', '63': '7', '64': '8', '65': '9'}


In [17]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT, return_attention_mask=True
)



In [18]:
def preprocess_function(examples):
    """Turn a batch of decoded audio clips into fixed-length model inputs.

    Args:
        examples: A batch from the dataset; ``examples["audio"]`` is a list of
            items that each expose the raw waveform under the ``"array"`` key.

    Returns:
        The feature extractor's output for the batch. Every clip is truncated
        or padded to exactly ``MAX_SEQ_LENGTH`` samples.
    """
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        # NOTE(review): this states the extractor's own expected rate instead of
        # the clips' real rate, which bypasses the extractor's sampling-rate
        # check -- confirm the audio is actually at this rate (or resample it).
        sampling_rate=feature_extractor.sampling_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        # Pad to max_length rather than to the longest clip in the batch:
        # otherwise a batch containing only short clips would come out narrower
        # than MAX_SEQ_LENGTH and no longer match the model's fixed input shape.
        padding="max_length",
    )
    return inputs

In [19]:
# This line will pre-process our speech_commands_v1 dataset with preprocess_function,
# in batches of 32 examples. We also remove the "audio" column, as it will be of no
# use to us while training.
processed_speech_commands_v1 = speech_commands_v1.map(
    preprocess_function, remove_columns=["audio"], batched=True, batch_size=32
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-0f82b4ee1531aa00.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-f6fa5ce4384abf96.arrow


In [20]:
processed_speech_commands_v1

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 1392
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 336
    })
})

In [21]:
# Shuffle each split with a fixed seed and materialise it fully as a dict of numpy arrays.
train, test = (
    processed_speech_commands_v1[split_name]
    .shuffle(seed=42)
    .with_format("numpy")[:]
    for split_name in ("train", "test")
)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-66d7d2e3d39b95bf.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/audiofolder/default-d2f57ec63a146101/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-4bae70baa666c85b.arrow


In [22]:
#train['label'] = tf.one_hot(train['label'], 66)
#test['label'] = tf.one_hot(test['label'], 66)

In [23]:
from transformers import TFWav2Vec2Model

In [24]:
def mean_pool(hidden_states, feature_lengths):
    """Average the encoder output over the valid (non-padded) frames of each clip.

    Args:
        hidden_states: Float tensor indexed as (batch, frames, hidden).
        feature_lengths: Integer tensor with one entry per batch element giving
            the number of valid leading frames for that element.

    Returns:
        Tensor of shape (batch, hidden): the sum of the valid frames divided by
        the number of valid frames.
    """
    # Take the frame count from the tensor itself so the function works for any
    # batch size and sequence length instead of relying on module-level constants.
    num_frames = tf.shape(hidden_states)[1]
    # True for the first `feature_lengths[i]` frames of row i, False for padding.
    valid_frames = tf.sequence_mask(feature_lengths, maxlen=num_frames)
    # Zero out padded frames; the (batch, frames, 1) mask broadcasts over `hidden`.
    masked_states = tf.where(
        tf.expand_dims(valid_frames, -1),
        hidden_states,
        tf.zeros_like(hidden_states),
    )
    # Number of valid frames per row, kept as a column so it divides row-wise.
    frame_counts = tf.math.reduce_sum(
        tf.cast(valid_frames, hidden_states.dtype), axis=1, keepdims=True
    )
    pooled_state = tf.math.reduce_sum(masked_states, axis=1) / frame_counts
    return pooled_state


class TFWav2Vec2ForAudioClassification(layers.Layer):
    """Pretrained Wav2Vec 2.0 encoder with a mean-pooling classification head.

    The encoder output is mean-pooled over time, passed through dropout and
    mapped to ``num_classes`` softmax probabilities by a dense layer.
    """

    def __init__(self, model_checkpoint, num_classes):
        """Load the pretrained encoder and create the classification head.

        Args:
            model_checkpoint: Checkpoint name or path passed to
                ``TFWav2Vec2Model.from_pretrained`` (loaded from PyTorch weights).
            num_classes: Number of output classes of the final dense layer.
        """
        super().__init__()
        # Instantiate the Wav2Vec 2.0 model without the Classification-Head
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        # Fallback pooling over all frames, used when no attention mask is given
        self.pooling = layers.GlobalAveragePooling1D()
        # Drop-out layer before the final Classification-Head
        self.intermediate_layer_dropout = layers.Dropout(0.5)
        # Classification-Head (outputs probabilities, not logits)
        self.final_layer = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Compute class probabilities for a batch of audio inputs.

        Args:
            inputs: Mapping with key ``"input_values"`` (the audio samples) and,
                optionally, ``"attention_mask"`` (1 for real samples, 0 for padding).

        Returns:
            The softmax output of the final dense layer.
        """
        # We take only the first output in the returned dictionary corresponding to the
        # output of the last layer of Wav2vec 2.0
        hidden_states = self.wav2vec2(inputs["input_values"])[0]

        # A missing key is treated like a non-tensor mask, so the fallback branch
        # below is reachable instead of raising KeyError.
        attention_mask = inputs.get("attention_mask")

        # If attention mask does exist then mean-pool only un-masked output frames
        if tf.is_tensor(attention_mask):
            # Get the length of each audio input by summing up the attention_mask
            # (attention_mask = (BATCH_SIZE x MAX_SEQ_LENGTH) ∈ {1,0})
            audio_lengths = tf.math.reduce_sum(attention_mask, axis=-1)
            # Get the number of Wav2Vec 2.0 output frames for each corresponding audio input
            # length
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths)
        # If attention mask does not exist then mean-pool over all output frames
        else:
            pooled_state = self.pooling(hidden_states)

        intermediate_state = self.intermediate_layer_dropout(pooled_state)
        final_state = self.final_layer(intermediate_state)

        return final_state


In [25]:
!pip install tensorflow-addons

[0m

In [26]:
import tensorflow_addons as tfa 


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
from tensorflow.keras import backend as K

def f1(y_true, y_pred):
    """Batch-level F1 score computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    ``y_true * y_pred`` is an element-wise product, so the two arguments must
    have compatible shapes.
    NOTE(review): presumably intended for one-hot ``y_true`` -- confirm before
    using it with integer class labels.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        Scalar tensor ``2 * precision * recall / (precision + recall)``, with
        ``K.epsilon()`` added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    # Entries that are positive in both the labels and the rounded predictions.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    # Entries that are positive in the labels / in the rounded predictions.
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (actual_positives + eps)

    return 2 * ((precision * recall) / (precision + recall + eps))

In [None]:
#f1 = tfa.metrics.F1Score(num_classes=66, average='macro')

In [29]:
def build_model():
    """Assemble and compile the Wav2Vec 2.0 audio classification model.

    Returns:
        A compiled ``tf.keras.Model`` that takes a dict with ``"input_values"``
        and ``"attention_mask"`` (each of length ``MAX_SEQ_LENGTH``) and outputs
        ``NUM_CLASSES`` softmax probabilities.
    """
    # Symbolic inputs: raw waveform samples and the matching padding mask
    waveform_input = tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32")
    mask_input = tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="int32")
    model_inputs = {"input_values": waveform_input, "attention_mask": mask_input}

    # Pre-trained encoder plus classification head, applied to the inputs
    classifier = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT, NUM_CLASSES)
    class_probabilities = classifier(model_inputs)
    classification_model = tf.keras.Model(model_inputs, class_probabilities)

    # Integer labels against softmax probabilities (hence from_logits=False);
    # Adam with a staircase exponential learning-rate decay starting at 2e-5.
    classification_model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=keras.optimizers.Adam(
            learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
                2e-5,
                decay_steps=100000,
                decay_rate=0.96,
                staircase=True,
            )
        ),
        metrics=["sparse_categorical_accuracy"],
    )
    return classification_model


model = build_model()


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['project_hid.bias', 'project_q.bias', 'project_q.weight', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFWav2Vec2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the che

In [30]:
# Model inputs only: every column of each split except the target "label".
train_x = {column: values for column, values in train.items() if column != "label"}
test_x = {column: values for column, values in test.items() if column != "label"}

In [31]:
# Train for MAX_EPOCHS epochs in batches of BATCH_SIZE, using the test split as
# validation data; the value returned by fit() is kept in model_history.
model_history = model.fit(
    train_x,
    train["label"],
    validation_data=(test_x, test["label"]),
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
)

Epoch 1/4
Epoch 2/4
 3/87 [>.............................] - ETA: 1:23:44 - loss: 3.8902 - sparse_categorical_accuracy: 0.1250