In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os  

In [4]:
os.getcwd() 

'/root/data/experiments/exp_beats'

In [5]:
import sys                                                                             # Python system library needed to load custom functions
import numpy as np                                                                     # for performing calculations on numerical arrays
import pandas as pd                                                                    # home of the DataFrame construct, _the_ most important object for Data Science
import seaborn as sns                                                                  # additional plotting library
import matplotlib.pyplot as plt                                                        # allows creation of insightful plots
import os                                                                              # for changing the directory

import sagemaker                                                                       # dedicated sagemaker library to execute training jobs
import boto3                                                                           # for interacting with S3 buckets

from sagemaker.huggingface import HuggingFace                                           # for executing the trainig jobs
from sklearn.metrics import precision_recall_fscore_support, accuracy_score             # tools to understand how our model is performing

#sys.path.append('')                                                               # Add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from config import DEFAULT_BUCKET, DEFAULT_REGION  
from gdsc_utils import create_encrypted_bucket, download_and_extract_model, PROJECT_DIR # functions to create S3 buckets and to help with downloading models. Importing our root directory
from gdsc_eval import plot_confusion_matrix                                             # function for creating confusion matrix                                     # importing the bucket name that contains data for the challenge and the default region
os.chdir(PROJECT_DIR)                                                                   # changing our directory to root

In [6]:
import logging                                                    # module for displaying relevant information in the logs
import sys                                                        # to access to some variables used or maintained by the interpreter 
import argparse                                                   # to parse arguments from passed in the hyperparameters
import os                                                         # to manage environmental variables
import json                                                       # to open the json file with labels
from transformers import (                                        # required classes to perform the model training and implement early stopping
    ASTFeatureExtractor, 
    ASTForAudioClassification, 
    Trainer, 
    TrainingArguments, 
    EarlyStoppingCallback
)                                    
import torch                                                       # library to work with PyTorch tensors and to figure out if we have a GPU available
from datasets import load_dataset, Audio, Dataset                  # required tools to create, load and process our audio dataset
import pandas as pd                                                # home of the DataFrame construct, _the_ most important object for Data Science
from preprocessing import preprocess_audio_arrays                  # functions to preprocess the dataset with ASTFeatureExtractor
from gdsc_eval import compute_metrics, make_predictions            # functions to create predictions and evaluate them
from typing import Optional                                        # for type hints


In [7]:
# Use the first CUDA GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
# Hyperparameters reach a training script as command-line flags. When this cell runs inside the
# notebook no such flags are supplied, so every option below resolves to its default.
# Each row is (flag, type, default); a default of None is what argparse uses when none is given.
_ARGUMENT_TABLE = (
    # --- preprocessing hyperparameters ---
    ("--sampling_rate", int, 16000),                      # sampling rate to which audio files are cast (earlier value: 22050)
    ("--fe_batch_size", int, 32),                         # feature extractor batch size
    ("--train_dataset_mean", float, -8.141991150530815),  # spectrogram mean (earlier value: -8.076275929131292)
    ("--train_dataset_std", float, 4.095692486358449),    # spectrogram standard deviation (earlier value: 3.984092920341275)
    # --- training hyperparameters ---
    ("--model_name", str, None),                          # name of the pretrained model from HuggingFace
    ("--learning_rate", float, 2e-5),                     # learning rate
    ("--epochs", int, 4),                                 # number of training epochs
    ("--train_batch_size", int, 4),                       # training batch size
    ("--eval_batch_size", int, 64),                       # evaluation batch size
    ("--patience", int, 2),                               # early stopping: epochs without improvement before stopping
    ("--train_dir", str, "train"),                        # folder name with training data
    ("--val_dir", str, "val"),                            # folder name with validation data
    ("--test_dir", str, "test"),                          # folder name with test data
)
# Options that only make sense on a SageMaker instance are disabled in this notebook:
#   --data_channel (default os.environ["SM_CHANNEL_DATA"]): directory where input data from S3 is stored
#   --output_dir   (default os.environ['SM_MODEL_DIR']):    output directory saved to the S3 bucket

parser = argparse.ArgumentParser()
for _flag, _type, _default in _ARGUMENT_TABLE:
    parser.add_argument(_flag, type=_type, default=_default)

# parse_known_args tolerates the extra flags the notebook kernel itself is started with
args, _ = parser.parse_known_args()

# On the training instance the paths were derived from the data channel instead:
#   train_path = f"{args.data_channel}/{args.train_dir}"
#   val_path   = f"{args.data_channel}/{args.val_dir}"
#   test_path  = f"{args.data_channel}/{args.test_dir}"
# A reduced dataset was also used at some point: 'data/data_small/train' and 'data/data_small/val'
# (example file: experiments/data/data_small/train/Achetadomesticus_XC751747-dat009-001_edit1.wav)
train_path = '../data/train'
val_path = '../data/val'

# Route INFO-level (and above) log records to stdout so they show up in the cell output
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [9]:
def get_feature_extractor(model_name: str, 
                          train_dataset_mean: Optional[float] = None, 
                          train_dataset_std: Optional[float] = None) -> ASTFeatureExtractor:
    """
    Loads the ASTFeatureExtractor that belongs to a pre-trained model.

    When both normalisation statistics are supplied they override the values stored with the
    pre-trained extractor (and ``max_length`` is set to 1024); otherwise the extractor is loaded
    unchanged and its own statistics are logged.

    Args:
        model_name (str): The name of the pre-trained model to use.
        train_dataset_mean (float, optional): The mean value of the training dataset. Defaults to None.
        train_dataset_std (float, optional): The standard deviation of the training dataset. Defaults to None.

    Returns:
        ASTFeatureExtractor: An instance of the ASTFeatureExtractor class.

    """
    # Compare against None explicitly: a plain truthiness test would treat a perfectly valid
    # statistic of 0.0 as "not provided" and silently fall back to the defaults.
    has_custom_stats = train_dataset_mean is not None and train_dataset_std is not None

    if has_custom_stats:
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name, mean=train_dataset_mean, std=train_dataset_std, max_length=1024)
        logger.info(f" feature extractor loaded with dataset mean: {train_dataset_mean} and standard deviation: {train_dataset_std}")
    else:
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
        logger.info(" at least one of the optional arguments (mean, std) is missing")
        logger.info(f" feature extractor loaded with default dataset mean: {feature_extractor.mean} and standard deviation: {feature_extractor.std}")
        
    return feature_extractor

def preprocess_data_for_training(
    dataset_path: str,
    sampling_rate: int,
    # feature_extractor: ASTFeatureExtractor,
    fe_batch_size: int,
    dataset_name: str,
    shuffle: bool = False,
    extract_file_name: bool = True) -> Dataset:
    """
    Loads an audio dataset from disk and casts its audio column to a common sampling rate.

    Feature extraction is currently disabled (see the commented-out block below), so the
    returned dataset still holds raw audio rather than encoded model inputs.

    Args:
        dataset_path (str): The path to the dataset.
        sampling_rate (int): The desired sampling rate for the audio.
        fe_batch_size (int): The batch size for feature extraction. Unused while feature
            extraction is disabled; kept so existing callers keep working.
        dataset_name (str): Name used in log messages only.
        shuffle (bool, optional): Whether to shuffle the dataset (fixed seed 42). Defaults to False.
        extract_file_name (bool, optional): Whether to add a "file_name" column holding the base
            name of every ".wav" file. Defaults to True.

    Returns:
        Dataset: The loaded dataset with its "audio" column cast to ``sampling_rate``.

    """
    dataset = load_dataset("audiofolder", data_dir=dataset_path).get('train') # loading the dataset

    if extract_file_name:
        # Attach the names BEFORE shuffling: the list below is built independently of the rows,
        # so adding it to an already shuffled dataset would pair rows with the wrong file names.
        # Assumes the checksum keys are listed in the same order as the unshuffled rows - TODO confirm.
        checksum_paths = dataset.info.download_checksums.keys()
        file_names = [path.split('/')[-1] for path in checksum_paths if path.endswith(".wav")]  # skip metadata files
        dataset = dataset.add_column("file_name", file_names)

    # perform shuffle if specified
    if shuffle:
        dataset = dataset.shuffle(seed=42)
        
    logger.info(f" loaded {dataset_name} dataset length is: {len(dataset)}")

    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
    
    logger.info(f" {dataset_name} dataset sampling rate casted to: {sampling_rate}")

    # Feature extraction is switched off; re-enabling it also requires the feature_extractor argument.
    # dataset_encoded = dataset.map(
    #     lambda x: preprocess_audio_arrays(x, 'audio', 'array', feature_extractor),
    #     remove_columns="audio",
    #     batched=True,
    #     batch_size=fe_batch_size
    # )
    
    logger.info(f" done extracting features for {dataset_name} dataset")
    
    
    # return dataset_encoded
    return dataset

In [10]:
# with open(f'data/data_small/labels.json', 'r') as f:
# Read the label-name -> numeric-id table shipped with the data
with open('../data/labels.json', 'r') as f:
    labels = json.load(f)

# Two lookup tables built from the same pairs; ids are stored as strings in both directions
label2id = {name: str(idx) for name, idx in labels.items()}
id2label = {str(idx): name for name, idx in labels.items()}

num_labels = len(label2id)  # number of distinct labels


In [11]:
# feature_extractor = get_feature_extractor(args.model_name, args.train_dataset_mean, args.train_dataset_std)

# Build the train and validation datasets. Both calls take the sampling rate and the
# feature-extractor batch size from the parsed arguments.
train_dataset_encoded = preprocess_data_for_training(
    dataset_path=train_path,
    sampling_rate=args.sampling_rate,
    fe_batch_size=args.fe_batch_size,
    dataset_name="train",
    shuffle=True,
    extract_file_name=False,
)

val_dataset_encoded = preprocess_data_for_training(
    dataset_path=val_path,
    sampling_rate=args.sampling_rate,
    fe_batch_size=args.fe_batch_size,
    dataset_name="validation",
)

# Download model from model hub
# model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)


Resolving data files:   0%|          | 0/1753 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-12 10:45:48,098 - __main__ - INFO -  loaded train dataset length is: 1752
2023-07-12 10:45:48,108 - __main__ - INFO -  train dataset sampling rate casted to: 16000
2023-07-12 10:45:48,109 - __main__ - INFO -  done extracting features for train dataset


Resolving data files:   0%|          | 0/580 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-12 10:45:48,897 - __main__ - INFO -  loaded validation dataset length is: 579
2023-07-12 10:45:48,941 - __main__ - INFO -  validation dataset sampling rate casted to: 16000
2023-07-12 10:45:48,942 - __main__ - INFO -  done extracting features for validation dataset


In [102]:
# Clip durations in seconds, grouped by class label.
# The divisor is the rate the dataset was cast to (args.sampling_rate) rather than a literal, so the
# durations stay correct if the sampling rate is changed. Appending in place avoids rebuilding the
# per-label list on every clip.
lengts = {}
for example in train_dataset_encoded:
    duration_seconds = len(example['audio']['array']) / args.sampling_rate
    lengts.setdefault(example['label'], []).append(duration_seconds)

In [103]:
# Per-label summary of the clip durations collected in `lengts`
information2 = {
    label: {
        'mean': np.mean(durations),
        'std': np.std(durations),
        'num_of_files': len(durations),
        'sum_len': np.sum(durations),
    }
    for label, durations in lengts.items()
}

In [107]:
import json

# Persist the per-label statistics as JSON in the current working directory
serialized_information = json.dumps(information2)
with open('data.json', 'w') as fp:
    fp.write(serialized_information)

In [None]:
pd.DataFrame(information2).transpose().sort_index().head(10)

Unnamed: 0,mean,std,num_of_files,sum_len
0,108.096906,116.452462,20.0,2161.938125
1,10.771442,4.701622,14.0,150.800188
2,6.626821,2.388108,7.0,46.38775
3,9.828785,7.101325,9.0,88.459063
4,20.998556,20.95596,9.0,188.987
5,33.038416,76.682292,26.0,858.998812
6,27.933693,34.120369,12.0,335.204313
7,247.166875,288.70616,7.0,1730.168125
8,35.185845,32.650929,31.0,1090.761188
9,14.453971,7.984019,80.0,1156.317687


### Model and dataset initialization

In [12]:
import torch.nn as nn
from BEATs import BEATs, BEATsConfig
# Load the fine-tuned BEATs checkpoint. map_location='cpu' lets a checkpoint that was saved from a
# GPU be read on a machine without one; the weights are copied into the freshly built model below.
checkpoint = torch.load('exp_beats/BEATs_iter3_finetuned_on_AS2M_cpt1.pt', map_location='cpu')

cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])

# Swap the classification head: new dropout plus a linear layer with 66 outputs
# (the logged checkpoint config reports 'predictor_class': 527 for the original head).
BEATs_model.predictor_dropout = nn.Dropout(p=0.2, inplace=False)
BEATs_model.predictor = nn.Linear(in_features=768, out_features=66, bias=True)

2023-07-12 10:46:22,314 - BEATs - INFO - BEATs Config: {'input_patch_size': 16, 'embed_dim': 512, 'conv_bias': False, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': 'gelu', 'layer_wise_gradient_decay_ratio': 0.6, 'layer_norm_first': False, 'deep_norm': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.0, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance': 800, 'gru_rel_pos': True, 'finetuned_model': True, 'predictor_dropout': 0.0, 'predictor_class': 527}


In [17]:
BEATs_model.encoder

TransformerEncoder(
  (pos_conv): Sequential(
    (0): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
    (1): SamePad()
    (2): GELU(approximate='none')
  )
  (layers): ModuleList(
    (0): TransformerSentenceEncoderLayer(
      (self_attn): MultiheadAttention(
        (dropout_module): Dropout(p=0.0, inplace=False)
        (relative_attention_bias): Embedding(320, 12)
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (grep_linear): Linear(in_features=64, out_features=8, bias=True)
      )
      (dropout1): Dropout(p=0.0, inplace=False)
      (dropout2): Dropout(p=0.0, inplace=False)
      (dropout3): Dropout(p=0.0, inplace=False)
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  

In [21]:
import torch.nn as nn
from BEATs import BEATs, BEATsConfig
import pickle

class BEATsPretrain(nn.Module):
    """
    Wraps a BEATs model restored from a checkpoint and replaces its classification head.

    Args:
        num_class (int, optional): Number of outputs of the new ``predictor`` layer. Defaults to 66.
        checkpoint_path (str, optional): Path of the BEATs checkpoint to restore. The file must
            contain the keys 'cfg' and 'model'.
    """
    def __init__(self, num_class=66, checkpoint_path='exp_beats/BEATs_iter3_finetuned_on_AS2M_cpt1.pt'):
        super(BEATsPretrain, self).__init__()
        
        # map_location='cpu' allows a checkpoint saved from a GPU to be read on a CPU-only host;
        # callers move the whole module to the target device afterwards with .to(device).
        checkpoint = torch.load(checkpoint_path, map_location='cpu')

        cfg = BEATsConfig(checkpoint['cfg'])
        BEATs_model = BEATs(cfg)
        BEATs_model.load_state_dict(checkpoint['model'])
        
        # New head sized by num_class (previously the argument was accepted but ignored)
        BEATs_model.predictor_dropout = nn.Dropout(p=0.2, inplace=False)
        BEATs_model.predictor = nn.Linear(in_features=768, out_features=num_class, bias=True)
        # NOTE(review): predictor2 is not referenced by forward() in this class; whether BEATs
        # itself uses it is not visible here - confirm before removing.
        BEATs_model.predictor2 = nn.Linear(in_features=num_class, out_features=10, bias=True)
        self.model = BEATs_model
        
    def forward(self, x):
        """Runs the wrapped model on ``x`` and returns the first element of its result."""
        # All-False mask with the shape of x, created directly on the device x lives on so the
        # module does not depend on a module-level `device` variable.
        padding_mask = torch.zeros(x.shape, dtype=torch.bool, device=x.device)
        out = self.model.extract_features(x, padding_mask=padding_mask)[0]
        return out


In [22]:
test = BEATsPretrain()

2023-07-12 10:57:17,713 - BEATs - INFO - BEATs Config: {'input_patch_size': 16, 'embed_dim': 512, 'conv_bias': False, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': 'gelu', 'layer_wise_gradient_decay_ratio': 0.6, 'layer_norm_first': False, 'deep_norm': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.0, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance': 800, 'gru_rel_pos': True, 'finetuned_model': True, 'predictor_dropout': 0.0, 'predictor_class': 527}


In [23]:
test

BEATsPretrain(
  (model): BEATs(
    (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
    (patch_embedding): Conv2d(1, 512, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (dropout_input): Dropout(p=0.0, inplace=False)
    (encoder): TransformerEncoder(
      (pos_conv): Sequential(
        (0): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (1): SamePad()
        (2): GELU(approximate='none')
      )
      (layers): ModuleList(
        (0): TransformerSentenceEncoderLayer(
          (self_attn): MultiheadAttention(
            (dropout_module): Dropout(p=0.0, inplace=False)
            (relative_attention_bias): Embedding(320, 12)
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, 

In [28]:
class ClassificationDataset(torch.utils.data.Dataset):
    """
    Serves (waveform, label) pairs in which every waveform has exactly ``max_len`` samples.

    Args:
        train (bool, optional): When ``data`` is not given, selects ``train_dataset_encoded``
            (True) or ``val_dataset_encoded`` (False). Defaults to True.
        data (optional): Indexable collection of records with the keys 'label' and
            'audio' -> 'array'. Overrides ``train`` when provided.
        max_len (int, optional): Number of samples every waveform is cut or zero-padded to.
            Defaults to 1100000.
        target_device (optional): Device the returned tensors are moved to. Defaults to the
            notebook-level ``device``.
    """
    def __init__(self, train=True, data=None, max_len=1100000, target_device=None):
        self.rnd = np.random.RandomState(0)  # not used inside this class; kept for existing users of the attribute
        
        if data is None:
            # honour the flag instead of always serving the training split
            data = train_dataset_encoded if train else val_dataset_encoded
        self.data = data
        self.max_len = max_len
        self.device = device if target_device is None else target_device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data = self.data[index]
        y = torch.tensor(data['label'], dtype=torch.int64).to(self.device)
        # Cut clips longer than max_len explicitly, then right-pad shorter ones with zeros
        x = torch.tensor(data['audio']['array'], dtype=torch.float32)[:self.max_len]
        x = torch.nn.functional.pad(x, (0, self.max_len - x.shape[0]), value=0)

        return (x.to(self.device), y)

In [13]:
def make_predictions2(examples: torch.Tensor, 
                     model: torch.nn.Module, 
                     device,
                     labels: Optional[torch.Tensor] = None):
    """
    Runs ``model`` on a batch of examples and returns the predicted class ids.

    Args:
        examples: Batch of model inputs (tensor or anything ``torch.as_tensor`` accepts).
        model: Network to evaluate; it is moved to ``device``.
        device: Device on which inputs, labels and the model are placed.
        labels (optional): True class ids, one per example. When given, the per-example
            cross-entropy loss and the raw model outputs are returned as well.

    Returns:
        dict: Always contains 'predicted_class_id' (list of str). With labels it also contains
        'loss' (numpy array with one row per example) and 'logits' (model output tensor).
    """
    model = model.to(device)
    examples = torch.as_tensor(examples, dtype=torch.float32).to(device)
    # `is not None` instead of truthiness: a multi-element label tensor has no truth value
    if labels is not None:
        labels = torch.as_tensor(labels, dtype=torch.int64).to(device)

    # Predict in eval mode so dropout layers are inactive, then restore the previous mode so a
    # surrounding training loop is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(examples)
    finally:
        model.train(was_training)

    predicted_class_id = [str(torch.argmax(item).item()) for item in logits]
    if labels is None:
        return {'predicted_class_id': predicted_class_id}

    # Take the class count from the model output instead of hard-coding it
    num_classes = logits.shape[-1]
    loss = torch.nn.functional.cross_entropy(logits.view(-1, num_classes), labels.view(-1), reduction="none")
    loss = loss.view(len(examples), -1).cpu().numpy()

    return {'predicted_class_id': predicted_class_id, 'loss': loss, 'logits': logits}

    
def compute_metrics(pred, labels):
    """
    Scores predicted class ids against the true labels.

    Note: this definition replaces the ``compute_metrics`` imported from ``gdsc_eval`` earlier
    in the notebook.

    Args:
        pred: Predicted class ids.
        labels: True class ids, aligned with ``pred``.

    Returns:
        dict: 'accuracy', plus macro-averaged 'f1', 'precision' and 'recall'.
    """
    macro_scores = precision_recall_fscore_support(labels, pred, average="macro")
    macro_precision, macro_recall, macro_f1 = macro_scores[0], macro_scores[1], macro_scores[2]
    return {
        "accuracy": accuracy_score(labels, pred),
        "f1": macro_f1,
        "precision": macro_precision,
        "recall": macro_recall,
    }

### Training

In [None]:
from tqdm import tqdm
lrn_rate = 1e-3
max_epochs = 10
ep_log_interval = 1

classification_net = BEATsPretrain().to(device)
classification_net.train()  # make the training mode explicit (dropout active)

optimizer_class = torch.optim.AdamW(classification_net.parameters(), lr=lrn_rate)
categorical_crossentropy = torch.nn.functional.cross_entropy

train_class_ds = ClassificationDataset()

batch_size_class = 1
train_class_ldr = torch.utils.data.DataLoader(train_class_ds, batch_size=batch_size_class, shuffle=True)

# NOTE(review): the trained network is not saved in this cell, yet a later cell loads
# 'model_beats.pkl' - confirm where that file is produced.
for epoch in range(0, max_epochs):
    ep_loss = 0  # summed batch losses for one full epoch
    # Wrap the loader itself (not enumerate(...)) so tqdm knows the number of batches and can
    # show real progress instead of a bare iteration counter.
    for X, y in tqdm(train_class_ldr, desc="epoch %d" % epoch):
        optimizer_class.zero_grad()       # reset gradients
        output = classification_net(X)
        loss_val = categorical_crossentropy(output, y)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer_class.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
        # Validation during training is disabled:
        # val_dataset_encoded2 = val_dataset_encoded.map(lambda x: make_predictions2(x['audio']['array'], classification_net, device, x['label']), batched=True, batch_size=1, remove_columns="input_values")
        # print(compute_metrics([int(i) for i in val_dataset_encoded2[:]['predicted_class_id']], val_dataset_encoded2[:]['label']))

2023-07-07 07:59:08,859 - BEATs - INFO - BEATs Config: {'input_patch_size': 16, 'embed_dim': 512, 'conv_bias': False, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': 'gelu', 'layer_wise_gradient_decay_ratio': 0.6, 'layer_norm_first': False, 'deep_norm': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.0, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance': 800, 'gru_rel_pos': True, 'finetuned_model': True, 'predictor_dropout': 0.0, 'predictor_class': 527}


0it [00:00, ?it/s]


### Testing

In [13]:
from tqdm import tqdm
class ClassificationDatasetTest(torch.utils.data.Dataset):
    """
    Serves (waveform, label) pairs from ``val_dataset_encoded`` with a fixed target length.

    The ``train`` argument is accepted but not consulted.
    """
    def __init__(self, train=True):
        self.max_len = 1100000            # target number of samples per waveform
        self.data = val_dataset_encoded
        self.rnd = np.random.RandomState(0)

    def __len__(self):
        return self.data.num_rows

    def __getitem__(self, index):
        record = self.data[index]
        waveform = torch.tensor(record['audio']['array'], dtype=torch.float32).to(device)
        target = torch.tensor(record['label'], dtype=torch.int64).to(device)

        # Right-pad with zeros up to max_len. For clips longer than max_len the amount is
        # negative, which is expected to trim the tail instead - TODO confirm.
        missing_samples = self.max_len - waveform.shape[0]
        waveform = torch.nn.functional.pad(waveform, (0, missing_samples), value=0)

        return (waveform, target)

In [14]:
import pickle
classification_net = BEATsPretrain()
# SECURITY: unpickling executes arbitrary code contained in the file - only load
# 'model_beats.pkl' when it comes from a trusted source.
with open('model_beats.pkl', 'rb') as model_file:   # context manager closes the handle
    classification_net.model = pickle.load(model_file)
# Move to the device only after the inner model has been swapped in, so the unpickled weights
# end up on `device` as well.
classification_net = classification_net.to(device)

2023-07-07 08:09:05,869 - BEATs - INFO - BEATs Config: {'input_patch_size': 16, 'embed_dim': 512, 'conv_bias': False, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': 'gelu', 'layer_wise_gradient_decay_ratio': 0.6, 'layer_norm_first': False, 'deep_norm': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.0, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'max_distance': 800, 'gru_rel_pos': True, 'finetuned_model': True, 'predictor_dropout': 0.0, 'predictor_class': 527}


In [15]:
val_class_ds = ClassificationDatasetTest()

batch_size_class = 1
# A separate name for the loader keeps `val_class_ds` pointing at the dataset; evaluation does
# not need shuffling.
val_class_ldr = torch.utils.data.DataLoader(val_class_ds, batch_size=batch_size_class, shuffle=False)

results = []   # predicted class id (int) per validation example
labels = []    # true class id (int) per validation example; note: rebinds the `labels` loaded from labels.json

classification_net.eval()      # switch off dropout for evaluation
with torch.no_grad():          # no autograd graph is kept for the stored predictions
    for X, y in tqdm(val_class_ldr):
        output = classification_net(X)
        # assumes output has shape (batch, num_classes) - TODO confirm
        results.extend(output.argmax(dim=-1).tolist())
        labels.extend(y.tolist())


1it [00:08,  8.86s/it]


In [None]:
compute_metrics([int(i) for i in results], labels) # not tested yet. Probably return error