In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os  

In [5]:
os.getcwd() 

'/root/data/experiments/exp_2023-07-07_CNN'

In [6]:
import sys                                                                             # Python system library needed to load custom functions
import numpy as np                                                                     # for performing calculations on numerical arrays
import pandas as pd                                                                    # home of the DataFrame construct, _the_ most important object for Data Science
import seaborn as sns                                                                  # additional plotting library
import matplotlib.pyplot as plt                                                        # allows creation of insightful plots
import os                                                                              # for changing the directory

import sagemaker                                                                       # dedicated sagemaker library to execute training jobs
import boto3                                                                           # for interacting with S3 buckets

from sagemaker.huggingface import HuggingFace                                           # for executing the trainig jobs
from sklearn.metrics import precision_recall_fscore_support, accuracy_score             # tools to understand how our model is performing

#sys.path.append('')                                                               # Add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from config import DEFAULT_BUCKET, DEFAULT_REGION  
from gdsc_utils import create_encrypted_bucket, download_and_extract_model, PROJECT_DIR # functions to create S3 buckets and to help with downloading models. Importing our root directory
from gdsc_eval import plot_confusion_matrix                                             # function for creating confusion matrix                                     # importing the bucket name that contains data for the challenge and the default region
os.chdir(PROJECT_DIR)                                                                   # changing our directory to root

In [19]:
import logging                                                    # module for displaying relevant information in the logs
import sys                                                        # to access to some variables used or maintained by the interpreter 
import argparse                                                   # to parse arguments from passed in the hyperparameters
import os                                                         # to manage environmental variables
import json                                                       # to open the json file with labels
from transformers import (                                        # required classes to perform the model training and implement early stopping
    ASTFeatureExtractor, 
    ASTForAudioClassification, 
    Trainer, 
    TrainingArguments, 
    EarlyStoppingCallback
)                                    
import torch                                                       # library to work with PyTorch tensors and to figure out if we have a GPU available
import torch.nn as nn
from datasets import load_dataset, Audio, Dataset                  # required tools to create, load and process our audio dataset
import pandas as pd                                                # home of the DataFrame construct, _the_ most important object for Data Science
from preprocessing import preprocess_audio_arrays                  # functions to preprocess the dataset with ASTFeatureExtractor
from gdsc_eval import compute_metrics, make_predictions            # functions to create predictions and evaluate them
from typing import Optional                                        # for type hints


In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
device

device(type='cuda', index=0)

In [10]:
from torchvision.models import resnet50, ResNet50_Weights

  warn(


In [22]:
parser = argparse.ArgumentParser()

# Hyperparameters are declared as command-line arguments; parse_known_args() below ignores
# any unrelated arguments, so when nothing is passed every option takes its default.

# preprocessing hyperparameters
# parser.add_argument("--sampling_rate", type=int, default=22050)                      # alternative sampling rate (disabled)
parser.add_argument("--sampling_rate", type=int, default=16000)                        # sampling rate to which we will cast audio files
parser.add_argument("--fe_batch_size", type=int, default=32)                           # feature extractor batch size
# alternative spectrogram statistics (disabled):
# parser.add_argument("--train_dataset_mean", type=float, default=-8.076275929131292)
# parser.add_argument("--train_dataset_std", type=float, default=3.984092920341275)
parser.add_argument("--train_dataset_mean", type=float, default=-8.141991150530815)    # mean value of spectrograms of our data
parser.add_argument("--train_dataset_std", type=float, default=4.095692486358449)      # standard deviation value of spectrograms of our resampled data

# training hyperparameters
# The model name is now a real default instead of being overwritten after parsing,
# so an explicit --model_name is honoured while the notebook still gets the AST checkpoint.
parser.add_argument("--model_name", type=str,
                    default="MIT/ast-finetuned-audioset-10-10-0.4593")                 # name of the pretrained model from HuggingFace
parser.add_argument("--learning_rate", type=float, default=2e-5)                       # learning rate
parser.add_argument("--epochs", type=int, default=4)                                   # number of training epochs
parser.add_argument("--train_batch_size", type=int, default=4)                         # training batch size
parser.add_argument("--eval_batch_size", type=int, default=64)                         # evaluation batch size
parser.add_argument("--patience", type=int, default=2)                                 # early stopping - how many epochs without improvement will stop the training
# parser.add_argument("--data_channel", type=str, default=os.environ["SM_CHANNEL_DATA"]) # directory where input data from S3 is stored (disabled)
parser.add_argument("--train_dir", type=str, default="train")                          # folder name with training data
parser.add_argument("--val_dir", type=str, default="val")                              # folder name with validation data
parser.add_argument("--test_dir", type=str, default="test")                            # folder name with test data
# parser.add_argument("--output_dir", type=str, default=os.environ['SM_MODEL_DIR'])    # output directory (disabled)


args, _ = parser.parse_known_args()                    # parsing arguments; unknown arguments are ignored

# Paths derived from the (disabled) --data_channel option:
# train_path = f"{args.data_channel}/{args.train_dir}"   # directory of our training dataset on the instance
# val_path = f"{args.data_channel}/{args.val_dir}"       # directory of our validation dataset on the instance
# test_path = f"{args.data_channel}/{args.test_dir}"     # directory of our test dataset on the instance

# Local data sample used by this notebook
train_path = 'data/data_small/train'
val_path = 'data/data_small/val'

# Alternative location (disabled):
# train_path = '../data/train'
# val_path = '../data/val'

# Example file: experiments/data/data_small/train/Achetadomesticus_XC751747-dat009-001_edit1.wav

# Set up logging which allows to print information in logs
logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.INFO,                                   # the constant directly, instead of a name lookup via getLevelName
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

In [23]:
def get_feature_extractor(model_name: str, 
                          train_dataset_mean: Optional[float] = None, 
                          train_dataset_std: Optional[float] = None) -> ASTFeatureExtractor:
    """
    Retrieves a feature extractor for audio signal processing.

    Custom normalisation statistics are used only when both are supplied. A mean of
    ``0.0`` is a valid value and counts as supplied; a standard deviation of ``0``
    cannot be used for normalisation and is treated as missing.

    Args:
        model_name (str): The name of the pre-trained model to use.
        train_dataset_mean (float, optional): The mean value of the training dataset. Defaults to None.
        train_dataset_std (float, optional): The standard deviation of the training dataset. Defaults to None.

    Returns:
        ASTFeatureExtractor: An instance of the ASTFeatureExtractor class.

    """
    mean_given = train_dataset_mean is not None
    # `not ... == 0` keeps the original behaviour of rejecting a zero std while accepting any other number
    std_given = train_dataset_std is not None and train_dataset_std != 0

    if mean_given and std_given:
        # max_length=1024 is only passed together with custom statistics, as before
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name, mean=train_dataset_mean, std=train_dataset_std, max_length=1024)
        logger.info(f" feature extractor loaded with dataset mean: {train_dataset_mean} and standard deviation: {train_dataset_std}")
    else:
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
        logger.info(" at least one of the optional arguments (mean, std) is missing")
        logger.info(f" feature extractor loaded with default dataset mean: {feature_extractor.mean} and standard deviation: {feature_extractor.std}")
        
    return feature_extractor

def preprocess_data_for_training(
    dataset_path: str,
    sampling_rate: int,
    feature_extractor: ASTFeatureExtractor,
    fe_batch_size: int,
    dataset_name: str,
    shuffle: bool = False,
    extract_file_name: bool = True) -> Dataset:
    """
    Preprocesses audio data for training.

    Loads an ``audiofolder`` dataset, optionally attaches a ``file_name`` column,
    optionally shuffles, resamples the audio and runs the feature extractor in batches.

    Args:
        dataset_path (str): The path to the dataset.
        sampling_rate (int): The desired sampling rate for the audio.
        feature_extractor (ASTFeatureExtractor): The feature extractor to use for preprocessing.
        fe_batch_size (int): The batch size for feature extraction.
        dataset_name (str): The name of the dataset; only used in log messages.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        extract_file_name (bool, optional): Whether to add a ``file_name`` column built from the
            dataset's recorded file paths. Defaults to True.

    Returns:
        dataset: The preprocessed dataset.

    """
    dataset = load_dataset("audiofolder", data_dir=dataset_path).get('train') # loading the dataset

    logger.info(f" loaded {dataset_name} dataset length is: {len(dataset)}")

    if extract_file_name:
        # File names must be attached BEFORE shuffling: the recorded paths are in load order,
        # so adding them to an already shuffled dataset would pair names with the wrong rows.
        # Assumes download_checksums lists the .wav files in the same order as the rows - TODO confirm.
        file_names = [
            path.split('/')[-1]                                   # keep only the last path component
            for path in dataset.info.download_checksums.keys()
            if path.endswith(".wav")                              # skip non-audio entries such as metadata files
        ]
        dataset = dataset.add_column("file_name", file_names)

    # perform shuffle if specified (any file_name column is shuffled together with its row)
    if shuffle:
        dataset = dataset.shuffle(seed=43)

    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
    
    logger.info(f" {dataset_name} dataset sampling rate casted to: {sampling_rate}")

    dataset_encoded = dataset.map(
        lambda x: preprocess_audio_arrays(x, 'audio', 'array', feature_extractor),
        remove_columns="audio",       # the raw audio column is dropped from the returned dataset
        batched=True,
        batch_size=fe_batch_size
    )
    
    logger.info(f" done extracting features for {dataset_name} dataset")
    
    return dataset_encoded

In [24]:
# Alternative label file for the small data sample: 'data/data_small/labels.json'
with open('../data/labels.json', 'r') as labels_file:
    labels = json.load(labels_file)

# Two lookup tables built from the same JSON mapping: label name -> id (as a string)
# and id (as a string) -> label name
label2id = {name: str(class_id) for name, class_id in labels.items()}
id2label = {str(class_id): name for name, class_id in labels.items()}

num_labels = len(label2id)  # number of distinct label names


In [25]:
feature_extractor = get_feature_extractor(args.model_name, args.train_dataset_mean, args.train_dataset_std)

# settings shared by the train and validation splits
common_preprocessing = dict(
    sampling_rate=args.sampling_rate,
    feature_extractor=feature_extractor,
    fe_batch_size=args.fe_batch_size,
)

# training split: shuffled, without the file_name column
train_dataset_encoded = preprocess_data_for_training(
    dataset_path=train_path, dataset_name="train", shuffle=True, extract_file_name=False, **common_preprocessing
)

# validation split: function defaults (no shuffle, file_name column added)
val_dataset_encoded = preprocess_data_for_training(
    dataset_path=val_path, dataset_name="validation", **common_preprocessing
)

# Download model from model hub
# model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)


Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

2023-07-07 14:27:27,658 - __main__ - INFO -  feature extractor loaded with dataset mean: -8.141991150530815 and standard deviation: 4.095692486358449


Resolving data files:   0%|          | 0/177 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-4ea904e8a4f39112/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/177 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-4ea904e8a4f39112/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-07 14:27:28,064 - __main__ - INFO -  loaded train dataset length is: 176
2023-07-07 14:27:28,068 - __main__ - INFO -  train dataset sampling rate casted to: 16000


Map:   0%|          | 0/176 [00:00<?, ? examples/s]

2023-07-07 14:27:44,611 - __main__ - INFO -  done extracting features for train dataset


Resolving data files:   0%|          | 0/67 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-99123276a0f5ab0b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/67 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-99123276a0f5ab0b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-07 14:27:44,859 - __main__ - INFO -  loaded validation dataset length is: 66
2023-07-07 14:27:44,865 - __main__ - INFO -  validation dataset sampling rate casted to: 16000


Map:   0%|          | 0/66 [00:00<?, ? examples/s]

2023-07-07 14:27:49,805 - __main__ - INFO -  done extracting features for validation dataset


In [16]:
# Load ResNet-50 from the pytorch/vision hub repository with the IMAGENET1K_V2 weights
model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")

Downloading: "https://github.com/pytorch/vision/zipball/main" to /root/.cache/torch/hub/main.zip


In [20]:
# Replace the final fully connected layer with a freshly initialised 66-output layer.
# NOTE(review): 66 is hard-coded - presumably it should equal `num_labels`; confirm.
model.fc = nn.Linear(in_features=2048, out_features=66, bias=True)

In [54]:
# NOTE(review): `X` is not assigned in any cell above this one (the only assignments visible
# are inside the training/testing loops further down), so this cell depends on leftover kernel state.
# `a` is not referenced by any other visible cell.
a = X.squeeze()

In [62]:
# resize_ modifies X in place, giving it an explicit channel axis: (4, 1, 1024, 128).
# NOTE(review): because of the in-place change, the cells below depend on this one having run.
X.resize_(4,1,1024,128).shape

torch.Size([4, 1, 1024, 128])

In [65]:
# Repeat the single channel three times so the batch has the 3 input channels passed to the ResNet below
y = X.expand(4, 3, 1024, 128)

In [67]:
y.shape

torch.Size([4, 3, 1024, 128])

In [70]:
# Smoke test: move the model to `device` and run the expanded batch through it; the displayed
# tensor is the raw output of the 66-unit head (one row per batch item).
model.to(device)(y)

tensor([[-0.0753,  0.0674,  0.0227, -0.0393,  0.0487,  0.0449, -0.0378, -0.0707,
         -0.0217, -0.0634, -0.0316,  0.0090, -0.0179, -0.0762, -0.2220,  0.0435,
         -0.0040, -0.0144, -0.1401,  0.0057,  0.1060,  0.0543, -0.0204, -0.0124,
         -0.1530, -0.0149,  0.1607, -0.0720,  0.0120, -0.0610,  0.1477, -0.1113,
         -0.0376, -0.0373, -0.1345, -0.0067, -0.0782,  0.0826, -0.0186,  0.0926,
          0.1733,  0.0157, -0.0985,  0.0557, -0.0099,  0.1344, -0.0687,  0.1669,
          0.0361, -0.0008,  0.0653, -0.0043, -0.0098,  0.0538,  0.0238,  0.1329,
         -0.0583,  0.0429, -0.1120, -0.0645,  0.0906, -0.1164,  0.1710,  0.0218,
         -0.0271, -0.0584],
        [-0.1051,  0.1952, -0.1550, -0.0662,  0.1017, -0.0060, -0.0606, -0.0533,
         -0.1076, -0.1157,  0.0262, -0.0280, -0.0487, -0.0177, -0.1300, -0.0295,
         -0.0493, -0.0041,  0.0254, -0.0524,  0.1747,  0.1224, -0.0424, -0.0749,
         -0.0706, -0.0290,  0.0958, -0.0695,  0.0086, -0.0809,  0.1500,  0.0413,


In [98]:
import pickle

class CNNPretrain(nn.Module):
    """ImageNet-pretrained ResNet-50 adapted to single-channel spectrogram input.

    The network is kept as the original ResNet module and edited in place instead of
    being rebuilt as ``nn.Sequential(*children)``: the rebuilt version loses the flatten
    step that ResNet applies between average pooling and ``fc``, and assigning a plain
    list to an attribute that already holds a submodule is rejected by ``nn.Module``.

    Args:
        num_classes (int, optional): Number of output classes of the new head. Defaults to 66.
    """

    def __init__(self, num_classes: int = 66):
        super().__init__()
        backbone = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")

        # One-channel stem whose filters are the mean of the pretrained RGB filters.
        # NOTE(review): padding=2 is kept from the original code; torchvision's own stem
        # uses padding=3 - confirm which is intended.
        rgb_weight = backbone.conv1.weight
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=2, bias=False)
        backbone.conv1.weight = nn.Parameter(torch.mean(rgb_weight, dim=1, keepdim=True).detach())

        # Replace (not append) the classification head
        backbone.fc = nn.Linear(in_features=backbone.fc.in_features, out_features=num_classes, bias=True)

        self.model = backbone

    def forward(self, x):
        """Return class logits for ``x`` of shape (batch, H, W) or (batch, 1, H, W)."""
        if x.dim() == 3:
            # add the channel axis the convolution expects
            x = x.unsqueeze(1)
        return self.model(x)

In [99]:
class ClassificationDataset(torch.utils.data.Dataset):
    """In-memory dataset of encoded inputs and labels, stored as tensors on `device`.

    Reads from the module-level `train_dataset_encoded` (train=True) or
    `val_dataset_encoded` (train=False).

    Args:
        train (bool, optional): Selects the training split when truthy, otherwise the
            validation split. Defaults to True.
    """

    def __init__(self, train=True):
        self.rnd = np.random.RandomState(0)  # kept for compatibility; not used by this class

        source = train_dataset_encoded if train else val_dataset_encoded
        # slice the encoded dataset once and reuse it for both columns
        columns = source[:]
        self.x_data = torch.tensor(columns['input_values'], dtype=torch.float32).to(device)
        self.y_data = torch.tensor(columns['label'], dtype=torch.int64).to(device)

        self.n = self.x_data.shape[0]

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        """Return the (input, label) pair at `index`."""
        return (self.x_data[index], self.y_data[index])

In [100]:
def make_predictions(examples: torch.Tensor, 
                     model: torch.nn.Module, 
                     device,
                     labels: torch.Tensor = None):
    """
    Generates predictions and, when labels are given, per-example loss values for a batch.

    The model may return either a plain logits tensor or an object exposing a ``.logits``
    attribute. Inference runs in evaluation mode and the model's previous
    training/evaluation mode is restored afterwards.

    Args:
        examples (torch.Tensor): Batch of input examples. Non-tensor inputs (e.g. nested lists)
            are converted to a float32 tensor.
        model (torch.nn.Module): A PyTorch model to use for making predictions.
        device (str or torch.device): The device to use for running the model (e.g. 'cpu' or 'cuda').
        labels (torch.Tensor, optional): Ground-truth labels of shape (batch_size,). Non-tensor
            labels are converted to an int64 tensor. Defaults to None.

    Returns:
        dict: A dictionary containing one or two keys: 'predicted_class_id' (always) and 'loss'
        (only when labels are given).
        'predicted_class_id' is a list of strings representing the predicted class IDs for each example.
        'loss' is a numpy array with one row per example (shape (batch_size, 1) for one label per
        example) holding the unreduced cross-entropy.
    """
    if not isinstance(examples, torch.Tensor):
        examples = torch.as_tensor(examples, dtype=torch.float32)
    if labels is not None and not isinstance(labels, torch.Tensor):
        labels = torch.as_tensor(labels, dtype=torch.int64)

    model = model.to(device)
    was_training = model.training
    model.eval()  # layers such as batch norm / dropout must not run in training mode while predicting
    try:
        with torch.no_grad():
            output = model(examples.to(device))
    finally:
        model.train(was_training)

    # accept both raw logits and output objects carrying `.logits`
    logits = getattr(output, "logits", output)

    predicted_class_id = [str(torch.argmax(item).item()) for item in logits]
    if labels is None:
        return {'predicted_class_id': predicted_class_id}

    # the class count is taken from the logits so the model does not need a `num_labels` attribute
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.shape[-1]), labels.to(device).view(-1), reduction="none")
    loss = loss.view(len(examples), -1).cpu().numpy()

    return {'predicted_class_id': predicted_class_id, 'loss': loss}

In [101]:
def compute_metrics(pred, labels):
    """Return accuracy and macro-averaged f1 / precision / recall of `pred` against `labels`."""
    # only the first three entries of the returned tuple are needed
    macro_precision, macro_recall, macro_f1 = precision_recall_fscore_support(labels, pred, average="macro")[:3]
    return dict(
        accuracy=accuracy_score(labels, pred),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )

In [102]:
X.shape

torch.Size([4, 1024, 128])

### Training

In [103]:
from tqdm import tqdm
lrn_rate = 1e-3
max_epochs = 10
ep_log_interval = 1

classification_net = CNNPretrain().to(device)

optimizer_class = torch.optim.AdamW(classification_net.parameters(), lr=lrn_rate)
categorical_crossentropy = torch.nn.functional.cross_entropy

train_class_ds = ClassificationDataset(train=True)
val_class_ds = ClassificationDataset(train=False)

batch_size_class = 4
train_class_ldr = torch.utils.data.DataLoader(train_class_ds, batch_size=batch_size_class, shuffle=True)
# the metrics do not depend on sample order, so the validation loader is not shuffled
val_class_ldr = torch.utils.data.DataLoader(val_class_ds, batch_size=batch_size_class, shuffle=False)


for epoch in range(0, max_epochs):
    classification_net.train()       # validation below switches to eval mode, so re-enable training mode every epoch
    ep_loss = 0  # summed batch losses for one full epoch
    # wrapping the loader itself (not enumerate) lets tqdm show the total number of batches
    for X, y in tqdm(train_class_ldr, desc="epoch %d" % epoch):
        if X.dim() == 3:
            # the dataset yields (batch, 1024, 128); add the single channel axis for the first convolution
            X = X.unsqueeze(1)

        optimizer_class.zero_grad()       # reset gradients
        output = classification_net(X)
        loss_val = categorical_crossentropy(output, y)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer_class.step()      # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))

        # Validate with the validation loader: the network returns a plain logits tensor,
        # so predictions are taken with argmax directly.
        classification_net.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for X_val, y_val in val_class_ldr:
                if X_val.dim() == 3:
                    X_val = X_val.unsqueeze(1)
                val_preds.extend(classification_net(X_val).argmax(dim=-1).tolist())
                val_labels.extend(y_val.tolist())
        print(compute_metrics(val_preds, val_labels))

Using cache found in /root/.cache/torch/hub/pytorch_vision_main


### Testing

In [None]:
from tqdm import tqdm
class ClassificationDatasetTest(torch.utils.data.Dataset):
    """Row-by-row view over the module-level `val_dataset_encoded` for testing.

    Each item is an (input, label) pair of tensors on `device`. Rows that still carry
    a raw ``audio`` column yield the waveform zero-padded to ``max_len`` samples; rows
    without it (the encoded dataset drops that column) yield their ``input_values``.

    Args:
        train (bool, optional): Unused; kept so existing calls keep working.
    """

    def __init__(self, train=True):
        self.rnd = np.random.RandomState(0)  # kept for compatibility; not used by this class

        self.data = val_dataset_encoded
        self.max_len = 1100000               # target length (in samples) for padded raw waveforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data = self.data[index]
        y = torch.tensor(data['label'], dtype=torch.int64).to(device) 
        if 'audio' in data:
            x = torch.tensor(data['audio']['array'], dtype=torch.float32).to(device)
            x = torch.nn.functional.pad(x, (0, self.max_len - x.shape[0]), value=0)
        else:
            # encoded rows have no raw audio; fall back to the extracted features
            x = torch.tensor(data['input_values'], dtype=torch.float32).to(device)

        return (x, y)

In [None]:
import pickle
# NOTE(review): `BEATsPretrain` is neither defined nor imported in any cell visible in this
# notebook (the model class defined here is `CNNPretrain`), so this line depends on
# state from elsewhere - confirm which model is meant.
classification_net = BEATsPretrain().to(device)
# Replace the wrapped model with the object stored in 'model_beats.pkl'.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files. The file
# object created by open() is not closed explicitly.
classification_net.model = pickle.load(open('model_beats.pkl', 'rb'))

In [None]:
val_class_ds = ClassificationDatasetTest()

batch_size_class = 1
# the loader gets its own name so `val_class_ds` keeps referring to the dataset;
# sample order does not affect the metrics, so no shuffling
test_class_ldr = torch.utils.data.DataLoader(val_class_ds, batch_size=batch_size_class, shuffle=False)

results = []   # predicted class id (int) per example
labels = []    # ground-truth class id (int) per example
classification_net.eval()
with torch.no_grad():
    for X, y in tqdm(test_class_ldr):
        # assumes the network returns a (batch, num_classes) logits tensor - TODO confirm for the loaded model
        output = classification_net(X)
        results.extend(output.argmax(dim=-1).tolist())
        labels.extend(y.tolist())


In [None]:
# Not tested yet; may raise an error.
# NOTE(review): int(i) requires every entry of `results` to be a scalar (a Python number or a
# one-element tensor) - confirm what the loop above stores before relying on this cell.
compute_metrics([int(i) for i in results], labels)