In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os  

In [4]:
os.getcwd() 

'/root/data/experiments/exp_2023-07-12_AST_just_head'

In [5]:
import sys                                                                             # Python system library needed to load custom functions
import numpy as np                                                                     # for performing calculations on numerical arrays
import pandas as pd                                                                    # home of the DataFrame construct, _the_ most important object for Data Science
import seaborn as sns                                                                  # additional plotting library
import matplotlib.pyplot as plt                                                        # allows creation of insightful plots
import os                                                                              # for changing the directory

import sagemaker                                                                       # dedicated sagemaker library to execute training jobs
import boto3                                                                           # for interacting with S3 buckets

from sagemaker.huggingface import HuggingFace                                           # for executing the trainig jobs
from sklearn.metrics import precision_recall_fscore_support, accuracy_score             # tools to understand how our model is performing

#sys.path.append('')                                                               # Add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from config import DEFAULT_BUCKET, DEFAULT_REGION  
from gdsc_utils import create_encrypted_bucket, download_and_extract_model, PROJECT_DIR # functions to create S3 buckets and to help with downloading models. Importing our root directory
from gdsc_eval import plot_confusion_matrix                                             # function for creating confusion matrix                                     # importing the bucket name that contains data for the challenge and the default region
os.chdir(PROJECT_DIR)                                                                   # changing our directory to root

In [6]:
import logging                                                    # module for displaying relevant information in the logs
import sys                                                        # to access to some variables used or maintained by the interpreter 
import argparse                                                   # to parse arguments from passed in the hyperparameters
import os                                                         # to manage environmental variables
import json                                                       # to open the json file with labels
from transformers import (                                        # required classes to perform the model training and implement early stopping
    ASTFeatureExtractor, 
    ASTForAudioClassification, 
    Trainer, 
    TrainingArguments, 
    EarlyStoppingCallback
)                                    
import torch                                                       # library to work with PyTorch tensors and to figure out if we have a GPU available
from datasets import load_dataset, Audio, Dataset                  # required tools to create, load and process our audio dataset
import pandas as pd                                                # home of the DataFrame construct, _the_ most important object for Data Science
from preprocessing import preprocess_audio_arrays                  # functions to preprocess the dataset with ASTFeatureExtractor
from gdsc_eval import compute_metrics, make_predictions            # functions to create predictions and evaluate them
from typing import Optional                                        # for type hints


In [7]:
def get_feature_extractor(model_name: str, 
                          train_dataset_mean: Optional[float] = None, 
                          train_dataset_std: Optional[float] = None) -> ASTFeatureExtractor:
    """
    Retrieves a feature extractor for audio signal processing.

    Args:
        model_name (str): The name of the pre-trained model to use.
        train_dataset_mean (float, optional): The mean value of the training dataset. Defaults to None.
        train_dataset_std (float, optional): The standard deviation of the training dataset. Defaults to None.

    Returns:
        ASTFeatureExtractor: An instance of the ASTFeatureExtractor class.

    """
    if all((train_dataset_mean, train_dataset_std)):
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name, mean=train_dataset_mean, std=train_dataset_std, max_length=1024)
        logger.info(f" feature extractor loaded with dataset mean: {train_dataset_mean} and standard deviation: {train_dataset_std}")
    else:
        feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
        logger.info(" at least one of the optional arguments (mean, std) is missing")
        logger.info(f" feature extractor loaded with default dataset mean: {feature_extractor.mean} and standard deviation: {feature_extractor.std}")
        
    return feature_extractor

def preprocess_data_for_training(
    dataset_path: str,
    sampling_rate: int,
    feature_extractor: ASTFeatureExtractor,
    fe_batch_size: int,
    dataset_name: str,
    shuffle: bool = False,
    extract_file_name: bool = True) -> Dataset:
    """
    Preprocesses audio data for training.

    Args:
        dataset_path (str): The path to the dataset.
        sampling_rate (int): The desired sampling rate for the audio.
        feature_extractor (ASTFeatureExtractor): The feature extractor to use for preprocessing.
        fe_batch_size (int): The batch size for feature extraction.
        dataset_name (str, optional): The name of the dataset. Defaults to None.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        extract_file_name (bool, optional): Whether to extract paths from audio features. Defaults to True.

    Returns:
        dataset: The preprocessed dataset.

    """
    dataset = load_dataset("audiofolder", data_dir=dataset_path).get('train') # loading the dataset
    
    # perform shuffle if specified
    if shuffle:
        dataset = dataset.shuffle(seed=42)
        
    logger.info(f" loaded {dataset_name} dataset length is: {len(dataset)}")

    if extract_file_name:
        remove_metadata = lambda x: x.endswith(".wav")
        extract_file_name = lambda x: x.split('/')[-1]

        dataset_paths = list(dataset.info.download_checksums.keys())
        dataset_paths = list(filter(remove_metadata, dataset_paths))
        dataset_paths = list(map(extract_file_name, dataset_paths))
        dataset = dataset.add_column("file_name", dataset_paths)

    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
    
    logger.info(f" {dataset_name} dataset sampling rate casted to: {sampling_rate}")

    dataset_encoded = dataset.map(
        lambda x: preprocess_audio_arrays(x, 'audio', 'array', feature_extractor),
        remove_columns="audio",
        batched=True,
        batch_size=fe_batch_size
    )
    
    logger.info(f" done extracting features for {dataset_name} dataset")
    
    return dataset_encoded

In [8]:
parser = argparse.ArgumentParser()

# hyperparameters sent from our jupyter notebook are passed as command-line arguments to the script
# preprocessing hyperparameters
parser.add_argument("--sampling_rate", type=int, default=22050)                        # sampling rate to which we will cast audio files
parser.add_argument("--fe_batch_size", type=int, default=32)                           # feature extractor batch size
# parser.add_argument("--train_dataset_mean", type=float, default=-8.076275929131292)                  # mean value of spectrograms of our data
# parser.add_argument("--train_dataset_std", type=float, default=3.984092920341275)    
# standard deviation value of spectrograms of our resampled data
parser.add_argument("--train_dataset_mean", type=float, default=-8.141991150530815)                  # mean value of spectrograms of our data
parser.add_argument("--train_dataset_std", type=float, default=4.095692486358449)                   # standard deviation value of spectrograms of our resampled data

# training hyperparameters
parser.add_argument("--model_name", type=str)                                          # name of the pretrained model from HuggingFace
parser.add_argument("--learning_rate", type=float, default=2e-5)                       # learning rate
parser.add_argument("--epochs", type=int, default=4)                                   # number of training epochs 
parser.add_argument("--train_batch_size", type=int, default=4)                        # training batch size
parser.add_argument("--eval_batch_size", type=int, default=64)                         # evaluation batch size
parser.add_argument("--patience", type=int, default=2)                                 # early stopping - how many epoch without improvement will stop the training 
# parser.add_argument("--data_channel", type=str, default=os.environ["SM_CHANNEL_DATA"]) # directory where input data from S3 is stored
parser.add_argument("--train_dir", type=str, default="train")                          # folder name with training data
parser.add_argument("--val_dir", type=str, default="val")                              # folder name with validation data
parser.add_argument("--test_dir", type=str, default="test")                            # folder name with test data
# parser.add_argument("--output_dir", type=str, default=os.environ['SM_MODEL_DIR'])      # output directory. This directory will be saved in the S3 bucket


args, _ = parser.parse_known_args()                    # parsing arguments from the notebook


# train_path = f"{args.data_channel}/{args.train_dir}"   # directory of our training dataset on the instance
# val_path = f"{args.data_channel}/{args.val_dir}"       # directory of our validation dataset on the instance
# test_path = f"{args.data_channel}/{args.test_dir}"     # directory of our test dataset on the instance

train_path = 'data/data_small/train'
val_path = 'data/data_small/val'

# train_path = '../data/train'
# val_path = '../data/val'

# experiments/data/data_small/train/Achetadomesticus_XC751747-dat009-001_edit1.wav
# Set up logging which allows to print information in logs
logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.getLevelName("INFO"),
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

In [9]:
# with open(f'data/data_small/labels.json', 'r') as f:
with open(f'../data/labels.json', 'r') as f:
        labels = json.load(f)
    
# Create mapping from label to id and id to label
label2id, id2label = dict(), dict()
for k, v in labels.items():
    label2id[k] = str(v)
    id2label[str(v)] = k

num_labels = len(label2id)  # define number of labels


In [10]:
    output_dir='models/AST',                # directory for saving model checkpoints and logs


In [11]:
args.model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

In [12]:
feature_extractor = get_feature_extractor(args.model_name, args.train_dataset_mean, args.train_dataset_std)

# creating train and validation datasets
train_dataset_encoded = preprocess_data_for_training(dataset_path=train_path, sampling_rate=args.sampling_rate, feature_extractor=feature_extractor,
                                                     fe_batch_size=args.fe_batch_size, dataset_name="train", shuffle=True, extract_file_name=False)

val_dataset_encoded = preprocess_data_for_training(dataset_path=val_path, sampling_rate=args.sampling_rate, feature_extractor=feature_extractor,
                                                   fe_batch_size=args.fe_batch_size, dataset_name="validation")

# Download model from model hub
model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)


2023-07-12 09:28:08,358 - __main__ - INFO -  feature extractor loaded with dataset mean: -8.141991150530815 and standard deviation: 4.095692486358449


Resolving data files:   0%|          | 0/177 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-12 09:28:09,002 - __main__ - INFO -  loaded train dataset length is: 176
2023-07-12 09:28:09,020 - __main__ - INFO -  train dataset sampling rate casted to: 22050
2023-07-12 09:28:09,033 - __main__ - INFO -  done extracting features for train dataset


Resolving data files:   0%|          | 0/67 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-12 09:28:09,371 - __main__ - INFO -  loaded validation dataset length is: 66
2023-07-12 09:28:09,381 - __main__ - INFO -  validation dataset sampling rate casted to: 22050
2023-07-12 09:28:09,391 - __main__ - INFO -  done extracting features for validation dataset


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# from transformers import ASTModel

In [13]:
# model2 = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [14]:
# import matplotlib.pyplot as plt
# import numpy as np
# import seaborn as sns
# import torch
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# from typing import Dict, Union, List, Any

# def make_predictions2(examples: torch.Tensor, 
#                      model: torch.nn.Module, 
#                      device: Union[str, torch.device],
#                      labels: torch.Tensor = None) -> Dict[str, Union[List[str], np.ndarray]]:

#     model = model.to(device)
#     with torch.no_grad():
#         logits = model(examples.to(device))
#     return logits


In [None]:
for param in model.bert.parameters():

    param.requires_grad = False

In [29]:
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

In [31]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [32]:
for param in model.audio_spectrogram_transformer.parameters():
    print(param.requires_grad)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [26]:
model3.audio_spectrogram_transformer == model2.audio_spectrogram_transformer

False

In [22]:
model2.audio_spectrogram_transformer.re

<bound method Module.requires_grad_ of ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0): ASTLayer(
        (attention): ASTAttention(
          (attention): ASTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (interm

In [21]:
model == model2

False

In [15]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    
# val_dataset_encoded.set_format(type='torch', columns=['input_values'])
# val_dataset_encoded3 = val_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], model.audio_spectrogram_transformer, device, x['label']), batched = True, batch_size=4)

In [16]:
from collections import Counter
class Siamese_Dataset(torch.utils.data.Dataset):
    def __init__(self, train=True):
        self.rnd = np.random.RandomState(0)
        
        if train:
            self.x_data = torch.tensor(train_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(train_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 
        else:
            self.x_data = torch.tensor(val_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(val_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 

        self.n = len(self.x_data)
        self.singles = [k for k, v in Counter(self.y_data.tolist()).items() if v==1]

    def __len__(self):
        return self.n
    
    #TODO: add check for the same elements
#     def __getitem__(self, idx1):
#         flag = self.rnd.randint(0,1)  # 0 = same class, or 1
#         y = self.y_data[idx1]
#         idx2 = self.rnd.randint(0,self.n-1)  # a bit tricky

#         if flag == 0:  # get two images with same label
#             while self.y_data[idx2] != y:
#                 idx2 += 1
#                 if idx2 == self.n: idx2 = 0
#         elif idx1 % 2 != 0:  # get images different labels
#             while self.y_data[idx2] == y:
#                 idx2 += 1
#                 if idx2 == self.n: idx2 = 0

#         pixels1 = self.x_data[idx1]
#         label1 = self.y_data[idx1] 
#         pixels2 = self.x_data[idx2]
#         label2 = self.y_data[idx2] 
#         flag = torch.tensor(flag, dtype=torch.float32).to(device)
#         return (pixels1, label1, pixels2, label2, flag)

    def __getitem__(self, idx1):
        y = self.y_data[idx1]
        idx2 = self.rnd.randint(0,self.n-1)
        if idx1 == idx2:
            idx2 += 1
        y2 = self.y_data[idx2]
        idx3 = self.rnd.randint(0,self.n-1)

        if y != y2:  # get two images with same label
            while self.y_data[idx3] != y:
                idx3 += 1
                if idx3 == idx1 and not self.y_data[idx3] in self.singles:
                    idx3 += 1
                if idx3 == self.n: idx3 = 0
            idx2, idx3 = idx3, idx2
        else:  # get images different labels
            while self.y_data[idx3] == y:
                idx3 += 1
                if idx3 == self.n: idx3 = 0

        pixels1 = self.x_data[idx1]
        label1 = self.y_data[idx1] 
        pixels2 = self.x_data[idx2]
        label2 = self.y_data[idx2] 
        pixels3 = self.x_data[idx3]
        label3 = self.y_data[idx3]
        return (pixels1, label1, pixels2, label2, pixels3, label3)
    
    
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, train=True):
        self.rnd = np.random.RandomState(0)
        
        if train:
            self.x_data = torch.tensor(train_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(train_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 
        else:
            self.x_data = torch.tensor(val_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(val_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 

        self.n = len(self.x_data)

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        x = self.x_data[index]
        y = self.y_data[index]

        return (x, y)

In [21]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [22]:
train_ds = Siamese_Dataset()

In [23]:
(pixels1, label1, pixels2, label2, pixels3, label3) = train_ds.__getitem__(0)

In [26]:
train_ds

torch.Size([1024, 128])

In [11]:
model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [24]:
import torch.nn as nn

In [33]:
model.audio_spectrogram_transformer.encoder.layer[0].attention.attention.dropout

Dropout(p=0.0, inplace=False)

In [34]:
for i in range(12):
    model.audio_spectrogram_transformer.encoder.layer[i].output.dropout = nn.Dropout(p=0.1, inplace=False)
    model.audio_spectrogram_transformer.encoder.layer[i].attention.output.dropout = nn.Dropout(p=0.1, inplace=False)
    model.audio_spectrogram_transformer.encoder.layer[i].attention.attention.dropout = nn.Dropout(p=0.1, inplace=False)
    

In [35]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0): ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Li

In [14]:
import numpy as np
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# -----------------------------------------------------------

class SiameseNet(torch.nn.Module):
    def __init__(self):
        super(SiameseNet, self).__init__()  # pre 3.3 syntax

        self.model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)
        self.model = self.model.audio_spectrogram_transformer
        #self.model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

    def forward(self, x1, x2, x3):
        # oupt1 = self.model(x1)[:]['last_hidden_state']
        # oupt2 = self.model(x2)[:]['last_hidden_state']
        output = self.model(torch.cat((x1, x2, x3), 0)).pooler_output
        # oupt1 = self.model(x1).last_hidden_state
        # oupt2 = self.model(x2).last_hidden_state
        # oupt3 = self.model(x3).last_hidden_state
        return torch.tensor_split(output, 3)
    
    
class ClassificationNet(torch.nn.Module):
    def __init__(self, siamese_net):
        super(ClassificationNet, self).__init__()  # pre 3.3 syntax
        self.model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)
        self.model = self.model.classifier
        self.siamese_net = siamese_net
        #self.model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

    def forward(self, x):
        x = self.siamese_net(x).pooler_output
        output = self.model(x)
        return output


# -----------------------------------------------------------

# class ContrastiveLoss(torch.nn.Module):
#     def __init__(self, m=2.0):
#         super(ContrastiveLoss, self).__init__()  # pre 3.3 syntax
#         self.m = m  # margin or radius

#     def forward(self, y1, y2, flag):
#         # flag = 0 means y1 and y2 are supposed to be same
#         # flag = 1 means y1 and y2 are supposed to be different
        
#         # TODO: change to triplet loss

#         euc_dist = torch.nn.functional.pairwise_distance(y1, y2)
        
#         try:
#             loss = torch.mean((1-flag) * torch.pow(euc_dist, 2) + (flag) * torch.pow(torch.clamp(self.m - euc_dist, min=0.0), 2))
#         except:
#             loss = [torch.mean((1-flag) * torch.pow(x, 2) + (flag) * torch.pow(torch.clamp(2.0 - x, min=0.0), 2)) for x in euc_dist]

#         return loss
    
# -----------------------------------------------------------

# def siamese_dissim(siamese_model, image1, image2):
#   # images are shape [1, chnls, 28, 28]
#   # assumes model is in eval() mode
#   image1 = image1.reshape(1,1,28,28)  # if necessary
#   image2 = image2.reshape(1,1,28,28)
#   with T.no_grad():
#     oupt1, oupt2 = siamese_model(image1, image2)
#   dissim = T.nn.functional.pairwise_distance(oupt1, oupt2)
#   return np.round(dissim.item(), 6)

# -----------------------------------------------------------

# def main():
#   # 0. setup
#     print("\nBegin MNIST Siamese network demo ")
#     np.random.seed(1)
#     torch.manual_seed(1)

#     # 1. create Dataset
#     # print("\nLoading 1000-item train Dataset from text file ")
#     # train_file = ".\\Data\\mnist_train_1000.txt" 
#     train_ds = Siamese_Dataset()

#     batch_size = 1
#     train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)

#     # 2. create network
#     print("\nCreating Siamese network (2 conv, 3 linear) ")
#     net = SiameseNet().to(device)

#     # 3. train model
#     max_epochs = 10
#     ep_log_interval = 1
#     lrn_rate = 2e-5
#     loss_func = ContrastiveLoss()
#     optimizer = torch.optim.SGD(net.parameters(), lr=lrn_rate)

#     print("\nbat_size = %3d " % batch_size)
#     print("loss = " + "ContrastiveLoss()" )
#     print("optimizer = SGD")
#     print("max_epochs = %3d " % max_epochs)
#     print("lrn_rate = %0.3f " % lrn_rate)

#     print("\nStarting training")
#     net.train()  # set mode
#     for epoch in range(0, max_epochs):
#         ep_loss = 0  # for one full epoch
#         for (batch_idx, batch) in enumerate(train_ldr):
#             X1, y1, X2, y2, flag = batch
#             oupt1, oupt2 = net(X1, X2)

#             optimizer.zero_grad()       # reset gradients
#             loss_val = loss_func(oupt1, oupt2, flag)

#             ep_loss += loss_val.item()  # accumulate loss
#             loss_val.backward()         # compute grads
#             optimizer.step()            # update weights
#             if epoch % ep_log_interval == 0:
#                 print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
#             print("Done ") 

#     # 4. TODO: save trained model

#     # -----------------------------------------------------------

#     # 5. use model
#     print("\nUsing trained Siaamese model ")
#     pixels1 = train_ds.x_data[0]  # a '1' 
#     pixels2 = train_ds.x_data[3]  # a different '1'
#     pixels3 = train_ds.x_data[4]  # a '6'

#     net.eval()
#     dissim_12 = siamese_dissim(net, pixels1, pixels2)
#     dissim_13 = siamese_dissim(net, pixels1, pixels3)

#     print("\nEnd MNIST Siamese demo ")


In [19]:
def compute_metrics(pred, labels):
    """
    Computes the accuracy, F1 score, precision, and recall for a set of predictions.

    Args:
        pred (Any): A set of predictions, as returned by a Hugging Face Trainer.

    Returns:
        Dict[str, float]: A dictionary containing four keys: 'accuracy', 'f1', 'precision', and 'recall',
        each with a float value representing the corresponding metric.
    """
    # labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average="macro")
    acc = accuracy_score(labels, pred)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:
from tqdm import tqdm
print("\nBegin MNIST Siamese network demo ")
np.random.seed(1)
torch.manual_seed(1)

import gc
torch.cuda.empty_cache()
# del variables
gc.collect()

# 1. create Dataset
# print("\nLoading 1000-item train Dataset from text file ")
# train_file = ".\\Data\\mnist_train_1000.txt" 
train_ds = Siamese_Dataset()

batch_size = 2
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
#TODO: check what margin would be best
triplet_loss = torch.nn.TripletMarginLoss(margin=10.0, p=2)

# 2. create network
print("\nCreating Siamese network ")
net = SiameseNet().to(device)

# 3. train model
max_epochs = 10
ep_log_interval = 1
lrn_rate = 2e-5
# loss_func = ContrastiveLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lrn_rate)

print("\nbatch_size = %3d " % batch_size)
# print("loss = " + "ContrastiveLoss()" )
# print("optimizer = SGD")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.6f " % lrn_rate)

print("\nStarting training")
net.train()  # set mode
for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_ldr)):
        X1, y1, X2, y2, X3, y3 = batch
        oupt1, oupt2, oupt3 = net(X1, X2, X3)

        optimizer.zero_grad()       # reset gradients
        loss_val = triplet_loss(oupt1, oupt2, oupt3)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
    print("Done ") 

# 4. TODO: save trained model

# -----------------------------------------------------------

# 5. use model
# print("\nUsing trained Siaamese model ")
# pixels1 = train_ds.x_data[0]  # a '1' 
# pixels2 = train_ds.x_data[3]  # a different '1'
# pixels3 = train_ds.x_data[4]  # a '6'

# net.eval()
# dissim_12 = siamese_dissim(net, pixels1, pixels2)
# dissim_13 = siamese_dissim(net, pixels1, pixels3)

print("\nEnd MNIST Siamese demo ")


In [None]:
net = SiameseNet().to(device)
import pickle

# pickle.dump(net.model, open('model_siamese.pkl', 'wb'))
# net = torch.load('model_siamese.pkl', map_location=torch.device('cpu'))
net.model = pickle.load(open('model_siamese.pkl', 'rb'))

In [18]:
train_class_ds = ClassificationDataset()

batch_size_class = 4
train_class_ldr = torch.utils.data.DataLoader(train_class_ds, batch_size=batch_size_class, shuffle=True)

In [19]:
from tqdm import tqdm
lrn_rate = 2e-5
max_epochs = 10
ep_log_interval = 1

classification_net = ClassificationNet(net.model).to(device)

optimizer_class = torch.optim.Adam(classification_net.parameters(), lr=lrn_rate)
categorical_crossentropy = torch.nn.functional.cross_entropy

for (batch_idx, batch) in tqdm(enumerate(train_class_ldr)):
        X, y = batch


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
44it [00:00, 10219.81it/s]


In [20]:
from tqdm import tqdm
lrn_rate = 2e-5
max_epochs = 10
ep_log_interval = 1

classification_net = ClassificationNet(net.model).to(device)

optimizer_class = torch.optim.Adam(classification_net.parameters(), lr=lrn_rate)
categorical_crossentropy = torch.nn.functional.cross_entropy


for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_class_ldr)):
        X, y = batch
        output = classification_net(X)

        optimizer_class.zero_grad()       # reset gradients
        loss_val = categorical_crossentropy(output, y)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer_class.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
    print("Done ") 

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
44it [00:47,  1.08s/it]


epoch =    0  |  loss =   163.8971
Done 


44it [00:48,  1.10s/it]


epoch =    1  |  loss =    74.2935
Done 


44it [00:50,  1.16s/it]


epoch =    2  |  loss =    25.5594
Done 


44it [00:53,  1.21s/it]


epoch =    3  |  loss =     8.2624
Done 


44it [00:53,  1.22s/it]


epoch =    4  |  loss =     3.2833
Done 


44it [00:53,  1.22s/it]


epoch =    5  |  loss =     3.5265
Done 


44it [00:53,  1.22s/it]


epoch =    6  |  loss =     1.4283
Done 


44it [00:53,  1.22s/it]


epoch =    7  |  loss =     0.9449
Done 


44it [00:53,  1.22s/it]


epoch =    8  |  loss =     0.7021
Done 


44it [00:53,  1.22s/it]

epoch =    9  |  loss =     0.5776
Done 





In [28]:
test = classification_net(X)

In [30]:
type(test)

torch.Tensor

In [138]:
def make_predictions2(examples: torch.Tensor, 
                     model: torch.nn.Module, 
                     device,
                     labels: torch.Tensor = None):
    model = model.to(device)
    examples = torch.tensor(examples, dtype=torch.float32).to(device)
    labels = torch.tensor(labels, dtype=torch.int64).to(device) 

    with torch.no_grad():
        logits = model(examples)
    predicted_class_id = [str(torch.argmax(item).item()) for item in logits]
    if isinstance(labels, torch.Tensor):
        loss = torch.nn.functional.cross_entropy(logits.view(-1, 66), labels.to(device).view(-1), reduction="none")
        loss = loss.view(len(examples), -1).cpu().numpy()

        return {'predicted_class_id': predicted_class_id, 'loss': loss, 'logits': logits}
    else:
        return {'predicted_class_id': predicted_class_id}


In [None]:
val_dataset_encoded['input_values'][0]

In [22]:
val_dataset_encoded2 = val_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], classification_net, device, x['label']), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

In [26]:
compute_metrics([int(i) for i in val_dataset_encoded2[:]['predicted_class_id']], val_dataset_encoded2[:]['label'])

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.5909090909090909,
 'f1': 0.4952380952380952,
 'precision': 0.4532828282828283,
 'recall': 0.5909090909090909}

In [13]:
from collections import Counter
class Siamese_Dataset2(torch.utils.data.Dataset):
    def __init__(self, model, train=True):
        self.rnd = np.random.RandomState(0)
        
        if train:
            self.x_data = torch.tensor(train_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(train_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 
        else:
            self.x_data = torch.tensor(val_dataset_encoded[:]['input_values'], dtype=torch.float32).to(device)
            self.y_data = torch.tensor(val_dataset_encoded[:]['label'], dtype=torch.int64).to(device) 
            
        self.mapping = {}
        for i, category in enumerate(self.y_data):
            category = category.item()
            self.mapping[category] = self.mapping.get(category, []) + [i]

        self.n = len(self.x_data)
        self.size_of_embeddings_for_triplet_mining = 5
        self.model = model # TODo: is it going to be updated at each iteration???
        self.margin = 10

    def __len__(self):
        return self.n
    
    def __getitem__(self, anchor_idx):
        y_anchor = int(self.y_data[anchor_idx])
        
        indices_to_consider = self.mapping[y_anchor].copy()
        if len(indices_to_consider) > 1:
            indices_to_consider.remove(anchor_idx)
            positive_idx = np.random.choice(indices_to_consider)
        else:
            positive_idx = indices_to_consider[0]
        #[enchor_embedding, positive_embedding]
        temp_predictions = self.model(self.x_data[[anchor_idx, positive_idx]]).pooler_output
        
        indices_for_negative_mining = np.random.choice(self.n, size=self.size_of_embeddings_for_triplet_mining, replace=False)
        indices_for_negative_mining = [i for i in indices_for_negative_mining if i not in indices_to_consider]
        negative_predictions = []
        batch_for_negative_mining = 2
        batched_negative_indices = [indices_for_negative_mining[i:i + batch_for_negative_mining] for i in range(0, len(indices_for_negative_mining), batch_for_negative_mining)]
        for idxs in batched_negative_indices:
            negative_predictions.append(self.model(self.x_data[list(idxs)]).pooler_output)
        negative_predictions = torch.cat(negative_predictions, 0)
        distance_to_positive = torch.cdist(temp_predictions[0].view(1, -1), temp_predictions[1].view(1, -1))
        distances_to_negative = torch.cdist(temp_predictions[0].view(1, -1), negative_predictions)

        #semi hard triplet mining
        negative_indices = torch.where(((-self.margin < distance_to_positive - distances_to_negative) & (distance_to_positive - distances_to_negative < 0))[0])[0]
        # if no semi hard available take the hardest (max distance)
        # index_in_list_of_indices = np.random.choice(negative_indices.cpu()) if len(negative_indices) else int((distances_to_negative == torch.max(distances_to_negative)).nonzero(as_tuple=True)[0])
        index_in_list_of_indices = np.random.choice(negative_indices.cpu()) if len(negative_indices) else np.random.choice(range(len(indices_for_negative_mining)))
        negative_idx = indices_for_negative_mining[index_in_list_of_indices]

        anchor = self.x_data[anchor_idx]
        positive = self.x_data[positive_idx]
        negative = self.x_data[negative_idx]
        # print(anchor_idx, positive_idx, negative_idx)
        # print(self.y_data[anchor_idx], self.y_data[positive_idx], self.y_data[negative_idx])

        return (anchor, positive, negative)

In [None]:
from tqdm import tqdm
print("\nBegin MNIST Siamese network demo ")
np.random.seed(1)
torch.manual_seed(1)

import gc
torch.cuda.empty_cache()
# del variables
gc.collect()

# 1. create Dataset
# print("\nLoading 1000-item train Dataset from text file ")
# train_file = ".\\Data\\mnist_train_1000.txt" 


batch_size = 2

net = SiameseNet().to(device)
train_ds = Siamese_Dataset(net.model)

train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
#TODO: check what margin would be best
triplet_loss = torch.nn.TripletMarginLoss(margin=10.0, p=2)

# 2. create network
print("\nCreating Siamese network ")


# 3. train model
max_epochs = 5
ep_log_interval = 1
lrn_rate = 2e-5
# loss_func = ContrastiveLoss()
# optimizer = torch.optim.Adam(net.parameters(), lr=lrn_rate, )
optimizer = torch.optim.AdamW(net.parameters(), lr=lrn_rate, weight_decay=1e-2)

print("\nbatch_size = %3d " % batch_size)
# print("loss = " + "ContrastiveLoss()" )
# print("optimizer = SGD")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.6f " % lrn_rate)

print("\nStarting training")
net.train()  # set mode
for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_ldr)):
        # X1, X2, X3 = batch
        X1, y1, X2, y2, X3, y3 = batch
        oupt1, oupt2, oupt3 = net(X1, X2, X3)

        optimizer.zero_grad()       # reset gradients
        loss_val = triplet_loss(oupt1, oupt2, oupt3)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
    print("Done ") 


print("\nEnd MNIST Siamese demo ")



Begin MNIST Siamese network demo 


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Creating Siamese network 

batch_size =   2 
max_epochs =   5 
lrn_rate = 0.000020 

Starting training


876it [22:03,  1.51s/it]


epoch =    0  |  loss =  1373.2850
Done 


876it [22:34,  1.55s/it]


epoch =    1  |  loss =  1014.0319
Done 


876it [22:09,  1.52s/it]


epoch =    2  |  loss =   607.3057
Done 


876it [22:25,  1.54s/it]


epoch =    3  |  loss =   502.8375
Done 


876it [21:39,  1.48s/it]

epoch =    4  |  loss =   559.3827
Done 

End MNIST Siamese demo 





In [16]:
import pickle

pickle.dump(net.model, open('model_siamese_full4.pkl', 'wb'))

In [17]:
5

5

In [None]:
lrn_rate = 2e-5
max_epochs = 5
ep_log_interval = 1
batch_size_class = 4

train_class_ds = ClassificationDataset()
train_class_ldr = torch.utils.data.DataLoader(train_class_ds, batch_size=batch_size_class, shuffle=True)

classification_net = ClassificationNet(net.model).to(device) #pass net.model - siamese model
# optimizer_class = torch.optim.Adam(classification_net.parameters(), lr=lrn_rate)
optimizer_class = torch.optim.AdamW(net.parameters(), lr=lrn_rate, weight_decay=1e-2)

categorical_crossentropy = torch.nn.functional.cross_entropy

for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_class_ldr)):
        X, y = batch
        output = classification_net(X)

        optimizer_class.zero_grad()       # reset gradients
        loss_val = categorical_crossentropy(output, y)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer_class.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
    print("Done ") 

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
438it [08:13,  1.13s/it]


epoch =    1  |  loss =   121.1541
Done 


438it [08:15,  1.13s/it]


epoch =    2  |  loss =    36.0794
Done 


438it [08:14,  1.13s/it]


epoch =    3  |  loss =    33.0890
Done 


438it [07:55,  1.09s/it]

epoch =    4  |  loss =    19.2798
Done 





In [None]:
def make_predictions2(examples: torch.Tensor, 
                     model: torch.nn.Module, 
                     device,
                     labels: torch.Tensor = None):
    model = model.to(device)
    examples = torch.tensor(examples, dtype=torch.float32).to(device)
    if labels:
        labels = torch.tensor(labels, dtype=torch.int64).to(device) 

    with torch.no_grad():
        logits = model(examples)
    predicted_class_id = [str(torch.argmax(item).item()) for item in logits]
    if isinstance(labels, torch.Tensor):
        loss = torch.nn.functional.cross_entropy(logits.view(-1, 66), labels.to(device).view(-1), reduction="none")
        loss = loss.view(len(examples), -1).cpu().numpy()

        return {'predicted_class_id': predicted_class_id, 'loss': loss, 'logits': logits}
    else:
        return {'predicted_class_id': predicted_class_id}

    
def compute_metrics(pred, labels):
    # labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average="macro")
    acc = accuracy_score(labels, pred)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [20]:
4

4

In [21]:
val_dataset_encoded2 = val_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], classification_net, device, x['label']), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/579 [00:00<?, ? examples/s]

In [22]:
train_dataset_encoded2 = train_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], classification_net, device, x['label']), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/1752 [00:00<?, ? examples/s]

In [23]:
compute_metrics([int(i) for i in train_dataset_encoded2[:]['predicted_class_id']], train_dataset_encoded2[:]['label'])

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.9908675799086758,
 'f1': 0.9741083101533664,
 'precision': 0.977485380116959,
 'recall': 0.974931129476584}

In [24]:
compute_metrics([int(i) for i in val_dataset_encoded2[:]['predicted_class_id']], val_dataset_encoded2[:]['label'])

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.8687392055267703,
 'f1': 0.8171011880909657,
 'precision': 0.8546414835077073,
 'recall': 0.8165884498945812}

In [30]:
10

10

In [25]:
test_dataset_encoded = preprocess_data_for_training(dataset_path='../data/test', sampling_rate=args.sampling_rate, feature_extractor=feature_extractor,
                                                   fe_batch_size=args.fe_batch_size, dataset_name="test")


Resolving data files:   0%|          | 0/557 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-576c17cc42543850/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/557 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-576c17cc42543850/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

2023-07-04 15:06:57,118 - __main__ - INFO -  loaded test dataset length is: 556
2023-07-04 15:06:57,130 - __main__ - INFO -  test dataset sampling rate casted to: 22050


Map:   0%|          | 0/556 [00:00<?, ? examples/s]

2023-07-04 15:07:23,949 - __main__ - INFO -  done extracting features for test dataset


In [26]:
# val_dataset_encoded2 = val_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], classification_net, device, x['label']), batched=True, batch_size=4, remove_columns="input_values")
test_dataset_encoded = test_dataset_encoded.map(lambda x: make_predictions2(x['input_values'], classification_net, device), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/556 [00:00<?, ? examples/s]

In [27]:
test_dataset_encoded_df = test_dataset_encoded.to_pandas()
test_dataset_encoded_df.head()

Unnamed: 0,file_name,predicted_class_id
0,0.wav,14
1,1.wav,60
2,10.wav,26
3,100.wav,56
4,101.wav,57


In [28]:
test_dataset_encoded_df.to_csv("exp_siamese/predictions_siamese4.csv", index=False)

In [39]:
train_ds2 = Siamese_Dataset2(net.model)
train_ldr2 = torch.utils.data.DataLoader(train_ds2, batch_size=batch_size, shuffle=True)
for (batch_idx, batch) in tqdm(enumerate(train_ldr2)):
    print(batch_idx)

0it [00:00, ?it/s]

102 56 80
tensor(41) tensor(41) tensor(36)


0it [00:22, ?it/s]


In [43]:
distance_to_positive = torch.cdist(temp_predictions[0].view(1, -1), temp_predictions[1].view(1, -1))
distances_to_negative = torch.cdist(temp_predictions[0].view(1, -1), negative_predictions)


In [55]:
margin = 20

tensor([[-24.0878, -17.6224, -17.0130, -20.1809, -25.4389, -28.3435, -17.4109,
         -19.9022, -28.0310, -19.5770]], grad_fn=<SubBackward0>)

In [68]:
np.random.choice(negative_indices)

2

In [78]:
negative_indices = torch.where(((-margin < distance_to_positive - distances_to_negative) & (distance_to_positive - distances_to_negative < 0))[0])[0]
index_in_list_of_indices = np.random.choice(negative_indices) if len(negative_indices) else int((distances_to_negative == torch.min(distances_to_negative)).nonzero(as_tuple=True)[0])
negative_idx = indices_for_negative_mining[index_in_list_of_indices]
negative_idx

14

In [30]:
torch.tensor([temp_predictions[0]], dtype=torch.float32)

In [250]:
torch.cat(negative_predictions, 0)

tensor([[ 0.7825,  0.5864, -0.7387,  ..., -0.3220, -2.3220,  1.5102],
        [ 1.1009,  0.5280, -0.4702,  ...,  0.0370, -2.0756,  1.5000],
        [ 0.7825,  0.5864, -0.7387,  ..., -0.3220, -2.3220,  1.5102],
        [ 1.1009,  0.5280, -0.4702,  ...,  0.0370, -2.0756,  1.5000]],
       grad_fn=<CatBackward0>)

[[158, 12, 139, 14], [71, 17, 157, 13], [79, 73]]

In [166]:
distance_to_positive = torch.cdist(torch.tensor([val_dataset_encoded2['pooler_output'][0]], dtype=torch.float32), torch.tensor([val_dataset_encoded2['pooler_output'][1]], dtype=torch.float32))

In [207]:

distances_to_negative
int((distances_to_negative == torch.min(distances_to_negative)).nonzero(as_tuple=True)[0])

0

In [196]:
margin = 1
int(torch.where(((-margin < distance_to_positive - distances_to_negative) & (distance_to_positive - distances_to_negative < 0))[0])[0])

1

In [134]:
val_dataset_encoded.select([1, 2, 4])['file_name']

['Aleetacurvicosta_GBIF3044550084_IN69533545_164034.wav',
 'Atrapsaltacollina_GBIF1831196023_IN9883996_17874_edit1.wav',
 'Atrapsaltaencaustica_GBIF3385010567_IN94514864_304493_edit1.wav']

In [148]:
val_dataset_encoded2 = val_dataset_encoded.select([1, 2, 4]).map(lambda x: make_predictions3(x['input_values'], net.model, device, x['label']), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [158]:
val_dataset_encoded2

Dataset({
    features: ['label', 'file_name', 'last_hidden_state', 'pooler_output'],
    num_rows: 3
})

In [151]:
val_dataset_encoded2 = val_dataset_encoded.select([1, 2, 4]).map(lambda x: net.model(torch.tensor(x['input_values'], dtype=torch.float32).to(device)), batched=True, batch_size=4, remove_columns="input_values")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [155]:
len(val_dataset_encoded2['pooler_output'][0])

768

In [97]:
indices_to_consider = mapping[55]


In [102]:
mapping[55]

[135, 147, 175]

In [100]:
indices_to_consider.remove(160)

In [80]:
np.random.choice(train_ds.n, size=10, replace=False)

array([145,   3, 151, 160, 161, 149,  83,  85, 137, 132])

In [46]:
mapping.get(category, []) + [45]

[45]

In [63]:
np.random.choice(mapping[55])

135

In [31]:
train_ds.y_data[0].item()

61

In [24]:
train_ds.y_data

tensor([61, 21,  5, 12, 54, 10, 22, 30, 54, 44, 25, 13, 10, 54,  9, 57, 19, 43,
        64, 44, 64,  6, 38, 29, 59, 56, 61, 61,  2, 36, 27, 33,  5, 14, 27, 21,
        34, 26, 34, 51, 61, 17, 11,  3, 20, 40,  1, 36, 32, 11, 65, 47, 33,  8,
        37, 39, 41,  0, 48, 64, 10, 43, 34, 64, 47, 39, 58, 64,  8, 62, 33,  4,
        39, 18, 61, 65, 61, 45, 50, 42, 36, 60, 46, 33, 17, 40,  9, 44, 31, 39,
        29, 64, 37, 29, 60,  9, 54, 37, 44, 35, 24, 26, 41, 39, 29, 64, 53, 10,
        59, 61, 38, 64, 29,  7, 49, 11, 25, 38, 22, 44,  0, 17, 46,  9, 15, 62,
        21, 30, 64, 16, 28, 59, 52, 54, 40, 55,  8, 17, 60, 24, 23, 56, 10, 60,
         9, 24, 42, 55, 11,  9, 17, 64, 12, 11, 23, 12, 31, 19,  5, 39, 55, 63,
        17, 42, 10,  9, 63, 26, 17, 43, 56,  9, 56, 13, 38, 55])

In [None]:
def plot_embeddings(embeddings, targets, xlim=None, ylim=None):
    plt.figure(figsize=(10,10))
    for i in range(10):
        inds = np.where(targets==i)[0]
        plt.scatter(embeddings[inds,0], embeddings[inds,1], alpha=0.5, color=colors[i])
    if xlim:
        plt.xlim(xlim[0], xlim[1])
    if ylim:
        plt.ylim(ylim[0], ylim[1])
    plt.legend(mnist_classes)

def extract_embeddings(dataloader, model):
    with torch.no_grad():
        model.eval()
        embeddings = np.zeros((len(dataloader.dataset), 2))
        labels = np.zeros(len(dataloader.dataset))
        k = 0
        for images, target in dataloader:
            if cuda:
                images = images.cuda()
            embeddings[k:k+len(images)] = model.get_embedding(images).data.cpu().numpy()
            labels[k:k+len(images)] = target.numpy()
            k += len(images)
    return embeddings, labels

In [27]:
def fit(train_loader, val_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, log_interval, metrics=[],
        start_epoch=0):
    """
    Loaders, model, loss function and metrics should work together for a given task,
    i.e. The model should be able to process data output of loaders,
    loss function should process target output of loaders and outputs from the model
    Examples: Classification: batch loader, classification model, NLL loss, accuracy metric
    Siamese network: Siamese loader, siamese model, contrastive loss
    Online triplet learning: batch loader, embedding model, online triplet loss
    """
    for epoch in range(0, start_epoch):
        scheduler.step()

    for epoch in range(start_epoch, n_epochs):
        scheduler.step()

        # Train stage
        train_loss, metrics = train_epoch(train_loader, model, loss_fn, optimizer, cuda, log_interval, metrics)

        message = 'Epoch: {}/{}. Train set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, train_loss)
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        val_loss, metrics = test_epoch(val_loader, model, loss_fn, cuda, metrics)
        val_loss /= len(val_loader)

        message += '\nEpoch: {}/{}. Validation set: Average loss: {:.4f}'.format(epoch + 1, n_epochs,
                                                                                 val_loss)
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        print(message)


def train_epoch(train_loader, model, loss_fn, optimizer, cuda, log_interval, metrics):
    for metric in metrics:
        metric.reset()

    model.train()
    losses = []
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        target = target if len(target) > 0 else None
        if not type(data) in (tuple, list):
            data = (data,)
        if cuda:
            data = tuple(d.cuda() for d in data)
            if target is not None:
                target = target.cuda()


        optimizer.zero_grad()
        outputs = model(*data)

        if type(outputs) not in (tuple, list):
            outputs = (outputs,)

        loss_inputs = outputs
        if target is not None:
            target = (target,)
            loss_inputs += target

        loss_outputs = loss_fn(*loss_inputs)
        loss = loss_outputs[0] if type(loss_outputs) in (tuple, list) else loss_outputs
        losses.append(loss.item())
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        for metric in metrics:
            metric(outputs, target, loss_outputs)

        if batch_idx % log_interval == 0:
            message = 'Train: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                batch_idx * len(data[0]), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), np.mean(losses))
            for metric in metrics:
                message += '\t{}: {}'.format(metric.name(), metric.value())

            print(message)
            losses = []

    total_loss /= (batch_idx + 1)
    return total_loss, metrics


def test_epoch(val_loader, model, loss_fn, cuda, metrics):
    with torch.no_grad():
        for metric in metrics:
            metric.reset()
        model.eval()
        val_loss = 0
        for batch_idx, (data, target) in enumerate(val_loader):
            target = target if len(target) > 0 else None
            if not type(data) in (tuple, list):
                data = (data,)
            if cuda:
                data = tuple(d.cuda() for d in data)
                if target is not None:
                    target = target.cuda()

            outputs = model(*data)

            if type(outputs) not in (tuple, list):
                outputs = (outputs,)
            loss_inputs = outputs
            if target is not None:
                target = (target,)
                loss_inputs += target

            loss_outputs = loss_fn(*loss_inputs)
            loss = loss_outputs[0] if type(loss_outputs) in (tuple, list) else loss_outputs
            val_loss += loss.item()

            for metric in metrics:
                metric(outputs, target, loss_outputs)

    return val_loss, 

In [25]:
from torch.optim import lr_scheduler

batch_size = 2
lrn_rate = 2e-5
max_epochs = 10
cuda = True

kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_ds = Siamese_Dataset()
val_ds = Siamese_Dataset(train=False)
triplet_train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, **kwargs)
triplet_test_loader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=False, **kwargs)

# Set up the network and training parameters
# from networks import EmbeddingNet, TripletNet
# from losses import TripletLoss

# embedding_net = EmbeddingNet()
# model = TripletNet(embedding_net)
net = SiameseNet().to(device)
# if cuda:
#     model.cuda()
# loss_fn = TripletLoss(margin)
loss_fn = torch.nn.TripletMarginLoss(margin=10.0)
# lr = 1e-3
# optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(net.parameters(), lr=lrn_rate)

scheduler = lr_scheduler.StepLR(optimizer, 4, gamma=0.1, last_epoch=-1)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
cuda = True
log_interval = 1
fit(triplet_train_loader, triplet_test_loader, net, loss_fn, optimizer, scheduler, max_epochs, cuda, log_interval)


In [None]:
from tqdm import tqdm
print("\nBegin MNIST Siamese network demo ")
np.random.seed(1)
torch.manual_seed(1)

import gc
torch.cuda.empty_cache()
# del variables
gc.collect()

# 1. create Dataset
# print("\nLoading 1000-item train Dataset from text file ")
# train_file = ".\\Data\\mnist_train_1000.txt" 
train_ds = Siamese_Dataset()

batch_size = 2
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
#TODO: check what margin would be best
triplet_loss = torch.nn.TripletMarginLoss(margin=10.0, p=2)

# 2. create network
print("\nCreating Siamese network ")
net = SiameseNet().to(device)

# 3. train model
max_epochs = 10
ep_log_interval = 1
lrn_rate = 2e-5
# loss_func = ContrastiveLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lrn_rate)

print("\nbatch_size = %3d " % batch_size)
# print("loss = " + "ContrastiveLoss()" )
# print("optimizer = SGD")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.6f " % lrn_rate)

print("\nStarting training")
net.train()  # set mode
for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_ldr)):
        X1, y1, X2, y2, X3, y3 = batch
        oupt1, oupt2, oupt3 = net(X1, X2, X3)

        optimizer.zero_grad()       # reset gradients
        loss_val = triplet_loss(oupt1, oupt2, oupt3)

        ep_loss += loss_val.item()  # accumulate loss
        loss_val.backward()         # compute grads
        optimizer.step()            # update weights
    if epoch % ep_log_interval == 0:
        print("epoch = %4d  |  loss = %10.4f" % (epoch, ep_loss))
    print("Done ") 

# 4. TODO: save trained model

# -----------------------------------------------------------

# 5. use model
# print("\nUsing trained Siaamese model ")
# pixels1 = train_ds.x_data[0]  # a '1' 
# pixels2 = train_ds.x_data[3]  # a different '1'
# pixels3 = train_ds.x_data[4]  # a '6'

# net.eval()
# dissim_12 = siamese_dissim(net, pixels1, pixels2)
# dissim_13 = siamese_dissim(net, pixels1, pixels3)

print("\nEnd MNIST Siamese demo ")


In [55]:
net.model(X).pooler_output

In [25]:
val_dataset_encoded2[:]['predicted_class_id']

In [24]:
output = classification_net(X)


In [25]:
from tqdm import tqdm
print("\nBegin MNIST Siamese network demo ")
np.random.seed(1)
torch.manual_seed(1)

import gc
torch.cuda.empty_cache()
# del variables
gc.collect()

# 1. create Dataset
# print("\nLoading 1000-item train Dataset from text file ")
# train_file = ".\\Data\\mnist_train_1000.txt" 
train_ds = Siamese_Dataset()

batch_size = 2
train_ldr = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
#TODO: check what margin would be best
triplet_loss = torch.nn.TripletMarginLoss(margin=10.0, p=2)

# 2. create network
print("\nCreating Siamese network ")
net = SiameseNet().to(device)

# 3. train model
max_epochs = 10
ep_log_interval = 1
lrn_rate = 2e-5
# loss_func = ContrastiveLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lrn_rate)

print("\nbatch_size = %3d " % batch_size)
# print("loss = " + "ContrastiveLoss()" )
# print("optimizer = SGD")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.6f " % lrn_rate)

print("\nStarting training")
net.train()  # set mode
for epoch in range(0, max_epochs):
    ep_loss = 0  # for one full epoch
    for (batch_idx, batch) in tqdm(enumerate(train_ldr)):
        X1, y1, X2, y2, X3, y3 = batch


Begin MNIST Siamese network demo 

Creating Siamese network 


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



batch_size =   2 
max_epochs =  10 
lrn_rate = 0.000020 

Starting training


88it [00:00, 329.18it/s]
88it [00:00, 348.62it/s]
88it [00:00, 314.22it/s]
88it [00:00, 322.30it/s]
88it [00:00, 327.68it/s]
88it [00:00, 350.32it/s]
88it [00:00, 367.97it/s]
88it [00:00, 336.02it/s]
88it [00:00, 351.24it/s]
88it [00:00, 331.46it/s]


In [27]:
test = model(X1)

In [29]:
test.logits.shape

torch.Size([2, 66])

In [26]:
net.model(X1).pooler_output.shape

torch.Size([2, 768])

In [36]:
net.model(X1).last_hidden_state.shape

torch.Size([2, 1214, 768])

In [33]:
model.classifier(net.model(X1).pooler_output).shape

torch.Size([2, 66])

In [34]:
model.classifier(net.model(X1).last_hidden_state).shape

torch.Size([2, 1214, 66])

In [37]:
test.logits

tensor([[-0.6777,  0.7706,  0.6015, -0.4064,  0.7318, -0.7616,  0.1365, -0.3242,
          0.0804, -1.6639, -0.7140,  0.1405, -1.1172,  0.5060,  0.4503,  1.0351,
         -1.3584, -0.4642,  1.4119, -0.2866,  0.2653,  0.1873,  0.1038,  0.6556,
         -0.6115,  0.9315,  0.1400, -0.3580, -0.4849, -0.4869, -0.1716,  0.8513,
          0.7254,  0.2358,  0.5989,  0.5368,  0.8863, -0.1549, -0.0678,  0.5784,
          0.2994, -0.5280, -0.7998,  0.0025,  0.3474,  1.0589, -0.2939,  0.8725,
          0.0236,  0.4269,  0.2931,  0.4674, -0.0266, -0.4732,  0.1252, -1.5213,
          0.1104,  0.8602, -0.8185,  0.4762,  0.1954,  0.4118,  0.0576, -0.3409,
         -0.7113,  0.1168],
        [ 0.5461, -0.3503, -0.3823, -0.6223,  0.3965, -1.1243, -0.6866, -0.6873,
         -0.4432, -0.1056,  0.0224, -0.1174,  1.2591, -0.1088,  0.1457,  0.6368,
         -0.1531,  0.1038,  0.3419, -0.1198,  0.6944,  0.0080,  1.1043,  0.0122,
          0.3334, -0.2455,  0.1225,  0.9528,  0.1736, -1.7596,  0.3202,  0.2750,


In [39]:
model.classifier(net.model(X1).last_hidden_state)

tensor([[[-0.7940,  0.9164,  0.6317,  ..., -0.4091, -0.5656,  0.3754],
         [-0.5145,  0.5657,  0.5364,  ..., -0.2594, -0.8149, -0.1528],
         [-0.2973,  0.2718,  0.5956,  ..., -1.2559,  0.3854,  0.3624],
         ...,
         [ 0.1172, -1.3147,  0.1869,  ..., -0.6163, -0.3961, -0.3540],
         [ 0.3750, -0.8571,  0.0535,  ..., -0.6026, -0.5179, -0.2638],
         [-0.1941, -0.4391,  0.5003,  ..., -0.6610, -0.1963, -0.4841]],

        [[ 0.6599, -0.2133, -0.4067,  ..., -0.3970,  0.3605,  1.0515],
         [ 0.3944, -0.4711, -0.3264,  ..., -0.3474, -0.2570,  0.8255],
         [-0.3495,  0.0635, -0.6890,  ..., -0.5967,  0.5328,  0.4974],
         ...,
         [-0.3794, -0.3161,  0.0347,  ..., -0.6805,  0.0286, -0.3282],
         [-0.0819, -0.0814, -0.0831,  ..., -0.6742,  0.4789,  0.4459],
         [ 0.2560, -0.0630,  0.0057,  ..., -0.5864,  0.7134,  0.5144]]],
       grad_fn=<ViewBackward0>)

In [23]:
net.model(X1).last_hidden_state

torch.Size([2, 1214, 768])

In [37]:
torch.cuda.empty_cache()
# del variables
gc.collect()

2099

In [24]:
model = ASTForAudioClassification.from_pretrained(args.model_name, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([66, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([66]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
model = model.classifier

In [26]:
model.to(device)

ASTMLPHead(
  (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dense): Linear(in_features=768, out_features=66, bias=True)
)

In [31]:
net.model(X1).last_hidden_state.shape

In [29]:
model(net.model(X1).last_hidden_state).shape

torch.Size([2, 1214, 66])

In [None]:
self.model(x1).last_hidden_state

In [27]:
test = torch.cat((X1, X2, X3), 0)
test2 = torch.tensor_split(test, 3)

In [30]:
test.shape

torch.Size([6, 1024, 128])

In [68]:
oupt1, oupt2 = net(X1, X2)

In [73]:
euc_dist[0]

tensor([2.7713e-05, 2.7713e-05, 2.7713e-05,  ..., 2.7713e-05, 2.7713e-05,
        2.7713e-05], grad_fn=<SelectBackward0>)

In [71]:
[torch.mean((1-flag) * torch.pow(x, 2) + (flag) * torch.pow(torch.clamp(2.0 - x, min=0.0), 2)) for x in euc_dist]

In [69]:
euc_dist = torch.nn.functional.pairwise_distance(oupt1, oupt2)

try:
    loss = torch.mean((1-flag) * torch.pow(euc_dist, 2) + (flag) * torch.pow(torch.clamp(self.m - euc_dist, min=0.0), 2))
except:
    loss = [torch.mean((1-flag) * torch.pow(x, 2) + (flag) * torch.pow(torch.clamp(2.0 - x, min=0.0), 2)) for x in euc_dist]


In [65]:
[torch.mean((1-flag) * torch.pow(x, 2) + (flag) * torch.pow(torch.clamp(2.0 - x, min=0.0), 2)) for x in euc_dist]

[tensor(7.6800e-10, grad_fn=<MeanBackward0>),
 tensor(822.7380, grad_fn=<MeanBackward0>)]

In [59]:
euc_dist.map(lambda x: torch.mean((1-flag) * torch.pow(x, 2) + (flag) * torch.pow(torch.clamp(2.0 - x, min=0.0), 2)))

In [52]:
# torch.pow(test, 2)
flag = 0

In [57]:
(1-flag) * torch.pow(test, 2) + (flag) * torch.pow(torch.clamp(2.0 - test, min=0.0), 2)

tensor([ 816.2149,  688.5238, 1167.8853,  ..., 2091.9470, 1763.9680,
         587.6725], grad_fn=<AddBackward0>)

In [53]:
torch.mean((1-flag) * torch.pow(test, 2) + (flag) * torch.pow(torch.clamp(2.0 - test, min=0.0), 2))

tensor(822.7380, grad_fn=<MeanBackward1>)

In [20]:
loss_val = loss_func(oupt1, oupt2, flag)

In [33]:
len(val_dataset_encoded[0]['input_values'][0])

128

In [21]:
test = net.model(X1)

In [25]:
test.last_hidden_state.shape

torch.Size([2, 1214, 768])