In [2]:
import logging                                                    # module for displaying relevant information in the logs
import sys                                                        # to access some variables used or maintained by the interpreter
import argparse                                                   # to parse the hyperparameters passed in as arguments
import os                                                         # to manage environmental variables
import json                                                       # to open the json file with labels
from transformers import (                                        # required classes to perform the model training and implement early stopping
    ASTFeatureExtractor, 
    ASTForAudioClassification, 
    Trainer, 
    TrainingArguments, 
    EarlyStoppingCallback
)                                    
import torch                                                       # library to work with PyTorch tensors and to figure out if we have a GPU available
from datasets import load_dataset, Audio, Dataset                  # required tools to create, load and process our audio dataset
import pandas as pd                                                # home of the DataFrame construct, _the_ most important object for Data Science
from preprocessing import preprocess_audio_arrays                  # functions to preprocess the dataset with ASTFeatureExtractor
from gdsc_eval import compute_metrics, make_predictions, make_chunked_predictions           # functions to create predictions and evaluate them
from typing import Optional                                        # for type hints
                                # Add the source directory to the PYTHONPATH. This allows to import local functions and modules.
from config import DEFAULT_BUCKET, DEFAULT_REGION  

In [3]:
# CONFIG
#
# Settings of the best model found so far:
#   7-second chunks, sr=44100, lr=2e-05, num_mel_bins=178

# Which trained model (and which of its checkpoints) is used for inference.
model_name = "sm-training-custom-2023-07-16-14-12-31-154"
checkpoint = "checkpoint-13200"

# Name of the data sub-folder whose audio files are processed.
FOLDER_TO_PROCESS = "test"

# Audio is resampled to this rate (Hz) when the dataset is loaded.
SAMPLING_RATE = 44_100

# Chunking, both in seconds: length of each chunk, and the minimum length
# a trailing chunk must have to be kept.
SPLIT_IN_SECS, CHUNK_MIN_SIZE = 7, 1

In [4]:
# Directory of the selected checkpoint; it holds both the model weights and
# the feature-extractor configuration saved alongside them.
model_path = f"/root/data/experiments/models/{model_name}/{checkpoint}"

# The two loads are independent of each other; both read from the same
# checkpoint directory.
model = ASTForAudioClassification.from_pretrained(model_path)
feature_extractor = ASTFeatureExtractor.from_pretrained(model_path)

In [5]:
# Load every audio file found in the selected folder as one dataset.
test_path = f'/root/data/data/{FOLDER_TO_PROCESS}'

# Ask for the "train" split explicitly: this returns the Dataset itself and
# raises a clear error if the split is missing, whereas `.get('train')` on
# the DatasetDict would silently hand back None.
test_dataset = load_dataset("audiofolder", data_dir=test_path, split="train")

# Decode (and resample, if needed) every file at the configured sampling rate.
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
print(test_dataset)
print(test_dataset[0])

Resolving data files:   0%|          | 0/557 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-2d285a18baf538c5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/557 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-2d285a18baf538c5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['audio'],
    num_rows: 556
})
{'audio': {'path': '/root/data/data/test/0.wav', 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.06658936,
        0.02105713, -0.00656128]), 'sampling_rate': 44100}}


In [6]:
def remove_metadata(path):
    """Return True for ``.wav`` paths (i.e. filter out non-audio files).

    Not needed by this cell any more; kept so that other cells calling it
    keep working.
    """
    return path.endswith(".wav")


def extract_file_name(path):
    """Return the last component of ``path`` (the bare file name)."""
    return os.path.basename(path)


# Read each row's file name from the path stored with that row's audio.
# The previous approach used `test_dataset.info.download_checksums`, a record
# of downloaded files whose order is not tied to the dataset's row order, so
# names could silently end up attached to the wrong recording.
# `decode=False` exposes the stored {"path", "bytes"} record without decoding
# the waveform, so this stays cheap.
audio_records = test_dataset.cast_column("audio", Audio(decode=False))["audio"]
test_paths = [extract_file_name(record["path"]) for record in audio_records]
print(test_paths[:3])

# Make the cell safe to re-run: `add_column` fails if the column already exists.
if "file_name" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns("file_name")

test_dataset = test_dataset.add_column("file_name", test_paths)
print(test_dataset)
print(test_dataset[0])

['0.wav', '1.wav', '10.wav']
Dataset({
    features: ['audio', 'file_name'],
    num_rows: 556
})
{'audio': {'path': '/root/data/data/test/0.wav', 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.06658936,
        0.02105713, -0.00656128]), 'sampling_rate': 44100}, 'file_name': '0.wav'}


In [7]:
import copy  # no longer used by this cell; kept in case other cells rely on it
from datasets import Dataset


def split_into_chunks(samples, chunk_size, min_size):
    """Cut a 1-D sample sequence into consecutive pieces of ``chunk_size``.

    Args:
        samples: sliceable sequence of audio samples (supports ``len``).
        chunk_size: number of samples per piece; must be a positive int.
        min_size: minimum number of samples a trailing piece must have
            to be kept.

    Returns:
        List of slices of ``samples``. The first piece is always returned,
        even if it is shorter than ``min_size``; a later piece shorter than
        ``min_size`` (only the last one can be) is dropped.
    """
    pieces = []
    # `len(samples) + 1` makes the range include the start offset that falls
    # exactly on the end of the data; the resulting empty piece is then
    # discarded by the min-size check below.
    for start in range(0, len(samples) + 1, chunk_size):
        piece = samples[start:start + chunk_size]
        if start > 0 and len(piece) < min_size:
            break
        pieces.append(piece)
    return pieces


# ---- 1. Split every recording into fixed-length chunks --------------------
chunks = []
for x in test_dataset:
    samples = x['audio']['array']
    sampling_rate = x['audio']['sampling_rate']
    file_size = len(samples)

    pieces = split_into_chunks(
        samples,
        chunk_size=SPLIT_IN_SECS * sampling_rate,
        min_size=CHUNK_MIN_SIZE * sampling_rate,
    )
    for piece in pieces:
        # Each chunk is a copy of the row with only the audio array replaced.
        # `.copy()` detaches the chunk from the full recording so the
        # full-length array can be freed once this file has been processed.
        chunks.append({**x, 'audio': {**x['audio'], 'array': piece.copy()}})

    print(f'{x["file_name"]} - file_size:{file_size} - splitted into {len(pieces)} chunks')

total = len(chunks)
print(f'finished chunking, total number of chunks: {total}')
chunked_one_file_dataset = Dataset.from_pandas(pd.DataFrame(chunks))
print('chunked_one_file_dataset:', chunked_one_file_dataset)


# ---- 2. Turn the raw audio chunks into model inputs ------------------------
print('\npreprocessing by preprocess_audio_arrays ...')
chunked_one_file_dataset_encoded = chunked_one_file_dataset.map(
    lambda batch: preprocess_audio_arrays(batch, 'audio', 'array', feature_extractor),
    remove_columns="audio",
    batched=True,
    batch_size=16,
)
chunked_one_file_dataset_encoded.set_format(type='torch', columns=['input_values'])

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# ---- 3. Predict a class for every chunk ------------------------------------
print('\ncalculating predictions by make_chunked_predictions ...')
chunked_one_file_dataset_encoded = chunked_one_file_dataset_encoded.map(
    lambda batch: make_chunked_predictions(batch['input_values'], model, device),
    batched=True,
    batch_size=16,
    remove_columns="input_values",
)

chunked_one_file_dataset_encoded_df = chunked_one_file_dataset_encoded.to_pandas()
print('\npredictions:', chunked_one_file_dataset_encoded_df, '\n\n')


# ---- 4. Save the per-chunk predictions -------------------------------------
# The same table is written twice: next to the checkpoint, and into the shared
# experiment folder (where the file name also carries the model/checkpoint).
output_paths = [
    f"/root/data/experiments/models/{model_name}/{checkpoint}/predictions_{FOLDER_TO_PROCESS}_{SPLIT_IN_SECS}_secs_{SAMPLING_RATE}.csv",
    f"/root/data/experiments/exp_2023-07-13_chunk_fixed_predictions/predictions_{model_name}_{checkpoint}_{FOLDER_TO_PROCESS}_{SPLIT_IN_SECS}_secs_{SAMPLING_RATE}.csv",
]
for output_path in output_paths:
    # Create the target folder if needed so the run does not fail at the very
    # end, after all predictions have already been computed.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    chunked_one_file_dataset_encoded_df.to_csv(output_path, index=False)

print('RESULTS SAVED in:')
for output_path in output_paths:
    print(output_path)
    

0.wav - file_size:406784 - splitted into 2 chunks
1.wav - file_size:147375 - splitted into 1 chunks
10.wav - file_size:366303 - splitted into 2 chunks
100.wav - file_size:490633 - splitted into 2 chunks
101.wav - file_size:3174534 - splitted into 11 chunks
102.wav - file_size:264600 - splitted into 1 chunks
103.wav - file_size:176400 - splitted into 1 chunks
104.wav - file_size:846667 - splitted into 3 chunks
105.wav - file_size:264600 - splitted into 1 chunks
106.wav - file_size:529200 - splitted into 2 chunks
107.wav - file_size:220500 - splitted into 1 chunks
108.wav - file_size:1058400 - splitted into 4 chunks
109.wav - file_size:308700 - splitted into 1 chunks
11.wav - file_size:485100 - splitted into 2 chunks
110.wav - file_size:264600 - splitted into 1 chunks
111.wav - file_size:88200 - splitted into 1 chunks
112.wav - file_size:2295459 - splitted into 8 chunks
113.wav - file_size:7943344 - splitted into 26 chunks
114.wav - file_size:441000 - splitted into 2 chunks
115.wav - fil

Map:   0%|          | 0/2313 [00:00<?, ? examples/s]


calculating predictions by make_chunked_predictions ...


Map:   0%|          | 0/2313 [00:00<?, ? examples/s]


predictions:      file_name predicted_class_id   
0        0.wav                 14  \
1        0.wav                 14   
2        1.wav                 60   
3       10.wav                 26   
4       10.wav                 26   
...        ...                ...   
2308    99.wav                 39   
2309    99.wav                 39   
2310    99.wav                 39   
2311    99.wav                 39   
2312    99.wav                 39   

                                                 logits  
0     [-0.06360662, -0.548346, 0.39514756, -0.640027...  
1     [-0.06094554, -0.5902772, 0.40035322, -0.04751...  
2     [-1.1129682, -1.5018668, 0.13556546, -0.916900...  
3     [-0.52893174, 1.666247, -0.8008976, 0.18271926...  
4     [0.1385512, -1.4241198, -1.9842126, -0.7965207...  
...                                                 ...  
2308  [-0.30983245, -0.98355216, 0.23288876, 0.00535...  
2309  [-0.47169477, -0.8123874, 0.19997203, 0.090179...  
2310  [-0.43138832,