In [2]:
# Enable the autoreload extension in mode 2 so that edits to the local helper
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys # Python system library needed to load custom functions
import math # module with access to mathematical functions
import os # for changing the directory

import time
import numpy as np  # for performing calculations on numerical arrays
import pandas as pd  # home of the DataFrame construct, _the_ most important object for Data Science

from IPython.display import Audio as AudioI # for listening to our insects
from scipy.fft import fft # function to calculate Fast Fourier Transform

import matplotlib.pyplot as plt  # allows creation of insightful plots
import seaborn as sns # another library to make even more beautiful plots

sys.path.append('../src_old') # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
# enable rendering plots under the code cell that created it
%matplotlib inline

from eda_utils import show_sampling, signal_generator, plot_random_spec, plot_spec, plot_waveform # functions to create plots for and from audio data
from gdsc_utils import download_directory, PROJECT_DIR # function to download GDSC data from S3 bucket and our root directory
from config import DEFAULT_BUCKET  # S3 bucket with the GDSC data

from datasets import load_dataset, Audio 

import librosa
import soundfile as sf

import torchaudio

os.chdir(PROJECT_DIR) # changing our directory to root

In [4]:
#from augmentation_utils import generate_augmented_data

In [5]:
import sagemaker                                                                       # dedicated sagemaker library to execute training jobs
import boto3 

from config import DEFAULT_BUCKET, DEFAULT_REGION  
from gdsc_utils import upload_to_s3, PROJECT_DIR                                        # functions to create S3 buckets and to help with downloading models. Importing our root directory
os.chdir(PROJECT_DIR)

In [6]:
import shutil
import tqdm

In [7]:
train_path = '../data/train'

In [8]:
# Load the training audio with the `audiofolder` builder; the loaded examples are
# read back from the 'train' key of the returned object. Shuffling is left disabled.
train_dataset = load_dataset("audiofolder", data_dir=train_path).get('train') #.shuffle(seed = 42)

Resolving data files:   0%|          | 0/1753 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-0bb5909dae519ccd/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/1753 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-0bb5909dae519ccd/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
train_dataset

Dataset({
    features: ['audio', 'label'],
    num_rows: 1752
})

In [10]:
train_dataset[0]

{'audio': {'path': '/root/data/data/train/Achetadomesticus_XC489192-Achetadomesticus_poland_psz_20140510_22.00h_3498_edit1.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00015259,
          0.00094604, -0.00158691]),
  'sampling_rate': 44100},
 'label': 0}

In [11]:
# Metadata table for the audio files. generate_augmented_data (defined further down)
# reads its `subset`, `path`, `label` and `file_name` columns.
# NOTE(review): absolute path, unlike the relative `train_path` above — confirm both point at the same data root.
metadata = pd.read_csv('/root/data/data/metadata.csv')

In [13]:
# Load the validation audio the same way as the training audio. Because the directory
# is loaded on its own, its examples are also read from the 'train' key.
val_path = '../data/val'
val_dataset = load_dataset("audiofolder", data_dir=val_path).get('train')

Resolving data files:   0%|          | 0/580 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-3bb25b14b41b1a03/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/580 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-3bb25b14b41b1a03/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
from datasets import concatenate_datasets

In [15]:
# Combined train + validation set, shuffled with a fixed seed for reproducibility.
# NOTE(review): `trv_dataset` is only inspected for its shape in the visible cells — confirm it is still needed.
trv_dataset = concatenate_datasets([train_dataset, val_dataset]).shuffle(seed = 42)

In [17]:
trv_dataset.shape, train_dataset.shape, val_dataset.shape

((2331, 2), (1752, 2), (579, 2))

In [12]:
#metadata[(metadata.label==28) & (metadata.subset=='validation')].file_name.values

In [13]:
#metadata[(metadata.label==28) & (metadata.subset=='train')].file_name.values

In [31]:
# audio_trn = 'data/train/Gryllusbimaculatus_XC751730-dat007-026.wav' 
# audio_val = 'data/val/Gryllusbimaculatus_XC751729-dat007-025.wav'

In [14]:
#AudioI(audio_trn)  #, rate=44100)

In [15]:
#AudioI(audio_trn)

In [16]:
#AudioI(audio_val)

In [17]:
#plot_spec([audio_val])

In [19]:
def white_noise(signal, noise_factor_min=0.1, noise_factor_max=0.4, noise_factor=None):
    """Add scaled, zero-mean Gaussian noise to ``signal``.

    The noise has the same standard deviation as ``signal`` and contains
    ``signal.shape[0]`` samples. It is multiplied by ``noise_factor`` before
    being added.

    Parameters
    ----------
    signal : numpy.ndarray
        Input samples.
    noise_factor_min, noise_factor_max : float
        Bounds of the uniform distribution the factor is drawn from when
        ``noise_factor`` is None.
    noise_factor : float, optional
        Fixed factor; when given, no factor is sampled.

    Returns
    -------
    tuple
        ``(signal_augmented, noise_factor)`` with the factor actually used.
    """
    # The noise is always drawn before the factor; keeping this order keeps
    # seeded runs reproducible.
    gaussian = np.random.normal(loc=0, scale=signal.std(), size=signal.shape[0])
    factor = (
        np.random.uniform(noise_factor_min, noise_factor_max)
        if noise_factor is None
        else noise_factor
    )
    return signal + gaussian * factor, factor

In [20]:
def time_stretch(signal, stretch_rate_min=1.05, stretch_rate_max=1.2, stretch_rate=None):
    """Time-stretch ``signal`` with ``librosa.effects.time_stretch``.

    Parameters
    ----------
    signal : numpy.ndarray
        Audio samples, passed to librosa as ``y``.
    stretch_rate_min, stretch_rate_max : float
        Bounds of the uniform distribution the rate is drawn from when
        ``stretch_rate`` is None. The range must not straddle 1, so a sampled
        rate always stretches in one known direction.
    stretch_rate : float, optional
        Fixed rate; when given, no rate is sampled.

    Returns
    -------
    tuple
        ``(stretched_signal, stretch_rate)`` with the rate actually used.

    Raises
    ------
    ValueError
        If ``stretch_rate_min`` is negative, or if ``stretch_rate_min < 1``
        while ``stretch_rate_max > 1``.
    """
    # The previous check was written `min < 1 & (max > 1)`. `&` binds tighter
    # than `<`, so it compared `min` against an integer 0/1 and only behaved
    # as intended by accident. Both rejected cases are now spelled out.
    if stretch_rate_min < 0:
        raise ValueError("'stretch_rate_min' must not be negative")
    if stretch_rate_min < 1 and stretch_rate_max > 1:
        raise ValueError("Both 'stretch_rate_min' and 'stretch_rate_max' must be either below 1 or above 1")
    if stretch_rate is None:
        stretch_rate = np.random.uniform(stretch_rate_min, stretch_rate_max)
    return librosa.effects.time_stretch(y=signal, rate=stretch_rate), stretch_rate

In [21]:
def pitch_scale(signal, sr, n_steps_min=1, n_steps_max=2, n_steps=None):
    """Pitch-shift ``signal`` with ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    signal : numpy.ndarray
        Audio samples, passed to librosa as ``y``.
    sr : int
        Sampling rate, passed through to librosa.
    n_steps_min, n_steps_max : float
        Bounds of the uniform distribution the step count is drawn from when
        ``n_steps`` is None.
    n_steps : float, optional
        Fixed step count; when given, nothing is sampled.

    Returns
    -------
    tuple
        ``(shifted_signal, n_steps)`` with the step count actually used.
    """
    steps = np.random.uniform(n_steps_min, n_steps_max) if n_steps is None else n_steps
    shifted = librosa.effects.pitch_shift(y=signal, sr=sr, n_steps=steps)
    return shifted, steps

In [22]:
def polarity_inversion(signal):
    """Return ``signal`` with its sign flipped (multiplied by -1)."""
    inverted = -1 * signal
    return inverted

In [23]:
def random_gain(signal, gain_min=1.5, gain_max=3, gain=None):
    """Scale ``signal`` by a gain factor.

    Parameters
    ----------
    signal : numpy.ndarray
        Input samples.
    gain_min, gain_max : float
        Bounds of the uniform distribution the gain is drawn from when
        ``gain`` is None.
    gain : float, optional
        Fixed gain; when given, nothing is sampled.

    Returns
    -------
    tuple
        ``(scaled_signal, gain)`` with the gain actually used.
    """
    factor = np.random.uniform(gain_min, gain_max) if gain is None else gain
    amplified = signal * factor
    return amplified, factor

In [24]:
def augmentation(signal, sr, strategy='few'):
    """Apply a selection of augmentations to a copy of ``signal``.

    The available augmentations are numbered 1-5 and always applied in
    ascending order:

    1. ``ts`` time stretch, 2. ``ps`` pitch scale, 3. ``rg`` random gain,
    4. ``wn`` white noise, 5. ``pi`` polarity inversion.

    Parameters
    ----------
    signal : numpy.ndarray
        Input samples; the array itself is not modified.
    sr : int
        Sampling rate, only needed by the pitch-scale step.
    strategy : {'all', 'few', 'one'}
        ``'all'`` applies all five augmentations, ``'few'`` a random subset
        of one to four of them, ``'one'`` a single random augmentation.

    Returns
    -------
    tuple
        ``(signal_aug, aug_names, params)``: the augmented signal, the short
        names of the applied augmentations, and the parameter each one used
        (``None`` for polarity inversion, which has no parameter).

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values. Previously this
        surfaced as an ``UnboundLocalError`` further down.
    """
    aug_names = np.array(['ts', 'ps', 'rg', 'wn', 'pi'])

    if strategy == 'all':
        aug_selection = np.array([1, 2, 3, 4, 5])
    elif strategy == 'few':
        # randint's upper bound is exclusive, so between 1 and 4 augmentations are picked.
        aug_selection = np.sort(np.random.choice(5, size=np.random.randint(1, 5), replace=False) + 1)
    elif strategy == 'one':
        aug_selection = np.sort(np.random.choice(5, size=1, replace=False) + 1)
    else:
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'all', 'few' or 'one'")

    signal_aug = signal.copy()
    params = []

    # aug_selection is already sorted, so the steps run in their fixed order.
    for i in aug_selection:
        if i == 1:
            signal_aug, param = time_stretch(signal=signal_aug)
        elif i == 2:
            signal_aug, param = pitch_scale(signal=signal_aug, sr=sr)
        elif i == 3:
            signal_aug, param = random_gain(signal=signal_aug)
        elif i == 4:
            signal_aug, param = white_noise(signal=signal_aug)
        else:
            signal_aug = polarity_inversion(signal=signal_aug)
            param = None
        params.append(param)

    return signal_aug, list(aug_names[aug_selection - 1]), params
            

In [25]:
def generate_augmented_data(train_data, metadata, dir_source, dir_dest, labels_to_augment=None, strategy='all', fraction=0.5, seed=42, data_root='/root/data/'):
    """Create augmented training clips for under-represented labels.

    For every label in the train subset, the number of missing clips is
    ``floor(max_label_count * fraction) - label_count``. The maximum is taken
    over all train labels, before ``labels_to_augment`` is applied. That many
    source clips are sampled with replacement from the label, augmented with
    ``augmentation`` and written as ``<name>_aug<k>.wav`` into
    ``<dir_dest>/train``.

    Besides the audio files the function writes, all under ``dir_dest``:

    * ``metadata.csv``: the input metadata plus one row per augmented clip,
    * ``train/metadata.csv``: ``file_name`` and ``label`` of all train rows,
    * ``labels.json``: copied from ``dir_source``,
    * ``aug_logs.csv``: augmentation types and parameters per new clip,
    * ``aug_summary.csv``: train clips per label after augmentation.

    Parameters
    ----------
    train_data
        Indexable data set whose items expose ``['audio']['path']``,
        ``['audio']['array']`` and ``['audio']['sampling_rate']``, and which
        has a ``shape`` attribute. The sampling rate of the first item is
        used for every written file.
    metadata : pandas.DataFrame
        Must contain the columns ``file_name``, ``unique_file``, ``path``,
        ``species``, ``label``, ``subset``, ``sample_rate``, ``num_frames``
        and ``length``.
    dir_source, dir_dest : str
        Source and destination directories. They are joined with
        ``os.path.join``, so a trailing slash is optional.
    labels_to_augment : iterable, optional
        Restrict augmentation to these labels; None means every label.
    strategy : str
        Passed through to ``augmentation``.
    fraction : float
        Share of the largest label count that each label is topped up to.
    seed : int
        Seed for the global NumPy random generator.
    data_root : str
        Prefix prepended (plain string concatenation) to the metadata
        ``path`` column to obtain the paths stored in ``train_data``.

    Returns
    -------
    None
    """
    np.random.seed(seed)

    # NOTE: copying the source train/val/test folders is currently disabled (the
    # calls are commented out). The stage banners are kept so the printed log is
    # unchanged.
    print('\n--------- copying basic train set ---------')
    t = time.time()
#    shutil.copytree(dir_source + '/train', dir_dest + '/train')
    print('Elapsed time:', round(time.time() - t, 2))

    print('\n--------- copying validation set ---------')
    t = time.time()
#    shutil.copytree(dir_source + '/val', dir_dest + '/val')
    print('Elapsed time:', round(time.time() - t, 2))

    print('\n--------- copying test set ---------')
    t = time.time()
#    shutil.copytree(dir_source + '/test', dir_dest + '/test')
    print('Elapsed time:', round(time.time() - t, 2))

    train_dir = os.path.join(dir_dest, 'train')
    # Writing the augmented clips fails if the folder is missing (the copy above is disabled).
    os.makedirs(train_dir, exist_ok=True)

    sr = train_data[0]['audio']['sampling_rate']

    print('\n--------- data preprocessing stage ---------')
    t = time.time()
    # Map every audio path to the first data set index that holds it, so the
    # augmentation loop does a dict lookup instead of scanning all paths per row.
    index_by_path = {}
    for i in range(train_data.shape[0]):
        index_by_path.setdefault(train_data[i]['audio']['path'], i)

    df_audio = metadata[metadata.subset == 'train']
    df_audio = df_audio.assign(path_full_raw=data_root + df_audio.path)
    # 0-based position of each file within its label; used as sampling index below.
    df_audio = df_audio.assign(audio_rank_id=(df_audio.groupby('label')['file_name'].rank(method='first', na_option='bottom') - 1).astype(int))

    # Clips per label, sorted by label. groupby().size() yields the columns
    # 'label' and 'count' regardless of the pandas version.
    df_audio_agg = df_audio.groupby('label').size().reset_index(name='count')
    count_max = df_audio_agg['count'].max()
    threshold = np.floor(count_max * fraction)
    if labels_to_augment is not None:
        df_audio_agg = df_audio_agg[df_audio_agg['label'].isin(labels_to_augment)]

    # Number of clips each label is short of the threshold (never negative).
    gap = (threshold - df_audio_agg['count']).clip(lower=0).astype(int)
    df_audio_agg = df_audio_agg.assign(gap=gap)
    df_audio_agg = df_audio_agg[df_audio_agg.gap > 0]

    label = []
    rank_id = []
    for l, c, g in zip(df_audio_agg['label'], df_audio_agg['count'], df_audio_agg['gap']):
        # Sample g source clips (with replacement) out of the c clips of this label.
        ind = np.random.choice(c, size=g, replace=True)
        label.extend([l] * len(ind))
        rank_id.extend(ind)
    df_aug = pd.DataFrame(dict(label=label, audio_rank_id=rank_id))
    # Running number per (label, source clip); becomes the `_aug<k>` suffix.
    df_aug = df_aug.assign(rank_id=(df_aug.groupby(['label', 'audio_rank_id'])['audio_rank_id'].rank(method='first', na_option='bottom') - 1).astype(int))

    df_aug_final = pd.merge(df_audio, df_aug, on=['label', 'audio_rank_id'], how='inner')
    df_aug_final = df_aug_final.assign(file_name=df_aug_final.file_name.apply(lambda x: x.split('.wav')[0]) + '_aug' + df_aug_final.rank_id.astype(str) + '.wav')
    df_aug_final = df_aug_final.assign(path='data/train/' + df_aug_final.file_name)
    print('Elapsed time:', round(time.time() - t, 2))

    print('\n--------- augmentation stage ---------')
    t = time.time()
    df_aug_final = df_aug_final.assign(aug_type=None)
    df_aug_final = df_aug_final.assign(aug_params=None)

    # Column positions do not change inside the loop, so resolve them once.
    col_num_frames = df_aug_final.columns.get_loc('num_frames')
    col_length = df_aug_final.columns.get_loc('length')
    col_aug_type = df_aug_final.columns.get_loc('aug_type')
    col_aug_params = df_aug_final.columns.get_loc('aug_params')

    for row, (p, f) in enumerate(zip(df_aug_final.path_full_raw, df_aug_final.file_name)):
        i = index_by_path[p]
        sig_aug, type_aug, params_aug = augmentation(signal=train_data[i]['audio']['array'], sr=sr, strategy=strategy)

        # The augmented clip can differ in length from its source, so refresh both fields.
        df_aug_final.iat[row, col_num_frames] = sig_aug.shape[0]
        df_aug_final.iat[row, col_length] = sig_aug.shape[0] / sr

        df_aug_final.iat[row, col_aug_type] = type_aug
        df_aug_final.iat[row, col_aug_params] = params_aug
        sf.write(os.path.join(train_dir, f), sig_aug, sr)

    metadata_aug = pd.concat([metadata, df_aug_final[['file_name', 'unique_file', 'path', 'species', 'label', 'subset', 'sample_rate', 'num_frames', 'length']]], axis=0)
    metadata_aug.to_csv(os.path.join(dir_dest, 'metadata.csv'), index=False)
    metadata_aug[metadata_aug.subset == 'train'][['file_name', 'label']].to_csv(os.path.join(train_dir, 'metadata.csv'), index=False)
    shutil.copyfile(os.path.join(dir_source, 'labels.json'), os.path.join(dir_dest, 'labels.json'))

    df_aug_final[['file_name', 'aug_type', 'aug_params']].to_csv(os.path.join(dir_dest, 'aug_logs.csv'), index=False)

    print('Elapsed time:', round(time.time() - t, 2))

    aug_summary = metadata_aug[metadata_aug.subset == 'train'].groupby('label').size().reset_index(name='count')
    aug_summary.to_csv(os.path.join(dir_dest, 'aug_summary.csv'), index=False)
    

In [41]:
# Build the augmented data set: only the five listed labels are topped up, each new clip
# gets a single random augmentation (strategy='one'), and fraction=1 sets the target to
# the clip count of the largest train label.
generate_augmented_data(train_data=train_dataset, 
                        metadata=metadata, 
                        dir_source='/root/data/data/', 
                        dir_dest='/root/data/data/augmented_strategy_one_classes_few/', 
                        labels_to_augment=[7,28,49,47,30],
                        strategy='one',
                        fraction=1,
                        seed=42)


--------- copying basic train set ---------
Elapsed time: 0.0

--------- copying validation set ---------
Elapsed time: 0.0

--------- copying test set ---------
Elapsed time: 0.0

--------- data preprocessing stage ---------
Elapsed time: 15.86

--------- augmentation stage ---------
Elapsed time: 555.42


In [26]:
# Upload labels.json of the augmented data set to S3.
# The paths contain no placeholders, so plain string literals are used.
upload_to_s3(
    local_path='/root/data/data/augmented_strategy_one_classes_few/labels.json',
    s3_path='data_augmented_strategy_one_classes_few/labels.json',
    bucket='sagemaker-us-east-1-292159885427',
)

's3://sagemaker-us-east-1-292159885427/data_augmented_strategy_one_classes_few/labels.json'

In [28]:
# Upload every file of the augmented train split to S3.
# os.walk recurses into sub-folders, so the local path must be built from the
# walked `path`. Building it from the top folder points at the wrong location
# for any nested file.
train_dir = '/root/data/data/augmented_strategy_one_classes_few/train'
for path, directories, files in os.walk(train_dir):
    # Presumably 'metadata-checkpoint.csv' comes from a Jupyter checkpoint folder;
    # prune such folders in place so os.walk does not descend into them.
    directories[:] = [d for d in directories if d != '.ipynb_checkpoints']
    for file in files:
        if file == 'metadata-checkpoint.csv':
            continue
        local_file = os.path.join(path, file)
        # Key relative to the split folder; equals the bare file name for top-level files.
        relative_key = os.path.relpath(local_file, train_dir).replace(os.sep, '/')
        upload_to_s3(local_path=local_file,
                     s3_path=f'data_augmented_strategy_one_classes_few/train/{relative_key}',
                     bucket='sagemaker-us-east-1-292159885427')

In [29]:
# Upload every file of the validation split to S3.
# os.walk recurses into sub-folders, so the local path must be built from the
# walked `path`. Building it from the top folder points at the wrong location
# for any nested file.
val_dir = '/root/data/data/augmented_strategy_one_classes_few/val'
for path, directories, files in os.walk(val_dir):
    for file in files:
        local_file = os.path.join(path, file)
        # Key relative to the split folder; equals the bare file name for top-level files.
        relative_key = os.path.relpath(local_file, val_dir).replace(os.sep, '/')
        upload_to_s3(local_path=local_file,
                     s3_path=f'data_augmented_strategy_one_classes_few/val/{relative_key}',
                     bucket='sagemaker-us-east-1-292159885427')

In [30]:
# Upload every file of the test split to S3.
# os.walk recurses into sub-folders, so the local path must be built from the
# walked `path`. Building it from the top folder points at the wrong location
# for any nested file.
test_dir = '/root/data/data/augmented_strategy_one_classes_few/test'
for path, directories, files in os.walk(test_dir):
    for file in files:
        local_file = os.path.join(path, file)
        # Key relative to the split folder; equals the bare file name for top-level files.
        relative_key = os.path.relpath(local_file, test_dir).replace(os.sep, '/')
        upload_to_s3(local_path=local_file,
                     s3_path=f'data_augmented_strategy_one_classes_few/test/{relative_key}',
                     bucket='sagemaker-us-east-1-292159885427')

In [27]:
# Number of entries in the augmented train folder (audio clips plus anything else stored there).
l = len(os.listdir('/root/data/data/augmented_strategy_one_classes_few/train'))
l

2179