In [None]:
# Environment setup: run once per fresh runtime, before the import cell.
# `-y` is required so the cell does not stop at pip's confirmation prompt.
!pip uninstall -y torchtext
# Pinned PyTorch LTS build together with the matching torchaudio release.
!pip install torch==1.8.2+cu102 torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
!pip install pytorch-lightning psutil
# LEAF frontend, installed from a fork of leaf-audio-pytorch.
!git clone https://github.com/SJWyatt/leaf-audio-pytorch.git leaf-audio-pytorch-fork && cd leaf-audio-pytorch-fork && pip install .

# Hyper-parameter search (Ray Tune) and experiment tracking (Weights & Biases).
!pip install "ray[tune]"
!pip install "ray[default]"
!pip install wandb

In [None]:
import sys, os, math, random, functools
from pathlib import Path
import torch
import numpy as np
import pandas as pd

from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from torch.optim import Adam

from pytorch_lightning import Trainer, seed_everything, loggers
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import torchaudio
from torchaudio import transforms

import tensorflow as tf
from leaf_audio_pytorch import frontend, initializers

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.wandb import WandbLogger, wandb_mixin
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback

# import rans
# import util
# from torch_vae.tvae_beta_binomial import BetaBinomialVAE
# from torch_vae import tvae_utils

import wandb
# Start a Weights & Biases run for the "Bosch_Audio" project in this process.
wandb.init(project="Bosch_Audio")
# Name -> object lookup of everything in torch.nn.modules.loss whose name ends in 'Loss'.
losses = {k:v for k, v in vars(torch.nn.modules.loss).items() if k.endswith('Loss')}

In [None]:
# from google.colab import drive
# drive.mount("/content/gdrive")
# !cd /content/gdrive/MyDrive

# !cp /content/gdrive/MyDrive/rans.py .
# !cp /content/gdrive/MyDrive/util.py .
# !cp -avr /content/gdrive/MyDrive/torch_vae .

# # LARGE DATASET
# !gdown --id 1KSr-GF8avGTnPA83AVE3Hjz4TjWRlnpc
# !mkdir UrbanSound8KZIP
# !tar -xvf  'UrbanSound8K.tar.gz' -C '/content/UrbanSound8KZIP'

In [None]:
# LARGE DATASET
# dataset_folder = "/content/UrbanSound8KZIP/UrbanSound8K"

# LOCAL DATASET
# Root of the UrbanSound8K dataset. Export URBANSOUND8K_DIR to point the
# notebook at another copy without editing this cell; when the variable is
# unset the original machine-specific path is used.
dataset_folder = os.environ.get(
    "URBANSOUND8K_DIR",
    "/home/dustedduke/Documents/Bosch_Audio_Project_Clean/data/UrbanSound8K",
)

## Audio transforms

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

class AudioUtil():
    """Static helpers for loading, augmenting and embedding audio clips.

    Unless a method says otherwise, ``aud`` is a ``(signal, sample_rate)`` pair
    as returned by :meth:`open`, where ``signal`` is a 2-D tensor whose first
    dimension is the channel and whose second dimension is time.
    """

    @staticmethod
    def get_available_transforms():
        """Return the names of the callables on AudioUtil usable as transforms.

        Dunder attributes, ``open`` and this method itself are left out.
        """
        method_list = []
        for attribute in dir(AudioUtil):
            attribute_value = getattr(AudioUtil, attribute)
            if callable(attribute_value) and not attribute.startswith('__'):
                method_list.append(attribute)
        method_list.remove('open')
        method_list.remove('get_available_transforms')
        return method_list

    # ----------------------------
    # Load an audio file. Return the signal as a tensor and the sample rate
    # ----------------------------
    @staticmethod
    def open(audio_file):
        """Load ``audio_file`` with torchaudio and return ``(signal, sample_rate)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    # ----------------------------
    # Convert the given audio to the desired number of channels
    # ----------------------------
    @staticmethod
    def rechannel(aud, new_channels):
        """Return ``aud`` converted towards ``new_channels`` channels.

        The input pair is returned untouched when the channel count already
        matches. ``new_channels == 1`` keeps only the first channel; any other
        value concatenates the signal with itself along the channel axis
        (mono becomes stereo).
        """
        sig, sr = aud

        if (sig.shape[0] == new_channels):
            # Nothing to do
            return aud

        if (new_channels == 1):
            # Convert from stereo to mono by selecting only the first channel
            resig = sig[:1, :]
        else:
            # Convert from mono to stereo by duplicating the first channel
            resig = torch.cat([sig, sig])

        return ((resig, sr))

    # ----------------------------
    # Since Resample applies to a single channel, we resample one channel at a time
    # ----------------------------
    @staticmethod
    def resample(aud, newsr):
        """Return ``aud`` resampled to ``newsr`` Hz as ``(signal, newsr)``.

        The input pair is returned untouched when it is already at ``newsr``.
        """
        sig, sr = aud

        if (sr == newsr):
            # Nothing to do
            return aud

        num_channels = sig.shape[0]
        # Resample first channel
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
        if (num_channels > 1):
            # Resample the remaining channel(s) and merge them back
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
            resig = torch.cat([resig, retwo])

        return ((resig, newsr))

    # ----------------------------
    # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
    # ----------------------------
    @staticmethod
    def pad_trunc(aud, max_ms):
        """Force the signal to last exactly ``max_ms`` milliseconds.

        Longer signals are cut at the end. Shorter ones are zero-padded, with
        the padding split at a random point between the start and the end.
        Returns ``(signal, sample_rate)``.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: `sr//1000 * max_ms` drops the sub-kHz part
        # of the rate (44100 Hz would yield 44000 samples per second).
        max_len = int(sr * max_ms // 1000)

        if (sig_len > max_len):
            # Truncate the signal to the given length
            sig = sig[:,:max_len]

        elif (sig_len < max_len):
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sr)

    # ----------------------------
    # Shifts the signal to the left or right by some percent. Values at the end
    # are 'wrapped around' to the start of the transformed signal.
    # ----------------------------
    @staticmethod
    def time_shift(aud, sr, shift_limit):
        """Circularly shift a signal in time by a random amount.

        Unlike the other helpers, ``aud`` here is the bare 2-D signal tensor;
        the sample rate is passed separately as ``sr`` and returned unchanged.
        The shift is drawn uniformly from ``[0, shift_limit * length)`` samples.
        Returns ``(shifted_signal, sr)``.
        """
        sig = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along time only; without `dims` the tensor is flattened first and
        # samples would wrap from one channel into the next.
        return (sig.roll(shift_amt, dims=-1), sr)

    # ----------------------------
    # Generate a Spectrogram
    # ----------------------------
    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of ``aud`` converted to decibels (top_db=80)."""
        sig,sr = aud
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return (spec)

    # ----------------------------
    # Augment the Spectrogram by masking out some sections of it in both the frequency
    # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value.
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks.

        Each mask covers at most ``max_mask_pct`` of its axis and is filled with
        the mean of the unmasked input spectrogram.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec

    # TODO fix
    @staticmethod
    def rans_compress(aud, config):
        """Push the flattened signal onto an rANS state via ``config['vae_append']``.

        Returns ``(compressed_message_tensor, sample_rate)``.

        NOTE(review): this uses a module-level ``rans`` name whose import is
        commented out in the import cell, so it raises NameError until that
        module is made available again.
        """
        image = aud[0].float().view(1, -1)

        # Fixed seed: the initial "other bits" on the rANS stack are reproducible.
        rng = np.random.RandomState(0)
        other_bits = rng.randint(low=1 << 16, high=1 << 31, size=50, dtype=np.uint32)
        state = rans.unflatten(other_bits)
        state = config['vae_append'](state, image)

        compressed_message = rans.flatten(state)

        return (torch.from_numpy(compressed_message), aud[1])

    @staticmethod
    def rans_decompress(aud, config):
        """Pop every symbol described by ``config['starts']``/``config['freqs']``.

        The popped symbols are discarded; the method only asserts that the rANS
        state is fully consumed and then returns ``aud`` as a
        ``(tensor, sample_rate)`` pair.

        NOTE(review): depends on the same unavailable ``rans`` module as
        ``rans_compress``.
        """
        scale_bits = 8
        x = rans.unflatten(aud[0].numpy())
        for start, freq in reversed(list(zip(config['starts'], config['freqs']))):
            def statfun(cf):
                assert start <= cf < start + freq
                return None, (start, freq)
            x, symbol = rans.pop(x, statfun, scale_bits)
        assert x == (rans.head_min, ())

        # aud[0] is already a tensor (``.numpy()`` was called on it above), so
        # wrapping it in torch.from_numpy would raise TypeError.
        return (aud[0], aud[1])

    @staticmethod
    def original_embedding(aud, config):
        """Return an augmented Mel spectrogram of ``aud``.

        Pipeline: resample to 44.1 kHz, force 2 channels, pad/truncate to 4 s,
        random time shift of up to 40 %, Mel spectrogram, then two frequency
        and two time masks. ``config`` is accepted for signature parity with
        the other embeddings and is not read.
        """
        reaud = AudioUtil.resample(aud, 44100)
        rechan = AudioUtil.rechannel(reaud, 2)
        dur_sig, dur_sr = AudioUtil.pad_trunc(rechan, 4000)
        # time_shift takes the signal and the sample rate as separate arguments.
        shift_aud = AudioUtil.time_shift(dur_sig, dur_sr, 0.4)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        return (aug_sgram)

    @staticmethod
    def leaf_embedding(aud, config):
        """Embed ``aud`` with a LEAF frontend configured from ``config``.

        Reads ``new_sample_rate``, ``new_channels``, ``new_length``,
        ``min_freq``, ``max_freq``, ``learn_pooling``, ``n_filters``,
        ``window_len`` and ``preemp`` from ``config``. A new frontend is
        instantiated on every call and its output is detached, so nothing in
        it is trained. Returns the frontend output permuted with ``(2, 1, 0)``
        and flattened from dimension 1 onwards.
        """
        # Prepare
        reaud = AudioUtil.resample(aud, config['new_sample_rate'])
        rechan = AudioUtil.rechannel(reaud, config['new_channels'])
        padded = AudioUtil.pad_trunc(rechan, config['new_length'])

        complex_conv_init = initializers.GaborInit(sample_rate=padded[1], min_freq=config['min_freq'],
                                                   max_freq=config['max_freq'])

        # LEAF shape [channels, time, n_filters] (original author's note; TODO confirm)
        custom_leaf = frontend.Leaf(learn_pooling=config['learn_pooling'],
                                    n_filters=config['n_filters'],
                                    window_len=config['window_len'],
                                    sample_rate=padded[1],
                                    preemp=config['preemp'],
                                    complex_conv_init=complex_conv_init)

        leaf_repr_torch = custom_leaf(padded[0])
        leaf_repr_torch_perm = leaf_repr_torch.permute(2,1,0)
        leaf_repr_torch_flat = leaf_repr_torch_perm.flatten(start_dim=1).detach()
        return leaf_repr_torch_flat

    @staticmethod
    def leaf_bits_back(aud, config):
        """Placeholder for a LEAF + bits-back embedding (postponed).

        Raises:
            NotImplementedError: always.
        """
        # Postponed
        raise NotImplementedError("leaf_bits_back is postponed and not implemented yet")

  

## Dataset

In [None]:
def one_hot(idx, num_items):
    """Return a list of ``num_items`` floats that is 1.0 at ``idx`` and 0.0 elsewhere."""
    encoding = []
    for position in range(num_items):
        encoding.append(1.0 if position == idx else 0.0)
    return encoding

class UrbanDataset(Dataset):
    """UrbanSound8K clips with one-hot labels, driven by the metadata CSV.

    Args:
        config: Settings dict handed to ``transform`` for every item.
        dataset_folder: Dataset root containing ``metadata/UrbanSound8K.csv``
            and the ``audio/fold*`` directories.
        fold: Fold number used as the validation fold by
            :meth:`train_validation_split`.
        transform: Optional callable ``transform(aud, config)`` applied to the
            ``torchaudio.load`` result. When ``None`` the raw result is returned.
        augment: Stored on the instance; not applied by ``__getitem__``.
    """

    def __init__(self, config, dataset_folder, fold, transform=None, augment=None):
        super().__init__()
        self.dataset_folder = dataset_folder
        self.path_to_csv = os.path.join(self.dataset_folder, "metadata/UrbanSound8K.csv")
        self.path_to_audio_folder = os.path.join(self.dataset_folder, "audio")
        self.metadata = pd.read_csv(self.path_to_csv) #[:2000]
        self.fold = fold
        self.transform = transform
        self.augment = augment
        self.config = config

    def train_validation_split(self):
        """Split by fold: rows of ``self.fold`` validate, all other rows train.

        Returns:
            ``(train_set, val_set)`` as ``Subset`` views over this dataset.
        """
        # Use row positions rather than index labels: __getitem__ looks rows up
        # with .iloc, so the two must agree even if the frame's index is not 0..n-1.
        is_val = (self.metadata["fold"] == self.fold).to_numpy()
        train_idx = np.flatnonzero(~is_val).tolist()
        val_idx = np.flatnonzero(is_val).tolist()

        train_set = Subset(self, train_idx)
        val_set = Subset(self, val_idx)
        return train_set, val_set

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Load the clip at row position ``index``.

        Returns:
            dict with ``"input_vector"`` (the transformed clip, or the raw
            ``torchaudio.load`` result when no transform is set) and
            ``"label"`` (one-hot tensor over 10 classes).
        """
        file_name = self.metadata["slice_file_name"].iloc[index]
        fold_dir = "fold" + str(self.metadata["fold"].iloc[index])
        file_path = os.path.join(self.path_to_audio_folder, fold_dir, file_name)

        # Both training and validation are transformed
        aud = torchaudio.load(file_path)
        if self.transform is not None:
            # `transform` defaults to None; calling it unconditionally would crash.
            aud = self.transform(aud, self.config)
        label = torch.from_numpy(np.array(one_hot(self.metadata["classID"].iloc[index], 10)))

        return {
            "input_vector": aud,
            "label": label,
        }

## Metrics

In [None]:
from sklearn.metrics import f1_score, accuracy_score, auc, precision_recall_curve

def auprc(y_true, y_scores):
    """Area under the precision-recall curve for a single class.

    Args:
        y_true (np.array): binary ground-truth column for the class
        y_scores (np.array): model scores for the same class
    Return:
        auc (float): area under the recall/precision curve
    """
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    # Close the curve with the (recall=1, precision=0) and (recall=0, precision=1) corners.
    recall_closed = np.concatenate(([1.0], recall, [0.0]))
    precision_closed = np.concatenate(([0.0], precision, [1.0]))
    return auc(recall_closed, precision_closed)

def compute_macro_auprc(y_true, y_scores, return_auprc_per_class=False):
    """Macro-averaged AUPRC over the columns of a one-hot label matrix.

    Args:
        y_true (np.array): one hot encoded labels, one column per class
        y_scores (np.array): model prediction, same layout as ``y_true``
        return_auprc_per_class (bool): also return the per-class scores
    Return:
        auprc_macro (float): NaN-ignoring mean of the per-class AUPRC values,
        preceded by the list of per-class values when
        ``return_auprc_per_class`` is true
    """
    n_samples, n_classes = y_true.shape
    per_class = []
    for class_idx in range(n_classes):
        per_class.append(auprc(y_true[:, class_idx], y_scores[:, class_idx]))
    macro = np.nanmean(np.array(per_class))
    if not return_auprc_per_class:
        return macro
    return per_class, macro

## Model

In [None]:
class VanillaClassifier(LightningModule):
    """LSTM classifier trained on one train/validation fold split.

    The module builds its own ``UrbanDataset`` in ``prepare_data`` and
    validates on the fold given by ``config['FOLD_INDEX']``. The LSTM's final
    hidden/cell state is carried over (detached) from one forward call to the
    next.

    NOTE(review): carrying state across batches mixes unrelated clips and
    train/validation batches; kept because it is the existing behaviour.
    Confirm it is intended.
    """

    # Display names passed to the W&B confusion-matrix, ROC and PR plots.
    CLASS_NAMES = ["Air Conditioner", "Car Horn", "Children Playing", "Dog Bark",
                   "Drilling", "Engine Idling", "Gun Shot", "Jackhammer", "Siren", "Street Music"]

    def __init__(self, config):
        """Read hyper-parameters from ``config`` and build the LSTM + MLP head."""
        super(VanillaClassifier, self).__init__()
        self.config = config
        self.fold = config['FOLD_INDEX']

        self.dataset_folder = config['dataset_folder']
        self.dropout = None
        self.output_size = config['output_size']
        self.embedding_dim = config['new_channels'] * config['n_filters']
        self.hidden_dim = config['hidden_dim']
        self.n_layers = config['n_layers']
        self.drop_prob = config['drop_prob']
        self.loss_type = config['loss_type']
        self.transform = config['transform']
        self.augment = config['augment']
        self.batch_size = config['batch_size']
        self.shuffle = config['shuffle']
        self.learning_rate = config['learning_rate']

        # Randomly initialised recurrent state used for the very first batch.
        self.previous_hidden = torch.randn(self.n_layers, self.batch_size, self.hidden_dim)
        self.previous_cell = torch.randn(self.n_layers, self.batch_size, self.hidden_dim)

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            batch_first=True,
            dropout=self.drop_prob if config['n_layers'] > 1 else 0,
            bidirectional=False)
        # nn.LSTM only gets a dropout rate when it has more than one layer, so
        # a single-layer model gets an explicit Dropout on the LSTM output.
        if (config['n_layers'] == 1):
            self.dropout = nn.Dropout(config['drop_prob'])
        self.fc1 = nn.Linear(self.hidden_dim, 32)
        self.fc2 = nn.Linear(32, self.output_size)

    def forward(self, x):
        """Return class logits for a batch-first input ``x``.

        The last time step of the LSTM output is passed through the optional
        dropout and the two linear layers.
        """
        x = x.float()

        # Reuse the carried state only when it fits this batch; otherwise let
        # the LSTM start from its default zero state instead of failing.
        state = None
        if self.previous_hidden.size(1) == x.size(0):
            state = (self.previous_hidden.to(x.device), self.previous_cell.to(x.device))

        # nn.LSTM returns (output, (h_n, c_n)): hidden state first, cell second.
        x, (h_n, c_n) = self.lstm(x, state)
        self.previous_hidden = h_n.detach()
        self.previous_cell = c_n.detach()

        x = x[:, -1]
        # The explicit dropout exists only for single-layer models (see __init__).
        if self.dropout is not None:
            x = self.dropout(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

    def prepare_data(self):
        """Build the dataset and split it into train/validation subsets by fold."""
        data_param = {
            "config": self.config,
            "dataset_folder": self.dataset_folder,
            "fold": self.config['FOLD_INDEX'],
            "transform": self.transform,
            "augment": self.augment
        }

        self.dataset = UrbanDataset(**data_param)
        (self.train_dataset, self.val_dataset) = self.dataset.train_validation_split()

    def train_dataloader(self):
        """Loader over the training subset; incomplete last batches are dropped."""
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=self.shuffle, num_workers=2, drop_last=True
        )

    def val_dataloader(self):
        """Loader over the validation subset; incomplete last batches are dropped."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=2, drop_last=True)

    def test_dataloader(self):
        """Loader over the validation subset (there is no separate test split)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=2, drop_last=True)

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def torch_accuracy(self, logits, labels):
        """Fraction of rows whose arg-max logit matches the arg-max label, as a tensor."""
        predicted = torch.argmax(logits, 1)
        target = torch.argmax(labels, 1)

        correct = (predicted == target).sum().item()
        accuracy = correct / len(target)
        return torch.tensor(accuracy)

    def training_step(self, batch, batch_idx):
        """Compute cross-entropy loss and accuracy for one batch and log both."""
        data, target = (
            batch["input_vector"].float(),
            batch["label"].float()
        )

        output = self.forward(data)
        criterion = nn.CrossEntropyLoss()
        # Labels arrive one-hot; CrossEntropyLoss wants class indices.
        loss = criterion(output, torch.argmax(target, dim=1).long())
        accuracy_sc = self.torch_accuracy(output, target)

        wandb.log({"loss": loss})
        wandb.log({"2_train/1_accuracy0.5": accuracy_sc})
        self.log("loss", loss)
        self.log("2_train/1_accuracy0.5", accuracy_sc)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute loss/accuracy for one validation batch and keep its outputs.

        The detached logits and targets are returned so that
        ``validation_epoch_end`` can build epoch-level plots.
        """
        data, target = (
            batch["input_vector"].float(),
            batch["label"].float()
        )

        output = self.forward(data)
        criterion = nn.CrossEntropyLoss()

        val_loss = criterion(output, torch.argmax(target, dim=1).long())
        val_accuracy = self.torch_accuracy(output, target)

        self.log("val_loss", val_loss)
        self.log("val_accuracy", val_accuracy)

        wandb.log({"val_loss": val_loss.mean(0)})
        wandb.log({"val_accuracy": val_accuracy})
        wandb.log({"FOLD_INDEX": self.config['FOLD_INDEX']})

        return {
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "output": output.detach(),
            "target": target.detach(),
        }

    def validation_epoch_end(self, outputs):
        """Average the step metrics and log epoch-level plots to W&B."""
        all_outputs = torch.cat([o["output"] for o in outputs], 0).cpu().numpy()
        all_targets = torch.cat([o["target"] for o in outputs], 0).cpu().numpy()

        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()

        self.log("1_loss/val_loss", avg_loss)
        self.log("2_valid/1_accuracy0.5", avg_acc)
        wandb.log({"1_loss/val_loss": avg_loss.item()})
        wandb.log({"2_valid/1_accuracy0.5": avg_acc.item()})

        # wandb.log({"2_valid/1_f1_micro0.5": f1_micro})
        # wandb.log({"2_valid/1_auprc_micro": auprc_micro})
        # wandb.log({"2_valid/1_auprc_macro": auprc_macro})

        predicted_values = np.argmax(all_outputs, axis=-1)
        true_values = np.argmax(all_targets, axis=-1)

        wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                preds=predicted_values, y_true=true_values,
                                class_names=self.CLASS_NAMES)})

        # The ROC and PR curves are fed the raw logits as scores.
        wandb.log({"roc_curve" : wandb.plot.roc_curve(true_values,
                    all_outputs, labels=self.CLASS_NAMES)})

        wandb.log({"pr" : wandb.plot.pr_curve(true_values, all_outputs,
                 labels=self.CLASS_NAMES, classes_to_plot=None)})


## Training

In [None]:
@wandb_mixin
def training(config, checkpoint_dir=None):
    """Ray Tune trainable: fit one ``VanillaClassifier`` for the sampled config.

    Seeds everything from ``config['seed']``, trains for ``config['max_epochs']``
    epochs on CPU while reporting metrics to Tune, then saves the state dict
    and the pickled model into the current working directory.

    Args:
        config: Trial configuration resolved by Ray Tune.
        checkpoint_dir: Accepted for Tune's function API; not used.

    Returns:
        ``config["a"]`` plus Gaussian noise.
        NOTE(review): this looks like a leftover placeholder. Confirm it is
        still wanted.

    NOTE(review): the sweep is launched with ``metric="loss"`` but the
    validation-end report below carries only validation metrics; confirm the
    installed Ray version accepts results without ``loss``.
    """
    seed_everything(config['seed'])
    model = VanillaClassifier(config)

    trainer = Trainer(
        profiler="simple",
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        limit_val_batches=config['percent_valid_examples'],
        progress_bar_refresh_rate=0,
        checkpoint_callback=False,
        max_epochs=config['max_epochs'],
        gpus=None,
        log_every_n_steps=10,
        callbacks=[
            # Training metrics, reported after every batch.
            TuneReportCallback(
                {
                    "loss": "loss",
                    "2_train/1_accuracy0.5": "2_train/1_accuracy0.5",
                },
                on="batch_end"),
            # Validation metrics, reported after every validation run. Each key
            # needs its own "name": "metric" pair; two adjacent string literals
            # would be concatenated into one non-existent metric name.
            TuneReportCallback(
                {
                    "1_loss/val_loss": "1_loss/val_loss",
                    "2_valid/1_accuracy0.5": "2_valid/1_accuracy0.5",
                    # "2_valid/1_f1_micro0.5",
                    # "2_valid/1_auprc_micro",
                    # "2_valid/1_auprc_macro"
                },
                on="validation_end"),
        ],
    )

    trainer.fit(model)

    torch.save({
            'FOLD_INDEX': config['FOLD_INDEX'],
            'model_state_dict': model.state_dict(),
            }, "./model_state_dict")

    torch.save(model, "./model + " + str(config['FOLD_INDEX']) + ".pth")

    return np.random.randn() * 0.1 + config["a"]

# Configure and Start

## 1. Choose list of transforms and augmentations

In [None]:
# Print one "- name" bullet per method name returned by AudioUtil.get_available_transforms().
print('AVAILABLE TRANSFORMS AND AUGMENTATIONS:', *AudioUtil.get_available_transforms(), sep='\n- ')

In [None]:
# VAE configuration
# prior_precision = 8
# obs_precision = 14
# q_precision = 14

# latent_dim = 50
# latent_shape = (1, latent_dim)
# model = BetaBinomialVAE(hidden_dim=200, latent_dim=latent_dim)
# model.load_state_dict(
#     torch.load('/content/gdrive/MyDrive/torch_vae/saved_params/torch_vae_beta_binomial_params',
#               map_location=lambda storage, location: storage))
# model.eval()

# rec_net = tvae_utils.torch_fun_to_numpy_fun(model.encode)
# gen_net = tvae_utils.torch_fun_to_numpy_fun(model.decode)

# obs_append = tvae_utils.beta_binomial_obs_append(255, obs_precision)
# obs_pop = tvae_utils.beta_binomial_obs_pop(255, obs_precision)

# vae_append = util.vae_append(latent_shape, gen_net, rec_net, obs_append,
#                       prior_precision, q_precision)

# Number of LEAF filters; also stored under 'embedding_dim'.
input_size = tune.choice([8])

# Search space and fixed settings for the sweep. Every fold from 1 to 10 is run
# as its own grid-search trial.
experiment_config = {
  'FOLD_INDEX': tune.grid_search(list(range(1,11))),
  "a": tune.uniform(0, 1),
  "transform": AudioUtil.leaf_embedding, # AudioUtil.leaf_bits_back,
  "vae_append": None, # vae_append,
  "augment": None,
  "new_sample_rate": 44100, # tune.grid_search([44100, 48000]),
  "new_channels": 1, # tune.grid_search([1, 2]),
  "new_length": 4000,
  "min_freq": 60,
  "max_freq": 7800,
  "n_filters": input_size,
  "window_len": 8, # tune.grid_search([32, 64]),
  "preemp": False,
  "learn_pooling": False,
  "percent_valid_examples": 1.0,
  'dataset_folder': dataset_folder,
  'seed': 42,
  'folds': 10,
  'max_epochs': 10,
  'embedding_dim': input_size,
  'hidden_dim': 8, # tune.choice([32, 64]),
  'output_size': 10,
  'n_layers': 1,
  'drop_prob': 0,
  'batch_size': 32, # tune.choice([16, 32]),
  'learning_rate': 1e-3, # tune.choice([1e-7, 1e-3]), #1e-5,
  'loss_type': tune.choice([nn.CrossEntropyLoss]),
  'shuffle': False,
  'wandb': {
    "project": "Bosch_Audio",
    # Never commit the key itself: it is read from the WANDB_API_KEY environment
    # variable (None when unset). Any key that was ever stored in this notebook
    # should be revoked and replaced.
    "api_key": os.environ.get("WANDB_API_KEY")
  }
}

In [None]:
# Shut down any Ray runtime left from an earlier run of this cell, then start
# Ray with 4 CPUs and no GPUs.
ray.shutdown()
ray.init(num_cpus=4, num_gpus=0)

def tune_asha():
    """Run the sweep described by ``experiment_config`` under an ASHA scheduler.

    Trials minimise the reported ``loss``, use 4 CPUs each and write their
    results under ``./results/tune_asha``. When the sweep ends, the best
    configuration and the per-``config/a`` mean of the results table are
    printed.
    """
    asha = ASHAScheduler(max_t=10, grace_period=1, reduction_factor=2)

    variant_generator = tune.suggest.basic_variant.BasicVariantGenerator(
        constant_grid_search=True
    )

    cli_reporter = CLIReporter(
        parameter_columns=['FOLD_INDEX', 'embedding_dim', 'hidden_dim', 'batch_size'],
        metric_columns=["1_loss/val_loss", "loss", "2_train/1_accuracy0.5"])

    # Options gathered in one place and handed to tune.run unchanged.
    run_options = dict(
        resources_per_trial={"cpu": 4, "gpu": 0},
        metric="loss",
        mode="min",
        config=experiment_config,
        num_samples=1,
        scheduler=asha,
        search_alg=variant_generator,
        progress_reporter=cli_reporter,
        callbacks=[],
        local_dir="./results",
        stop={"training_iteration": 10, "loss": 0.001},
        name="tune_asha",
    )
    analysis = tune.run(tune.with_parameters(training), **run_options)

    print("Best hyperparameters found were: ", analysis.best_config)

    results_df = analysis.dataframe()
    print(results_df.groupby(["config/a"]).mean())

In [None]:
# Launch the sweep; Ray must already have been initialised.
tune_asha()

In [None]:
# Stop the Ray runtime once the sweep has finished.
ray.shutdown()