In [1]:
import os,sys
import numpy as np
import torch,torchvision
from torch import nn
from scipy.signal import spectrogram
from scipy.io import wavfile
from sklearn.model_selection import train_test_split

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train_val_test_split(x,y,train_size=0.7):
    """Split ``x``/``y`` into stratified train, validation and test subsets.

    The data is first split into a training part (``train_size`` of the
    samples) and a remainder; the remainder is then split in half into the
    validation and test parts. Both splits are stratified on the labels.

    Args:
        x: Samples, in any container accepted by ``train_test_split``.
        y: Labels aligned with ``x``; also used as the stratification key.
        train_size: Fraction of the samples assigned to the training subset.

    Returns:
        Tuple ``(x_trn, x_val, x_tst, y_trn, y_val, y_tst)``.
    """
    # Pass train_size directly rather than test_size=1-train_size, which
    # introduces floating-point residue (e.g. 1-0.7 == 0.30000000000000004).
    x_trn,x_vts,y_trn,y_vts=train_test_split(x,y,train_size=train_size,stratify=y)
    x_val,x_tst,y_val,y_tst=train_test_split(x_vts,y_vts,test_size=0.5,stratify=y_vts)
    # The original computed the subsets but never returned them.
    return x_trn,x_val,x_tst,y_trn,y_val,y_tst

# Location of the ESC-50 dataset folder, relative to the notebook's working directory.
data_folder='../data/raw/Kaggle_Environmental_Sound_Classification_50' 
# Put the dataset folder on the import path so the `utils` module
# (presumably shipped inside that folder -- verify) can be imported below.
sys.path.insert(0, os.path.abspath(data_folder))
from utils import ESC50

# --- data configuration ---
train_splits = [1, 2, 3, 4]  # folds handed to the training loader
test_split = 5               # fold handed to the test loader
batch_size = 16
window_size_secs = 2         # length of each audio window, in seconds

# --- GAN training configuration ---
epochs = 200
#sample_size = 64 # fixed sample size
nz = 500  # latent vector size
k = 1     # number of steps to apply to the discriminator

# Keyword arguments passed to both the training and the test ESC50 loaders.
shared_params = dict(
    csv_path=f'{data_folder}/esc50.csv',
    wav_dir=f'{data_folder}/audio/audio',
    dest_dir=f'{data_folder}/audio/audio/16000',
    audio_rate=16000,
    only_ESC10=True,
    pad=0,
    normalize=True,
)

# Training batches: folds listed in `train_splits`, with randomization,
# strong augmentation, random cropping and mixing all switched on.
train_gen = ESC50(folds=train_splits,
                  randomize=True,
                  strongAugment=True,
                  random_crop=True,
                  inputLength=window_size_secs,
                  mix=True,
                  **shared_params).batch_gen(batch_size)

# Test batches: the single held-out fold, without randomization, augmentation or mixing.
# NOTE(review): random_crop is still True here -- confirm that random crops are
# intended for the evaluation data.
test_gen = ESC50(folds=[test_split],
                 randomize=False,
                 strongAugment=False,
                 random_crop=True,
                 inputLength=window_size_secs,
                 mix=False,
                 **shared_params).batch_gen(batch_size)

# Pull one batch from each generator to inspect the shapes.
# Note that this consumes the first batch of both generators.
X, Y = next(train_gen)
X2, Y2 = next(test_gen)
X.shape, Y.shape, X2.shape, Y2.shape

((16, 32000, 1), (16, 10), (16, 32000, 1), (16, 10))

In [2]:
# Samples per audio window: window length in seconds times the sample rate.
data_shape = window_size_secs * shared_params['audio_rate']
data_shape

32000

In [3]:
# Generator: latent vector of size nz -> one hidden layer -> data_shape outputs.
# The final Tanh bounds every output sample to [-1, 1].
Generator = nn.Sequential(
    nn.Linear(nz, data_shape // 4),
    nn.LeakyReLU(0.2),
    nn.Linear(data_shape // 4, data_shape),
    nn.Tanh(),
)

# Discriminator: data_shape inputs -> two hidden layers with dropout -> one
# probability. The final Sigmoid is what lets BCELoss be applied to the output.
Discriminator = nn.Sequential(
    nn.Linear(data_shape, data_shape // 8),
    nn.LeakyReLU(0.2),
    nn.Dropout(0.3),
    nn.Linear(data_shape // 8, data_shape // 128),
    nn.LeakyReLU(0.2),
    nn.Dropout(0.3),
    nn.Linear(data_shape // 128, 1),
    nn.Sigmoid(),
)

# Module.to() returns the module itself, so the lowercase names refer to the
# same objects as the capitalised ones, now placed on `device`.
generator = Generator.to(device)
discriminator = Discriminator.to(device)

# One Adam optimizer per network, both with the same learning rate.
optim_g = torch.optim.Adam(generator.parameters(), lr=2e-4)
optim_d = torch.optim.Adam(discriminator.parameters(), lr=2e-4)

# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()

In [4]:
def label_real(size):
    """Create the target labels for real samples.

    Args:
        size: Number of labels (the batch size).

    Returns:
        A ``(size, 1)`` tensor of ones located on ``device``.
    """
    # Allocate directly on the target device rather than building the tensor
    # on the CPU and copying it over.
    return torch.ones(size, 1, device=device)
def label_fake(size):
    """Create the target labels for generated (fake) samples.

    Args:
        size: Number of labels (the batch size).

    Returns:
        A ``(size, 1)`` tensor of zeros located on ``device``.
    """
    # Allocate directly on the target device rather than building the tensor
    # on the CPU and copying it over.
    return torch.zeros(size, 1, device=device)
def create_noise(sample_size, nz):
    """Draw a batch of latent vectors for the generator.

    Args:
        sample_size: Number of latent vectors to draw.
        nz: Length of each latent vector.

    Returns:
        A ``(sample_size, nz)`` tensor of standard-normal values moved to ``device``.
    """
    latent = torch.randn(sample_size, nz)
    return latent.to(device)

def train_discriminator(optimizer, data_real, data_fake):
    """Run one optimisation step of the discriminator network.

    The discriminator is scored on a real batch (target 1) and on a fake batch
    (target 0); the two losses are summed and back-propagated together before
    a single optimizer step.

    Args:
        optimizer: Optimizer holding the discriminator's parameters.
        data_real: Batch of real samples.
        data_fake: Batch of generated samples. The caller is expected to have
            detached it from the generator's graph.

    Returns:
        The summed real + fake loss as a 0-dim tensor detached from the
        autograd graph, so callers can accumulate it safely.
    """
    # Size each label tensor from its own batch so the two batches may differ in length.
    real_label = label_real(data_real.size(0))
    fake_label = label_fake(data_fake.size(0))
    optimizer.zero_grad()
    loss_real = criterion(discriminator(data_real), real_label)
    loss_fake = criterion(discriminator(data_fake), fake_label)
    total = loss_real + loss_fake
    # One backward pass over the sum accumulates the gradients of both terms.
    total.backward()
    optimizer.step()
    # Detach so that summing losses over an epoch does not keep graph nodes alive.
    return total.detach()

def train_generator(optimizer, data_fake):
    """Run one optimisation step of the generator network.

    The generated batch is scored by the discriminator against *real* labels,
    so the loss is low when the discriminator is fooled.

    Args:
        optimizer: Optimizer holding the generator's parameters.
        data_fake: Batch of generated samples, still attached to the
            generator's graph so gradients can flow back into it.

    Returns:
        The generator loss as a 0-dim tensor detached from the autograd graph,
        so callers can accumulate it safely.
    """
    real_label = label_real(data_fake.size(0))
    optimizer.zero_grad()
    loss = criterion(discriminator(data_fake), real_label)
    loss.backward()
    optimizer.step()
    # Detach so that summing losses over an epoch does not keep graph nodes alive.
    return loss.detach()

# Per-epoch loss histories, filled in by the training loop.
losses_g, losses_d = [], []
# Fixed latent batch, reused to generate a comparable sample at the end of every epoch.
noise = create_noise(batch_size, nz)
# Training mode keeps the discriminator's Dropout layers active.
generator.train()
discriminator.train()
# .train() returns the module; a trailing None stops the cell from echoing its repr.
None

In [5]:
# Echo the generator's layer structure as the cell output.
generator

Sequential(
  (0): Linear(in_features=500, out_features=8000, bias=True)
  (1): LeakyReLU(negative_slope=0.2)
  (2): Linear(in_features=8000, out_features=32000, bias=True)
  (3): Tanh()
)

In [6]:
# Echo the discriminator's layer structure as the cell output.
discriminator

Sequential(
  (0): Linear(in_features=32000, out_features=4000, bias=True)
  (1): LeakyReLU(negative_slope=0.2)
  (2): Dropout(p=0.3, inplace=False)
  (3): Linear(in_features=4000, out_features=250, bias=True)
  (4): LeakyReLU(negative_slope=0.2)
  (5): Dropout(p=0.3, inplace=False)
  (6): Linear(in_features=250, out_features=1, bias=True)
  (7): Sigmoid()
)

In [None]:
# Training Loop
# NOTE(review): `train_gen` is the generator object returned by ESC50.batch_gen(), and one
# batch was already pulled from it by the earlier shape check. Whether it stops after one
# pass over the data cannot be determined from this notebook: if it yields forever the
# inner loop below never finishes an epoch, and if it is finite it will be empty from the
# second epoch on. Confirm, and if needed rebuild the generator per epoch or cap the steps.
for epoch in range(epochs):
    loss_g = 0.0
    loss_d = 0.0
    n_batches = 0
    for samps, _ in train_gen:
        # Convert the batch to float32 on the training device and drop the trailing
        # axis, so each row is a flat vector the Linear layers accept.
        data_real = torch.as_tensor(samps, dtype=torch.float32, device=device)
        data_real = data_real.reshape(data_real.shape[:-1])
        b_size = data_real.size(0)
        # run the discriminator for k number of steps
        for step in range(k):
            # detach: the discriminator update must not back-propagate into the generator
            data_fake = generator(create_noise(b_size, nz)).detach()
            # train the discriminator network; float() stores a plain number so no
            # autograd history is kept alive while accumulating over the epoch
            loss_d += float(train_discriminator(optim_d, data_real, data_fake))
        data_fake = generator(create_noise(b_size, nz))
        # train the generator network
        loss_g += float(train_generator(optim_g, data_fake))
        n_batches += 1
    if n_batches == 0:
        # Fail loudly instead of dividing by zero or logging meaningless averages.
        raise RuntimeError("train_gen yielded no batches; cannot compute epoch losses")
    # create the final fake signal for the epoch from the fixed noise batch;
    # no_grad avoids building an autograd graph for a tensor that is only inspected.
    # The previous make_grid() call was dropped: that name was never imported
    # (NameError), and it is an image-grid utility while the generator outputs
    # flat vectors of length data_shape.
    with torch.no_grad():
        generated_s = generator(noise).cpu()
    # Average over the number of batches actually processed (the last enumerate
    # index used before was one too small).
    epoch_loss_g = loss_g / n_batches # mean generator loss for the epoch
    epoch_loss_d = loss_d / n_batches # mean discriminator loss for the epoch
    losses_g.append(epoch_loss_g)
    losses_d.append(epoch_loss_d)

    print(f"Epoch {epoch} of {epochs}")
    print(f"Generator loss: {epoch_loss_g:.8f}, Discriminator loss: {epoch_loss_d:.8f}")