In [1]:
import soundcard as sc
import soundfile as sf
import cv2
import numpy as np
import time
import keyboard

import torch
import torchaudio

import torch.optim as optim
import torch.nn as nn
from torchvision import utils, datasets
import torchvision.transforms as T
import torch.nn.functional as F

from utils.audio_utils import *
from utils.vaes import *    
from utils.data_utils import calculate_mean_std, split_train_val_test

import matplotlib.pyplot as plt

import os

from datetime import datetime

import math

import random

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [9]:
# Manual smoke test: stream the machine's audio output and watch what the
# stream object visualizes.
# NOTE(review): RealTimeAudioStream is presumably provided by the star-import
# of utils.audio_utils, and stream() presumably runs until the stream is
# stopped -- confirm against that module.

audio_stream = RealTimeAudioStream()
audio_stream.stream()

RealTimeAudioStream initialized with 44032 sample rate


In [10]:
def random_rotation_matrix(dim):
    """Sample a random proper rotation matrix of shape ``(dim, dim)``.

    The matrix is the ``Q`` factor of the QR decomposition of a Gaussian
    matrix. ``Q`` is already orthogonal, so no further decomposition is
    needed; two cheap corrections are applied instead:

    * each column of ``Q`` is multiplied by the sign of the matching diagonal
      entry of ``R``, which removes the sign ambiguity of the factorisation;
    * if the determinant is negative (a reflection), one column is negated so
      the result is always a rotation with determinant ``+1``.

    Args:
        dim: Side length of the square matrix.

    Returns:
        A ``(dim, dim)`` tensor ``R`` with ``R @ R.T == I`` and
        ``det(R) == +1`` up to floating point error. It does not require grad.
    """
    gaussian = torch.randn(dim, dim)
    q, r = torch.linalg.qr(gaussian)

    # Broadcasting over the last axis scales column j of q by signs[j].
    diag = torch.diagonal(r)
    signs = torch.where(diag < 0, -torch.ones_like(diag), torch.ones_like(diag))
    q = q * signs

    # An orthogonal matrix has det +/-1; flip one axis to turn a reflection
    # into a proper rotation.
    if torch.det(q) < 0:
        mirror = torch.ones(dim)
        mirror[0] = -1.0
        q = q * mirror

    return q

def interpolate_transforms(matrix1, matrix2, steps):
    """Linearly blend from ``matrix1`` to ``matrix2`` in ``steps`` increments.

    Returns a list of ``steps + 1`` tensors: element ``i`` is
    ``lerp(matrix1, matrix2, i / steps)``, so the first entry equals
    ``matrix1`` and the last equals ``matrix2``.
    """
    return [
        torch.lerp(matrix1, matrix2, idx / steps).requires_grad_(False)
        for idx in range(steps + 1)
    ]

In [21]:
# Real-time demo: turn live audio FFT features into VAE latents, rotate them
# with a slowly drifting random matrix, decode, and show the image with OpenCV.
z_dim = 4
# Per-dimension statistics used to standardise the FFT features below.
mean =  [452.15845655661803, 70.35665321114695, 42.851305103906, 22.160291749979983]
std1 =  [510.373306840422, 135.12631741685448, 102.5477751543778, 63.582527648859525]

# Fall back to the CPU so the demo does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mean, std1 = torch.Tensor(mean).to(device), torch.Tensor(std1).to(device)
# Loop-invariant divisor, computed once instead of on every frame.
feature_scale = std1 * 40

audio_stream = RealTimeAudioStream(z_dim=z_dim)
# NOTE: example_data is bound by the dataset cells further down the notebook;
# run one of them before this cell.
vae = VAE(sample_x=example_data, hidden_dims=None, z_dim=z_dim)
# NOTE(review): absolute, machine-specific checkpoint path -- adjust locally.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
vae.load_state_dict(torch.load(r"C:\Users\dan\Desktop\sound_vae\models\8_orbita_only.pth", map_location=device))
vae.to(device)
vae.eval()

steps = 0  # frames left in the current interpolation; 0 forces a new target
mat2 = random_rotation_matrix(z_dim)

try:
    with torch.no_grad():
        while not audio_stream.done:

            # Only the FFT features drive the latent vector in this demo.
            _, _, fft = audio_stream.step(None)

            if steps == 0:
                # Current blend finished: the old target becomes the start
                # point and a fresh random target is drawn.
                steps = random.randint(2, 200)
                mat1 = mat2.clone()
                mat2 = random_rotation_matrix(z_dim)

                # rots[0] is the new target and rots[steps] the old matrix;
                # the countdown below walks from old towards new.
                rots = interpolate_transforms(mat2, mat1, steps)

            steps -= 1
            r_m = rots[steps].to(device)

            z = fft[0, :, 0].float().to(device)
            z = (z - mean) / feature_scale + 0.5
            z = z @ r_m
            sample = vae.decoder(z)

            image = sample[0].permute(1, 2, 0).detach().cpu().numpy()
            image = cv2.resize(image, (1024, 512))

            cv2.imshow("generation", image)

            k = cv2.waitKey(33)
            if k == 27:    # Esc key to stop
                break
finally:
    # Always release the OpenCV window, even if the loop raised.
    cv2.destroyAllWindows()


#audio_stream.stream()

RealTimeAudioStream initialized with 44032 sample rate


In [None]:
# MNIST dataset (alternative data source).
# This cell binds example_data / example_targets / examples / batch_idx, the
# same names the custom-dataset cell below binds -- whichever cell ran last
# determines the sample used to build the VAE.

batch_size_train = 256
batch_size_test = 256

# Process-wide switch: disables cuDNN for everything executed afterwards in
# this kernel, not just for this cell.
torch.backends.cudnn.enabled = False

# MNIST train/test splits stored under ./mnist_data/. Only the training split
# requests a download; the test split is created with download=False.
train_dataset = datasets.MNIST(root='./mnist_data/', train=True, transform=T.ToTensor(), download=True)
test_dataset = datasets.MNIST(root='./mnist_data/', train=False, transform=T.ToTensor(), download=False)


# Data loaders (input pipeline): training batches are shuffled, test batches
# are not.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size_train, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size_test, shuffle=False)

# Pull the first test batch as a reference sample and display its shape.
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)
example_data.shape

In [11]:
# Custom image dataset: batch size, target resolution and normalisation
# constants.

batch_size = 1024

# Images are resized to HEIGHT x WIDTH (rows x columns).
WIDTH = 128
HEIGHT = 64

# NOTE(review): backslash-separated relative path -- only resolves as a
# directory hierarchy on Windows.
dataset_dir = r"data_synthetic\10_syntensor_only"

# Un-normalised view of the dataset; its only consumer visible in this
# notebook is the commented-out calculate_mean_std call below.
raw_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
])

raw_data = datasets.ImageFolder(root = dataset_dir, transform = raw_transforms)

# Hard-coded per-channel statistics; presumably the output of a previous
# calculate_mean_std run -- re-enable the next line to recompute them.
# DATA_MEAN, DATA_STD = calculate_mean_std(raw_data)
DATA_MEAN = [0.9386, 0.9386, 0.9386]
DATA_STD  = [0.1825, 0.1825, 0.1825]

In [12]:
# Training-time preprocessing. Normalize is applied to the 3-channel tensor
# first; Grayscale then collapses it to a single channel (see the printed
# batch shape below).
data_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
    T.Normalize(mean=DATA_MEAN,std=DATA_STD),
    T.Grayscale(),
])

dataset = datasets.ImageFolder(root = dataset_dir, transform = data_transforms)

# Split into Train, Val and Test. The result is indexed by split name
# ('train' here, 'test' in the training cell).
# NOTE(review): val/test are presumably fractions of the dataset -- confirm in
# utils.data_utils.split_train_val_test.
data = split_train_val_test(dataset, val=0.0, test=0.1, batch_size=batch_size)

# Grab the first training batch as a reference sample (also used as sample_x
# when the VAE is constructed) and print its shape and value range.
examples = enumerate(data['train'])
batch_idx, (example_data, example_targets) = next(examples)
print(example_data.shape)
print(example_data.min())
print(example_data.max())

torch.Size([1024, 1, 64, 128])
tensor(-5.1425)
tensor(0.3364)


In [None]:
def eval_on_test(model, test_loader):
    """Encode every batch of ``test_loader`` and collect latents and labels.

    The model is switched to eval mode for the pass and then returned to
    whatever mode it was in on entry (also if an error occurs). Inputs are
    moved to the device the model's parameters live on, so the function works
    on both CPU and GPU.

    Args:
        model: Object exposing ``encoder(x) -> (mu, log_var)`` and
            ``sampling(mu, log_var) -> z``, plus the usual ``nn.Module``
            ``eval``/``train``/``parameters`` API.
        test_loader: Iterable yielding ``(x, y)`` batches.

    Returns:
        ``(latents, labels)``: two NumPy arrays, the per-batch results
        concatenated along axis 0.
    """
    # Follow the model's own device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()

    latents = []
    labels = []
    try:
        with torch.no_grad():
            for x, y in test_loader:
                if device is not None:
                    x = x.to(device)
                mu, log_var = model.encoder(x)
                z = model.sampling(mu, log_var)

                latents.append(z.cpu().numpy())
                # Convert labels explicitly so concatenation never sees
                # tensors that live on an accelerator.
                labels.append(y.cpu().numpy() if torch.is_tensor(y) else np.asarray(y))
    finally:
        # Restore the caller's mode rather than forcing train mode.
        model.train(was_training)

    latents = np.concatenate(latents, 0)
    labels = np.concatenate(labels, 0)

    return latents, labels

In [None]:
def visualize_latent_space(model, loss_items, experiment_name, test_loader, z_dims=None):
    """Save a scatter plot of the first two latent dimensions of the test set.

    The plot is written to ``latent_space_vis/<experiment_name>/<timestamp>.jpg``
    (the directory is created if needed). Points are coloured by label and the
    title shows the current reconstruction and KLD losses.

    Args:
        model: VAE passed through to ``eval_on_test``.
        loss_items: Pair ``(recon_loss, kld_loss)`` of scalar tensors; only
            used for the plot title.
        experiment_name: Sub-directory name under ``latent_space_vis``.
        test_loader: Loader passed through to ``eval_on_test``.
        z_dims: Unused. Optional so that callers which omit it keep working.
    """
    pic_name = datetime.now().strftime("%Y%m%d%H%M%S%f")

    extent = 5  # half-width of the plotted square, in latent units

    cmap = plt.cm.tab20
    bounds = np.linspace(0, 10, 11)

    latents, labels = eval_on_test(model, test_loader)

    fig, ax = plt.subplots()
    try:
        ax.set_xlim(-extent, extent)
        ax.set_ylim(-extent, extent)
        scat = ax.scatter(latents[:, 0], latents[:, 1], s=2, marker='o', cmap=cmap, c=labels)
        fig.colorbar(scat, ax=ax, spacing='proportional', ticks=bounds)

        title = f"Recon: {loss_items[0].item():2.3f}, KLD {loss_items[1].item():2.3f}"
        ax.set_title(title)

        # os.path.join keeps the same layout on Windows and also yields real
        # sub-directories on other platforms.
        out_dir = os.path.join("latent_space_vis", experiment_name)
        os.makedirs(out_dir, exist_ok=True)

        fig.savefig(os.path.join(out_dir, f"{pic_name}.jpg"))
    finally:
        # Close this specific figure so repeated calls do not accumulate
        # open figures, even when plotting or saving fails.
        plt.close(fig)

In [None]:
# return reconstruction error + KL divergence losses
def vae_loss(recon_x, x, mu, log_var, beta=0.01):
    """Compute the two VAE loss terms.

    Args:
        recon_x: Reconstructed batch; first dimension is the batch size.
        x: Target batch with the same number of elements per sample.
        mu: Latent means.
        log_var: Latent log-variances.
        beta: Weight applied to the KL term. Legend says that the bigger beta
            is, the higher the disentanglement.

    Returns:
        ``(recons_loss, KLD)``: the mean squared reconstruction error and the
        beta-weighted KL divergence, both scalar tensors averaged over all
        elements.
    """
    batch = recon_x.shape[0]
    # reshape (not view) so non-contiguous reconstructions are accepted too.
    recons_loss = F.mse_loss(recon_x.reshape(batch, -1), x.reshape(batch, -1), reduction="mean")
    # KL(q(z|x) || N(0, I)) per element: -0.5 * (1 + log(sigma**2) - mu**2 - sigma**2)
    KLD = beta * -0.5 * torch.mean(1 + log_var - mu.pow(2) - log_var.exp())
    return recons_loss, KLD

In [None]:
def norm_image(image):
    """Min-max rescale ``image`` to the 0..255 range, keeping its dtype.

    Args:
        image: NumPy array of any shape.

    Returns:
        A new array of the same shape and dtype whose minimum maps to 0 and
        maximum to 255. A constant image maps to all zeros (instead of
        dividing by zero), and an empty array is returned as an empty copy.
    """
    dtype = image.dtype
    if image.size == 0:
        return image.copy()

    scaled = image.astype(float)
    scaled = scaled - np.min(scaled)

    peak = np.max(scaled)
    # Guard against a flat image: dividing by a zero range would produce NaNs
    # that turn into garbage once cast back to an integer dtype.
    if peak > 0:
        scaled = scaled / peak * 255

    return scaled.astype(dtype)

def save_recon(x_recon, experiment_name):
    """Write the first reconstruction of a batch to disk as a JPEG.

    The image is stored as
    ``latent_space_vis/<experiment_name>/recons/<timestamp>.jpg`` after being
    rescaled with ``norm_image``; the directory is created if needed.

    Args:
        x_recon: Batch of reconstructions; element 0 is permuted from
            channel-first to channel-last before saving.
        experiment_name: Sub-directory name under ``latent_space_vis``.
    """
    image = x_recon[0].permute(1, 2, 0).detach().cpu().numpy()
    pic_name = datetime.now().strftime("%Y%m%d%H%M%S%f")

    # os.path.join keeps the same layout on Windows and also yields real
    # sub-directories on other platforms; exist_ok avoids the check-then-create
    # race.
    out_dir = os.path.join("latent_space_vis", experiment_name, "recons")
    os.makedirs(out_dir, exist_ok=True)

    cv2.imwrite(os.path.join(out_dir, f"{pic_name}.jpg"), norm_image(image))

In [None]:
def train(model, loss_f, train_loader, test_loader, optimizer, scheduler, epoch, experiment_name, embedding_size):
    """Run one training epoch and save a checkpoint.

    For every batch: forward pass, loss, backward pass, optimizer step and
    scheduler step (the scheduler advances once per batch, not per epoch).
    Every 25th batch a reconstruction is saved, the losses are printed and,
    for latent spaces of at most two dimensions, the latent space is plotted.
    After the epoch the model's state dict is written to
    ``models/<experiment_name>/vae_<epoch>.pth``.

    Args:
        model: VAE whose forward pass returns ``(x_recon, mu, log_var)``.
        loss_f: Callable ``(x_recon, x, mu, log_var) -> (recon, kld)``.
        train_loader: Iterable of ``(x, label)`` training batches.
        test_loader: Loader used for the latent-space plot.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped after every batch.
        epoch: Epoch number, used in the log line and checkpoint name.
        experiment_name: Sub-directory name for plots and checkpoints.
        embedding_size: Latent dimensionality; plots are only drawn when it
            is 2 or less.

    Returns:
        The total loss tensor of the last batch, or ``None`` if
        ``train_loader`` yielded no batches.
    """
    # A 2-D scatter plot only makes sense for latent spaces of up to 2 dims.
    vis = embedding_size <= 2

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    loss = None  # stays None when the loader is empty
    for batch_idx, (x, _) in enumerate(train_loader):
        x = x.to(device)
        # Re-assert train mode each batch; the periodic evaluation below may
        # have toggled it.
        model.train()
        optimizer.zero_grad()
        x_recon, mu, log_var = model(x)
        # Three times the mean latent standard deviation, logged for monitoring.
        std3 = log_var.exp().sqrt().mean()*3
        # Keep only the first output channel, preserving the channel axis.
        x_recon = x_recon[:, 0, None, :, :]

        rec, KLD = loss_f(x_recon, x, mu, log_var)
        loss = rec + KLD

        loss.backward()
        optimizer.step()
        scheduler.step()

        if batch_idx % 25 == 0:
            if vis:
                # embedding_size fills the function's fifth (z_dims) parameter.
                visualize_latent_space(model, (rec, KLD), experiment_name, test_loader, embedding_size)

            save_recon(x_recon, experiment_name)
            print("Epoch {:3} Iteration {:3}: recon: {:8.4f}, kld: {:8.4f}, std3: {:2.4f}".format(epoch, batch_idx, rec.item(), KLD.item(), std3.item()))

    # os.path.join keeps the same layout on Windows and also yields real
    # sub-directories on other platforms.
    model_dir = os.path.join("models", experiment_name)
    os.makedirs(model_dir, exist_ok=True)

    save_model_to = os.path.join(model_dir, f"vae_{epoch}.pth")
    torch.save(model.state_dict(), save_model_to)

    return loss


In [None]:
# Latent dimensionality; train() only plots the latent space when this is <= 2.
embedding_size = 4


# Build the model. example_data comes from whichever dataset cell ran last and
# is passed as a sample input.
# NOTE(review): presumably VAE uses sample_x to infer layer shapes -- confirm
# in utils.vaes.
vae = VAE(sample_x=example_data, hidden_dims=None, z_dim=embedding_size)
optimizer = optim.Adam(vae.parameters(), lr=0.01)
# train() calls scheduler.step() once per batch, so T_max=40 is counted in
# batches, not epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)

In [None]:
# ### sanity check
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# vae.to(device)
# vae.train()

# for batch_idx, (x, _) in enumerate(data['train']):
#     x = x.to(device)
#     out, mu, log_var = vae(x)
#     print(out.shape)
#     break


In [None]:
# Remember: an acceptable reconstruction loss is 0.45.
num_epochs = 200

# Names the output folders used by train() (checkpoints, reconstructions,
# latent-space plots).
experiment_name = f"10_syntensor_only"

# One train() call per epoch; each call also writes a per-epoch checkpoint.
# The returned loss is not used here.
for epoch in range(1, num_epochs + 1):
    train(vae, vae_loss, data['train'], data['test'], optimizer, scheduler, epoch, experiment_name, embedding_size=embedding_size)

In [None]:
# Saves the whole module object, unlike the per-epoch checkpoints written by
# train(), which store only the state_dict.
# NOTE(review): backslash-separated relative path (Windows-style), and the
# file name says "orbita" while the experiment above is "10_syntensor_only" --
# confirm this is the intended target.
torch.save(vae, r"models\vae_orbita.pth")