In [1]:
import cv2
import numpy as np

import torch
import torchaudio

import torch.optim as optim
import torch.nn as nn
from torchvision import utils, datasets
import torchvision.transforms as T
import torch.nn.functional as F

from utils.audio_utils import RealTimeAudioStream
from utils.vaes import *    
from utils.data_utils import calculate_mean_std, split_train_val_test

import matplotlib.pyplot as plt

import os

from datetime import datetime

import math

import random

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# stream your audio output and check what it visualizes

# NOTE: PyAudio only works reliably with microphone input.
# To stream the system's audio output, the `soundcard` package is the better choice,
# but it follows a somewhat different model, which makes switching non-trivial.
# So it is probably worth trying this fork instead: https://github.com/intxcc/pyaudio_portaudio/tree/master
# audio_stream = RealTimeAudioStream()
# audio_stream.stream(rms=True, zcr=True, fft=True)

In [3]:
def random_rotation_matrix(dim):
    """Draw a random proper rotation matrix of shape ``(dim, dim)``.

    The matrix is obtained from the QR decomposition of a Gaussian matrix.
    Column signs are fixed with the diagonal of ``R`` so the orthogonal factor
    does not depend on the sign convention of the QR routine, and one column
    is flipped when needed so that the determinant is ``+1`` (a rotation
    rather than a reflection).

    Args:
        dim: Size of the square matrix.

    Returns:
        A ``(dim, dim)`` float tensor ``Q`` with ``Q @ Q.T == I`` and
        ``det(Q) == 1`` (up to floating point error). It does not require grad.
    """
    gaussian = torch.randn(dim, dim)
    q, r = torch.linalg.qr(gaussian)

    # Make the factorisation unique: multiply column j of Q by sign(R[j, j]).
    signs = torch.sign(torch.diagonal(r))
    signs[signs == 0] = 1
    q = q * signs

    # An orthogonal matrix has determinant +1 or -1; flipping a single column
    # turns a reflection into a rotation while keeping orthogonality.
    if torch.linalg.det(q) < 0:
        q[:, 0] = -q[:, 0]

    return q

def interpolate_transforms(matrix1, matrix2, steps):
    """Return the ``steps + 1`` element-wise linear blends from ``matrix1`` to ``matrix2``.

    Entry ``i`` of the returned list is ``torch.lerp(matrix1, matrix2, i / steps)``,
    so the first entry corresponds to ``matrix1`` and the last one to ``matrix2``.
    Every returned tensor has ``requires_grad`` switched off.
    """
    return [
        torch.lerp(matrix1, matrix2, idx / steps).requires_grad_(False)
        for idx in range(steps + 1)
    ]

In [76]:
# custom dataset
#
# Global configuration for the image dataset, followed by a first pass over the data
# (resize + ToTensor only) to measure the per-channel mean/std that the normalising
# pipeline in the next cell feeds into T.Normalize.

batch_size = 64

# Every image is resized to (HEIGHT, WIDTH) before it reaches the model.
WIDTH = 256
HEIGHT = 128

# ImageFolder root; class labels come from the sub-directory names.
dataset_dir = r"data/prepared"

# calculate dataset mean and std
raw_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
])

raw_data = datasets.ImageFolder(root = dataset_dir, transform = raw_transforms)

# NOTE(review): this re-scans the dataset on every run; the commented values below are
# an earlier measurement and differ from the output currently shown for this cell.
DATA_MEAN, DATA_STD = calculate_mean_std(raw_data)
# DATA_MEAN = [0.9386, 0.9386, 0.9386]
# DATA_STD  = [0.1825, 0.1825, 0.1825]

computing batch       50 / 50
tensor([0.9359, 0.9359, 0.9359]) tensor([0.2315, 0.2315, 0.2315])


In [77]:
# Final preprocessing pipeline: resize, convert to tensor, normalise with the measured
# statistics, then collapse to a single channel (the printed batch shape has 1 channel).
data_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
    T.Normalize(mean=DATA_MEAN,std=DATA_STD),
    T.Grayscale(),
])

dataset = datasets.ImageFolder(root = dataset_dir, transform = data_transforms)

# split into Train, Val and Test
# No validation split is requested (val=0.0); 10% of the data is held out as test.
# `data` is indexed by split name below ('train') and in the training cell ('test').
data = split_train_val_test(dataset, val=0.0, test=0.1, batch_size=batch_size)

# Pull one training batch as a sanity check; `example_data` is also reused later as
# `sample_x` when the VAE is constructed.
examples = enumerate(data['train'])
batch_idx, (example_data, example_targets) = next(examples)
print(example_data.shape)
print(example_data.min())
print(example_data.max())

torch.Size([64, 1, 128, 256])
tensor(-4.0421)
tensor(0.2770)


In [78]:
def eval_on_test(model, test_loader):
    """Encode every batch of ``test_loader`` and collect sampled latents and labels.

    The model is put into eval mode while encoding and its previous
    train/eval mode is restored afterwards. Inputs are moved to the device the
    model's parameters live on, so this works on both CPU and GPU.

    Args:
        model: Module exposing ``encoder(x) -> (mu, log_var)`` and
            ``sampling(mu, log_var) -> z``.
        test_loader: Iterable yielding ``(x, y)`` batches.

    Returns:
        Tuple ``(latents, labels)`` of numpy arrays, concatenated over all
        batches along axis 0.
    """
    was_training = model.training

    # Follow the model's own device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    latents = []
    labels = []

    model.eval()
    try:
        with torch.no_grad():
            for x, y in test_loader:
                mu, log_var = model.encoder(x.to(device))
                z = model.sampling(mu, log_var).cpu().numpy()

                latents.append(z)
                labels.append(torch.as_tensor(y).cpu().numpy())
    finally:
        # Restore whatever mode the caller had, even if encoding failed.
        model.train(was_training)

    latents = np.concatenate(latents, 0)
    labels = np.concatenate(labels, 0)

    return latents, labels

In [79]:
def visualize_latent_space(model, loss_items, experiment_name, test_loader, z_dims=None):
    """Save a scatter plot of the first two latent dimensions of the test set.

    Points are coloured by their label and the current reconstruction / KLD
    losses are written into the title. The figure is stored as
    ``latent_space_vis/<experiment_name>/<timestamp>.jpg``.

    Args:
        model: Model passed on to ``eval_on_test``.
        loss_items: Pair ``(recon_loss, kld_loss)`` of scalar tensors.
        experiment_name: Sub-directory name for the saved images.
        test_loader: Loader passed on to ``eval_on_test``.
        z_dims: Unused; kept (now optional) so existing call sites with or
            without this argument keep working.
    """
    pic_name = datetime.now().strftime("%Y%m%d%H%M%S%f")

    extent = 5  # half-width of the plotted square; None disables the fixed limits

    cmap = plt.cm.tab20
    bounds = np.linspace(0, 10, 11)
    fig, ax = plt.subplots()

    try:
        latents, labels = eval_on_test(model, test_loader)
        if extent is not None:
            ax.set_xlim(-extent, extent)
            ax.set_ylim(-extent, extent)
        scat = ax.scatter(latents[:, 0], latents[:, 1], s=2, marker='o', cmap=cmap, c=labels)
        plt.colorbar(scat, spacing='proportional', ticks=bounds)

        title = f"Recon: {loss_items[0].item():2.3f}, KLD {loss_items[1].item():2.3f}"
        ax.set_title(title)

        # Build the path portably; a literal backslash is only a separator on Windows.
        out_dir = os.path.join("latent_space_vis", experiment_name)
        os.makedirs(out_dir, exist_ok=True)

        fig.savefig(os.path.join(out_dir, f"{pic_name}.jpg"))
    finally:
        # Close this specific figure even on failure so figures do not pile up
        # when the function is called repeatedly during training.
        plt.close(fig)

In [81]:
# return reconstruction error + KL divergence losses
def vae_loss(recon_x, x, mu, log_var, beta=0.1):
    """Compute the two VAE loss terms.

    Args:
        recon_x: Reconstructed batch; the first dimension is the batch size.
        x: Target batch with the same number of elements per sample as ``recon_x``.
        mu: Latent means.
        log_var: Latent log-variances.
        beta: Weight applied to the KL term. The default ``0.1`` is the value
            that used to be hard-coded; a larger beta is said to encourage
            more disentangled latents.

    Returns:
        Tuple ``(recons_loss, KLD)`` of scalar tensors: the mean squared error
        over all elements, and the beta-weighted mean KL term.
    """
    batch = recon_x.shape[0]
    # reshape (rather than view) also accepts non-contiguous slices.
    recons_loss = F.mse_loss(recon_x.reshape(batch, -1), x.reshape(batch, -1), reduction="mean")
    # 1 + log(sigma**2) - mu**2 - sigma**2
    KLD = beta * -0.5 * torch.mean(1 + log_var - mu.pow(2) - log_var.exp())
    return recons_loss, KLD

In [82]:
def norm_image(image):
    """Min-max rescale a numpy image to the range ``[0, 255]``, keeping its dtype.

    The image is shifted so its minimum becomes 0 and scaled so its maximum
    becomes 255. The arithmetic is done in float and the result is cast back
    to the input dtype.

    A constant image (no dynamic range) is returned as all zeros and an empty
    image is returned unchanged, instead of dividing by zero / raising.

    Args:
        image: numpy array of any shape.

    Returns:
        Array of the same shape and dtype as ``image``.
    """
    dtype = image.dtype
    scaled = image.astype(float)
    if scaled.size == 0:
        return scaled.astype(dtype)

    scaled = scaled - np.min(scaled)
    peak = np.max(scaled)
    # peak == 0 means every pixel was identical; skip the division in that case.
    if peak > 0:
        scaled = scaled / peak * 255
    return scaled.astype(dtype)

def save_recon(x_recon, experiment_name):
    """Write the first reconstruction of a batch to disk as a timestamped JPEG.

    The image ends up in ``latent_space_vis/<experiment_name>/recons`` after
    being rescaled with ``norm_image``.
    """
    out_dir = f"latent_space_vis/{experiment_name}/recons"

    # Move the leading axis of the first sample to the end before handing it to OpenCV
    # (assumes samples are laid out channel-first — TODO confirm).
    first = x_recon[0].permute(1, 2, 0).detach().cpu().numpy()
    stamp = datetime.now().strftime("%Y%m%d%H%M%S%f")

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    target = os.path.join(out_dir, f"{stamp}.jpg")
    cv2.imwrite(target, norm_image(first))

In [83]:
def train(model, loss_f, train_loader, test_loader, optimizer, scheduler, epoch, experiment_name, embedding_size):
    """Run one training epoch of the VAE.

    Every 25 batches the first reconstruction of the batch is saved and a
    progress line is printed. When ``embedding_size`` is at most 2 the latent
    space of the test set is plotted as well.

    Args:
        model: VAE whose forward pass returns ``(x_recon, mu, log_var)``.
        loss_f: Callable ``(x_recon, x, mu, log_var) -> (recon_loss, kld_loss)``.
        train_loader: Iterable of ``(x, label)`` training batches.
        test_loader: Loader used only for the latent-space plot.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler, also stepped once per batch.
        epoch: Epoch number, used for logging only.
        experiment_name: Name of the output sub-directories.
        embedding_size: Latent dimensionality.

    Returns:
        The total loss tensor of the last batch, or ``None`` when
        ``train_loader`` yields no batches.
    """
    # NOTE: this changes numpy's global print options (used for the std2 log below).
    np.set_printoptions(precision=2)
    # The scatter plot only shows two latent axes, so it is skipped for larger latents.
    vis = not (embedding_size > 2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    model.to(device)

    loss = None  # stays None if the loader is empty
    for batch_idx, (x, _) in enumerate(train_loader):
        x = x.to(device)
        optimizer.zero_grad()
        x_recon, mu, log_var = model(x)
        # Twice the batch-averaged posterior std per latent dimension (logging only).
        std2 = log_var.exp().sqrt().mean(0)*2
        # Keep only the first output channel, preserving the channel axis.
        x_recon = x_recon[:, 0, None, :, :]

        rec, KLD = loss_f(x_recon, x, mu, log_var)
        loss = rec + KLD

        loss.backward()
        optimizer.step()
        # The scheduler advances per batch, not per epoch.
        scheduler.step()
        # print(scheduler.get_last_lr())

        if batch_idx % 25 == 0:
            if vis:
                # z_dims was previously omitted here, which raised a TypeError.
                visualize_latent_space(model, (rec, KLD), experiment_name, test_loader, embedding_size)

            save_recon(x_recon, experiment_name)
            print("Epoch {:3} Iteration {:3}: recon: {:8.4f}, kld: {:8.4f}, std2: {}".format(epoch, batch_idx, rec.item(), KLD.item(), std2.detach().cpu().numpy()))

    # f-string prefix was missing, which created a literal "{experiment_name}" directory.
    path = f"models/{experiment_name}"
    os.makedirs(path, exist_ok=True)

    # save_model_to = f"{path}/vae_{epoch}.pth"
    # torch.save(model.state_dict(), save_model_to)

    return loss


In [84]:
# Latent dimensionality; passed to the VAE as z_dim and reused by the training and
# real-time cells below.
embedding_size = 8


# build model
# `example_data` (one training batch from the data cell) is handed over as `sample_x`.
vae = VAE(sample_x=example_data, hidden_dims=None, z_dim=embedding_size)
optimizer = optim.Adam(vae.parameters(), lr=0.001)
# NOTE(review): train() calls scheduler.step() once per batch, so T_max=40 is counted in
# batches, not epochs — confirm this is the intended schedule.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)

In [85]:
# remember: acceptable recon loss is 0.45
num_epochs = 20

# Names the output folders used by train(), save_recon() and visualize_latent_space().
experiment_name = f"10_syntensor_only"

# One call to train() is one pass over data['train']; its return value (the last batch
# loss) is not used here.
for epoch in range(1, num_epochs + 1):
    train(vae, vae_loss, data['train'], data['test'], optimizer, scheduler, epoch, experiment_name, embedding_size=embedding_size)

Epoch   1 Iteration   0: recon:   2.0797, kld:   0.0121, std2: [1.9  2.51 2.   2.04 2.23 2.03 2.38 1.89]
Epoch   1 Iteration  25: recon:   0.8169, kld:   0.0758, std2: [0.75 0.89 1.14 1.23 0.96 1.14 1.02 1.57]
Epoch   1 Iteration  50: recon:   0.8177, kld:   0.0734, std2: [0.89 0.84 1.12 1.14 1.15 1.02 1.06 1.41]
Epoch   1 Iteration  75: recon:   0.7282, kld:   0.0798, std2: [0.9  0.86 1.23 1.05 0.88 0.97 0.82 1.18]
Epoch   1 Iteration 100: recon:   0.6081, kld:   0.0904, std2: [0.72 0.81 0.85 1.02 1.03 0.77 0.86 1.59]
Epoch   1 Iteration 125: recon:   0.5653, kld:   0.0851, std2: [0.76 0.78 1.06 0.82 1.04 0.75 0.71 1.25]
Epoch   2 Iteration   0: recon:   0.5370, kld:   0.0951, std2: [0.67 0.77 0.86 0.78 1.08 0.62 0.61 1.3 ]
Epoch   2 Iteration  25: recon:   0.5470, kld:   0.1075, std2: [0.66 0.85 0.95 0.79 1.07 0.8  0.73 1.02]
Epoch   2 Iteration  50: recon:   0.4571, kld:   0.1025, std2: [0.69 0.7  0.91 0.69 0.96 0.6  0.62 1.  ]
Epoch   2 Iteration  75: recon:   0.4100, kld:   0.1154

In [None]:
# torch.save(vae, "models/vae_promaton.pth")

In [90]:
# Real-time demo: audio features from RealTimeAudioStream are rescaled and fed to the
# VAE decoder, and the decoded image is shown in an OpenCV window until Esc is pressed.
# NOTE(review): `time` and `mp` are imported here but not used anywhere in this cell.
from time import time
from IPython.display import clear_output
import multiprocessing as mp

# This is needed to normalize the sound features; if you change the z-dim size, you should
# recalculate these values. From this example you can see that for techno music we've got a
# huge mean for bass (1st position) and a low mean value for high frequencies.
# mean =  [510.15845655661803, 135.35665321114695, 102.851305103906, 63.160291749979983]
# std1 =  [700.373306840422, 135.12631741685448, 102.5477751543778, 63.582527648859525]

# One entry per latent dimension (8 values, matching embedding_size = 8).
# These `std` numbers equal the output of the np.sqrt scratch cell at the end of the notebook.
std = [0.00056372, 0.00181005, 0.00298502, 0.00497356, 0.00476172,
       0.00467349, 0.00724976, 0.02696994]
mean = [-1314.773    ,  -256.68332  ,  -139.03033  ,   -91.24173  ,
        -111.323074 ,  -109.316895 ,   -63.288567 ,   -14.2479315]
# NOTE(review): CUDA is hard-coded throughout this cell; it will not run on a CPU-only machine.
mean, std = torch.Tensor(mean).to("cuda"), torch.Tensor(std).to("cuda")

query = "default"
vae.eval()
vae.cuda()

steps = 0 # remaining interpolation steps; 0 triggers the choice of a new target matrix
mat2 = torch.eye(embedding_size)
values = []  # every z fed to the decoder so far; grows without bound while the loop runs

stream = RealTimeAudioStream(query=query, z_dim=embedding_size)
stream.start_audio_process()
response = None
with torch.no_grad():
    while True:
        response = stream.step_process()
        rms, zcr, fft = response
        
        # Here we are performing a change of basis.
        # Basis 1 is the last one we arrived at;
        # then we pick a random basis 2 and slowly
        # traverse towards it.
        if steps==0:
            steps = random.randint(2, 200)
            mat1 = mat2.clone()
            mat2 = random_rotation_matrix(embedding_size)

            # rots[0] corresponds to mat2 and rots[steps] to mat1; the index below counts
            # down, so the sequence ends on mat2.
            rots = interpolate_transforms(mat2, mat1, steps)
        
        steps -= 1
        # NOTE: r_m is currently unused because the `z = z @ r_m` line below is commented out.
        r_m = rots[steps].to("cuda")

        # rms = ((rms - 0.15) * 12) # 0.3
        # zcr = ((zcr - 0.07) * 24)# 0.14
        # print(angle_r, rms, zcr, end="\r")
        # z = (torch.tensor([rms, zcr]) @ r_m).unsqueeze(0).to("cuda") 
        ######
        # presumably one value per latent dimension — TODO confirm the layout of `fft`
        z = fft[0, :, 0].float()
        # Shift by the hard-coded offsets, then scale by the hard-coded factors (in place).
        z += (mean)
        z *= (std)
        # z = z @ r_m
        ######
        # print(fft)
        values.append(z.cpu().numpy())
        # Running statistics over everything collected so far; recomputed from scratch each frame.
        print("mean:", np.mean(values, 0), "\nstd2:", np.std(values, 0) * 2, "\n", len(values))

        sample = vae.decoder(z.cuda())
        # NOTE(review): looks like a remap from [-1, 1] to [0, 1] for display — confirm the decoder's output range.
        sample = (sample + 1) / 2 
        image = sample[0].permute(1, 2, 0).detach().cpu().numpy()
        image = cv2.resize(image, (1024, 512))


        clear_output(wait=True)

        cv2.imshow("generation", image)

        # Wait up to 33 ms for a key press between frames.
        k = cv2.waitKey(33)
        if k==27:    # Esc key to stop
            cv2.destroyAllWindows()
            break

cv2.destroyAllWindows()


#audio_stream.stream()

mean: [0.63 0.5  0.46 0.24 0.23 0.34 0.47 0.6 ] 
std2: [2.32 2.43 2.03 1.28 1.22 1.81 2.29 3.05] 
 526


In [38]:
# Scratch: negated per-dimension mean (axis 0) of the `values` recorded by the real-time loop.
# NOTE(review): execution counts in these scratch cells are out of order, so `values` here
# may come from a different run than the one shown above.
-np.mean(values, 0)

array([-0.51353484, -0.34026122, -0.2533055 , -0.3103051 , -0.43936414,
       -0.47857332, -0.42765737, -0.28083706], dtype=float32)

In [26]:
# Scratch: reciprocal of the per-dimension variance (axis 0) of the recorded `values`.
1 / np.var(values, 0)

array([3.1778342e-07, 3.2762962e-06, 8.9103723e-06, 2.4736308e-05,
       2.2673956e-05, 2.1841470e-05, 5.2558982e-05, 7.2737777e-04],
      dtype=float32)

In [22]:
# Scratch: element-wise square of a pasted 8-element vector (its origin is not shown in this notebook).
np.square([2.1409633e-07, 5.9963389e-07, 1.2509867e-06, 2.2082168e-06,
       2.3094976e-06, 2.4036513e-06, 3.9445022e-06, 1.3495573e-05])

array([4.58372385e-14, 3.59560802e-13, 1.56496772e-12, 4.87622144e-12,
       5.33377916e-12, 5.77753957e-12, 1.55590976e-11, 1.82130491e-10])

In [31]:
# Scratch: square root of the values printed by the `1 / np.var(values, 0)` cell, i.e. 1 / std.
# The result matches the hard-coded `std` list used in the real-time loop cell.
np.sqrt([3.1778342e-07, 3.2762962e-06, 8.9103723e-06, 2.4736308e-05,
       2.2673956e-05, 2.1841470e-05, 5.2558982e-05, 7.2737777e-04])

array([0.00056372, 0.00181005, 0.00298502, 0.00497356, 0.00476172,
       0.00467349, 0.00724976, 0.02696994])