In [150]:
import cv2
import numpy as np

import torch
import torchaudio

import torch.optim as optim
import torch.nn as nn
from torchvision import utils, datasets
import torchvision.transforms as T
import torch.nn.functional as F

from utils.audio_utils import RealTimeAudioStream
from utils.vaes import *    
from utils.data_utils import calculate_mean_std, split_train_val_test

import matplotlib.pyplot as plt

import os

from datetime import datetime

import math

import random

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [151]:
# this ensures that the current MacOS version is at least 12.3+
print(torch.backends.mps.is_available())
# this ensures that the current current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

True
True


In [152]:
# stream your audio output and check what it visualizes

# короче блять работает хуйня єта ебаная (PyAudio) только с микрофонами по-нормальному
# а чтоб стримить аутпут системьі, то конечно лучше юзать пєкєдж soundcard
# но там немного другой принцип работьі и єто охуеть можно
# так что по ходу надо попробовать єтое: https://github.com/intxcc/pyaudio_portaudio/tree/master
# audio_stream = RealTimeAudioStream()
# audio_stream.stream(rms=True, zcr=True, fft=True)

In [153]:
def random_rotation_matrix(dim):
    # Generate a random orthogonal matrix
    x = torch.randn(dim, dim).requires_grad_(False)
    q, _ = torch.linalg.qr(x)
    
    # Apply the SVD decomposition to obtain the rotation matrix
    u, _, v = torch.linalg.svd(q)
    rotation_matrix = u @ v.mT
    
    return rotation_matrix

def interpolate_transforms(matrix1, matrix2, steps):
    # Interpolate between two rotation matrices
    transforms = []
    for t in range(steps + 1):
        weight = t / steps
        interpolated_matrix = torch.lerp(matrix1, matrix2, weight).requires_grad_(False)
        transforms.append(interpolated_matrix)
    return transforms

In [154]:
# custom dataset

batch_size = 32

WIDTH = 256
HEIGHT = 128

dataset_dir = r"data/prepared"

# calculate dataset mean and std
raw_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
])

raw_data = datasets.ImageFolder(root = dataset_dir, transform = raw_transforms)

# DATA_MEAN, DATA_STD = calculate_mean_std(raw_data)
DATA_MEAN = [0.9358, 0.9358, 0.9358]
DATA_STD  = [0.2308, 0.2308, 0.2308]

In [155]:
data_transforms = T.Compose([
    T.Resize((HEIGHT, WIDTH)), 
    T.ToTensor(),
    T.Grayscale(),
])

dataset = datasets.ImageFolder(root = dataset_dir, transform = data_transforms)

# split into Train, Val and Test
data = split_train_val_test(dataset, val=0.0, test=0.1, batch_size=batch_size)

examples = enumerate(data['train'])
batch_idx, (example_data, example_targets) = next(examples)
print(example_data.shape)
print(example_data.min())
print(example_data.max())



torch.Size([32, 1, 128, 256])
tensor(0.)
tensor(0.9999)


In [156]:
def eval_on_test(model, test_loader):
    model.eval()

    with torch.no_grad():
        latents = []
        labels = []
        for x, y in test_loader:
            mu, log_var = model.encoder(x.mps())
            z = model.sampling(mu, log_var).cpu().numpy()

            latents.append(z)
            labels.append(y)

    latents = np.concatenate(latents, 0)
    labels = np.concatenate(labels, 0)
    model.train()

    return latents, labels

In [157]:
def visualize_latent_space(model, loss_items, experiment_name, test_loader, z_dims):

    now = datetime.now()
    pic_name = now.strftime("%Y%m%d%H%M%S%f")

    extent = 5

    cmap = plt.cm.tab20
    bounds = np.linspace(0,10,11)
    fig, ax = plt.subplots()

    latents, labels = eval_on_test(model, test_loader)
    if extent is not None: 
        ax.set_xlim(-extent, extent)
        ax.set_ylim(-extent, extent)
    scat = ax.scatter(latents[:, 0], latents[:,1], s=2, marker='o', cmap=cmap, c=labels)
    cb = plt.colorbar(scat, spacing='proportional',ticks=bounds)

    title = f"Recon: {loss_items[0].item():2.3f}, KLD {loss_items[1].item():2.3f}"
    ax.set_title(title)

    path1 = rf'latent_space_vis\{experiment_name}'

    if not os.path.exists(path1):
        os.makedirs(path1)

    fig.savefig(path1 + rf'\{pic_name}.jpg')
    plt.close()

In [158]:
def norm_image(image):
    image = (image + 1) / 2
    image = image * 255
    return image

def save_recon(x_recon, experiment_name):
    image = x_recon[0].permute(1, 2, 0).detach().cpu().numpy()
    now = datetime.now()
    pic_name = now.strftime("%Y%m%d%H%M%S%f")
    path =  f"latent_space_vis/{experiment_name}/recons"
    if not os.path.exists(path):
        os.makedirs(path)

    cv2.imwrite(os.path.join(path, f"{pic_name}.jpg"), norm_image(image))

In [190]:
# return reconstruction error + KL divergence losses
def vae_loss(recon_x, x, mu, log_var):
    B, C, H, W = recon_x.shape
    beta = 0.04 #legend says, that the bigger beta is, the higher the disentanglement
    recons_loss = F.mse_loss(recon_x.view(B, -1), x.view(B, -1), reduction="mean")
    KLD = beta * -0.5 * torch.mean(1 + log_var - mu.pow(2) - log_var.exp()) # 1 + log(sigma**2) - mu**2 - sigma**2
    return recons_loss, KLD

In [192]:
def train_step(model, loss_f, train_loader, optimizer, scheduler, epoch, experiment_name, device):
    mean_rec = []
    mean_kld = []
    aggregated_z = []
    for batch_idx, (x, _) in enumerate(train_loader):
        x = x.to(device)
        x = (x * 2) - 1
        optimizer.zero_grad()
        x_recon, mu, log_var = model(x)

        rec, KLD = loss_f(x_recon, x, mu, log_var)
        loss = rec + KLD
        
        mean_rec.append(rec)
        mean_kld.append(KLD)

        z = model.sampling(mu, log_var)
        aggregated_z.append(z.detach().cpu().numpy())
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        # print(scheduler.get_last_lr())

        if batch_idx % 100 == 0:                
            save_recon(x_recon, experiment_name)
            mean_rec = torch.as_tensor(mean_rec).mean().item()
            mean_kld = torch.as_tensor(mean_kld).mean().item()
            z_std = np.std(np.array(aggregated_z).reshape(-1, 8), 0)
            print("Epoch {:3} Iteration {:3}: recon: {:8.4f}, kld: {:8.4f}".format(
                epoch, batch_idx, mean_rec, mean_kld)
            )
            print(f"Z std: {z_std}")
            print(f"Z mean: {np.mean(np.array(aggregated_z).reshape(-1, 8), 0)}")
            mean_rec = []
            mean_kld = []
            aggregated_z = []
            # print(f"mean: {mean.detach().cpu().numpy()}")

    # save_model_to = f"{path}/vae_{epoch}.pth"
    # torch.save(model.state_dict(), save_model_to)

In [197]:
embedding_size = 8


# build model
# vae = VAE(shape=(HEIGHT, WIDTH), z_dim=embedding_size)
# vae = torch.load("models/vae_promaton.pth")
optimizer = optim.Adam(vae.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

In [198]:
np.set_printoptions(precision=2)
# # remember: acceptable recon loss is 0.45
num_epochs = 30

experiment_name = f"10_syntensor_only"
vis = False

device = torch.device("mps")

vae.train()
vae.to(device)

for epoch in range(1, num_epochs + 1):
    train_step(vae, vae_loss, data['train'], optimizer, scheduler, epoch, experiment_name, device=device)



Epoch   1 Iteration   0: recon:   0.0300, kld:   0.0508
Z std: [0.81 0.71 0.96 0.93 0.82 1.16 1.08 0.87]
Z mean: [ 0.07 -0.35 -0.07  0.26  0.29 -0.22 -0.32 -0.21]
Epoch   1 Iteration 100: recon:   0.0402, kld:   0.0496
Z std: [1.   1.07 0.99 1.03 1.05 1.04 1.07 1.03]
Z mean: [ 0.01 -0.03 -0.04 -0.03 -0.07  0.03  0.09  0.01]
Epoch   1 Iteration 200: recon:   0.0369, kld:   0.0500
Z std: [1.01 1.04 0.94 1.04 1.   1.04 1.04 1.01]
Z mean: [ 0.   -0.04  0.   -0.02 -0.07 -0.02  0.03 -0.05]




Epoch   2 Iteration   0: recon:   0.0384, kld:   0.0496
Z std: [0.74 0.94 0.84 0.96 1.04 1.19 1.01 1.06]
Z mean: [ 0.25  0.29  0.18 -0.18 -0.25 -0.09 -0.11  0.24]
Epoch   2 Iteration 100: recon:   0.0383, kld:   0.0490
Z std: [0.99 1.05 0.95 1.03 1.02 1.03 1.05 1.01]
Z mean: [ 0.01 -0.04 -0.02  0.01 -0.06 -0.01  0.04 -0.02]
Epoch   2 Iteration 200: recon:   0.0375, kld:   0.0490
Z std: [0.99 1.06 0.95 1.03 1.01 1.05 1.05 0.98]
Z mean: [ 0.03 -0.   -0.01 -0.02 -0.01  0.    0.03 -0.01]




Epoch   3 Iteration   0: recon:   0.0393, kld:   0.0489
Z std: [0.92 1.03 1.02 1.12 1.03 0.99 1.04 0.89]
Z mean: [ 0.02 -0.16 -0.33 -0.1  -0.1  -0.02  0.24 -0.04]


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x136ec28b0>
Traceback (most recent call last):
  File "/Users/dmitry/Documents/sound_vae/.venv/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/Users/dmitry/Documents/sound_vae/.venv/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/connection

KeyboardInterrupt: 

In [None]:
# torch.save(vae, "models/vae_promaton.pth")

In [199]:
from time import time
from IPython.display import clear_output
import multiprocessing as mp

# this is needed to normalize sound; if you will change z-dim size, you should recalcualte
# these things. From this example you can see, that for techno music we've got a huge
# mean for bass (1 position) and low mean value for high freqs
# mean =  [510.15845655661803, 135.35665321114695, 102.851305103906, 63.160291749979983]
# std1 =  [700.373306840422, 135.12631741685448, 102.5477751543778, 63.582527648859525]

std = [0.00056372, 0.00181005, 0.00298502, 0.00497356, 0.00476172,
       0.00467349, 0.00724976, 0.02696994]
mean = [-1314.773    ,  -256.68332  ,  -139.03033  ,   -91.24173  ,
        -111.323074 ,  -109.316895 ,   -63.288567 ,   -14.2479315]
mean, std = torch.Tensor(mean).to("mps"), torch.Tensor(std).to("mps")

query = "Black"
# vae = torch.load("models/vae_promaton.pth")
vae.eval()
vae.to("mps")

steps = 0 #current transformation matrix
mat2 = torch.eye(embedding_size)
sliding_window = torch.ones([40, embedding_size]).to("mps")

stream = RealTimeAudioStream(query=query, z_dim=embedding_size)
stream.start_audio_process()
response = None
ind = 0
with torch.no_grad():
    while True:
        response = stream.step_process()
        rms, zcr, fft = response
        fft = fft.to(torch.float32).to('mps')
        
        # here we are perfoming a basis change.
        # basis 1 is the last one, we've been to
        # then we pick random basis 2 and slowly
        # traversing towards it
        if steps==0:
            steps = random.randint(2, 200)
            mat1 = mat2.clone()
            mat2 = random_rotation_matrix(embedding_size)

            rots = interpolate_transforms(mat2, mat1, steps)
        
        steps -= 1
        r_m = rots[steps].to("mps")

        # rms = ((rms - 0.15) * 12) # 0.3
        # zcr = ((zcr - 0.07) * 24)# 0.14
        # print(angle_r, rms, zcr, end="\r")
        # z = (torch.tensor([rms, zcr]) @ r_m).unsqueeze(0).to("mps") 
        ######
        z = fft[0, :, 0].float()
        sliding_window[ind % 40] = z
        ind += 1

        z -= torch.mean(sliding_window, 0)
        z /= torch.std(sliding_window, 0) + 0.1
        z *= 1.5
        print(z)

        # z = z @ r_m
        ######
        # print(fft)

        sample = vae.decoder(z.to("mps"))
        image = sample[0].permute(1, 2, 0).detach().cpu().numpy()
        image = (image + 1) / 2 
        image = (image * 255).astype(np.uint8)
        image = cv2.resize(image, (1024, 512))


        clear_output(wait=True)

        cv2.imshow("generation", image)

        k = cv2.waitKey(33)
        if k==ord('q'):    # q key to stop
            cv2.destroyAllWindows()
            break

for i in range(2):
    cv2.waitKey(1)

cv2.destroyAllWindows()


#audio_stream.stream()

KeyboardInterrupt: 

In [None]:
-np.mean(values, 0)

array([-0.51353484, -0.34026122, -0.2533055 , -0.3103051 , -0.43936414,
       -0.47857332, -0.42765737, -0.28083706], dtype=float32)

In [None]:
1 / np.var(values, 0)

array([3.1778342e-07, 3.2762962e-06, 8.9103723e-06, 2.4736308e-05,
       2.2673956e-05, 2.1841470e-05, 5.2558982e-05, 7.2737777e-04],
      dtype=float32)

In [None]:
np.square([2.1409633e-07, 5.9963389e-07, 1.2509867e-06, 2.2082168e-06,
       2.3094976e-06, 2.4036513e-06, 3.9445022e-06, 1.3495573e-05])

array([4.58372385e-14, 3.59560802e-13, 1.56496772e-12, 4.87622144e-12,
       5.33377916e-12, 5.77753957e-12, 1.55590976e-11, 1.82130491e-10])

In [None]:
np.sqrt([3.1778342e-07, 3.2762962e-06, 8.9103723e-06, 2.4736308e-05,
       2.2673956e-05, 2.1841470e-05, 5.2558982e-05, 7.2737777e-04])

array([0.00056372, 0.00181005, 0.00298502, 0.00497356, 0.00476172,
       0.00467349, 0.00724976, 0.02696994])

In [None]:
model_pth = torch.load("models/vae_promaton.pth")
vae = VAE(sample_x=example_data, hidden_dims=None, z_dim=embedding_size)
vae.load_state_dict(model_pth)

[256, 128, 64, 32, 16]
[(8, 16), (16, 32), (32, 64), (64, 128), (128, 256)]


TypeError: Expected state_dict to be dict-like, got <class 'utils.vaes.VAE'>.

In [None]:
from typing import Mapping


isinstance(model_pth, Mapping)

False

In [None]:
vae

VAE(
  (encoder_layers): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (1): Sequential(
      (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (2): Sequential(
      (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (3): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
  )
  (fc_mu): Line