# Improving Speech Audio Quality on Edge Devices with Lightweight Latent Diffusion Models

Kevin Putra Santoso - Avalon AI - Institut Teknologi Sepuluh Nopember

My Final Year Thesis

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from model.vae.modules import Encoder, Decoder
from model.vae.enc_dec import AutoencoderKL

In [2]:
autoencoder = AutoencoderKL(
    enc_config={"in_channels": 1, "intermediate_channels": 64, "channel_multipliers":[1, 2, 4, 8], "resblock_counts": 2, "attn_resolutions": [16], "dropout": 0.1, "resolution": 128, "z_channels": 64, "double_z": True},
    dec_config={"out_channels": 1, "intermediate_channels": 64, "channel_multipliers":[8, 4, 2, 1], "resblock_counts": 2, "attn_resolutions": [16], "dropout": 0.1, "resolution": 128, "z_channels": 64, "double_z": False, "give_pre_end": False, "tanh_out": False},
)

Melakukan operasi pada z dengan dimensi (1, 64, 16, 32) = 32768 dimensi.


In [3]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(autoencoder))

43690113


In [5]:
encoder = Encoder(in_channels=1, intermediate_channels=64, channel_multipliers=[1, 2, 4, 8], resblock_counts=2, attn_resolutions=[16], dropout=0.1, resolution=128, z_channels=64, double_z=False)
decoder = Decoder(out_channels=1, intermediate_channels=64, channel_multipliers=[8, 4, 2, 1], resblock_counts=2, attn_resolutions=[16], dropout=0.1, resolution=128, z_channels=64, double_z=True, give_pre_end=False, tanh_out=False)

# Hitung kompleksitas dari Encoder dan Decoder
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Jumlah parameter encoder: ", count_parameters(encoder))
print("Jumlah parameter decoder: ", count_parameters(decoder))
print("Jumlah parameter autoencoder: ", count_parameters(autoencoder))

Melakukan operasi pada z dengan dimensi (1, 64, 16, 32) = 32768 dimensi.
Jumlah parameter encoder:  24838528
Jumlah parameter decoder:  18474113
Jumlah parameter autoencoder:  43690113


In [9]:
sample_input = torch.randn(1, 1, 128, 256)
sample_out, posterior = autoencoder(sample_input)

In [10]:
sample_out

tensor([[[[ 0.0719,  0.1570,  0.7188,  ...,  0.7362,  0.6994,  0.1857],
          [ 0.2453,  0.3191,  1.0423,  ...,  1.4492,  0.7700,  0.6347],
          [ 0.3229,  0.7446,  0.8214,  ...,  1.1271,  0.8766,  0.4230],
          ...,
          [ 0.0472,  0.1720,  0.3200,  ..., -0.0358,  0.3865,  0.3527],
          [ 0.0199,  0.1468,  0.0680,  ..., -0.1693, -0.0338,  0.0712],
          [ 0.0605,  0.0740,  0.0959,  ..., -0.1611, -0.1097, -0.0829]]]],
       grad_fn=<ConvolutionBackward0>)

In [11]:
sample_out.shape

torch.Size([1, 1, 128, 256])

In [31]:
posterior

<model.vae.distributions.DiagonalGaussianDistribution at 0x20d5aa90be0>