In [1]:
import torch
import torch.nn as nn

# 입력 텐서 정의
input_tensor = torch.tensor([[[1, 1, 1]], [[0, 0, 0]]], dtype=torch.float32)

# 텐서를 FloatTensor로 변환하고 배치 차원 추가
con = nn.Conv1d(in_channels=1, out_channels=5, kernel_size=1,
                              stride=1, dilation=1, padding=0, bias=False)

# 출력 텐서 확인



---

In [None]:
import config
import torch
import numpy as np
from torchlibrosa.stft import STFT, ISTFT, magphase
import torch.nn as nn
import torch.nn.functional as F

In [None]:
window_size = 2048
hop_size = config.hop_samples
window = 'hann'
pad_mode = 'reflect'
center = True
momentum = 0.01
downsample_ratio = 2**6
channels=2
activation='relu'

In [None]:
stft = STFT(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, 
            pad_mode=pad_mode, freeze_parameters=True)

In [None]:
def wav_to_spectrogram(input):
    """Waveform to spectrogram.

    Args:
        input: (batch_size, segment_samples, channels_num)

    Outputs:
        output: (batch_size, channels_num, time_steps, freq_bins)
    """
    sp_list = []
    #####
    channels_num = input.shape[2]
    for channel in range(channels_num):
        sp_list.append(spectrogram(input[:, :, channel]))
    #####
    # for _ in range(self.channels):
    #     sp_list.append(self.spectrogram(input[:,:,0]))

    output = torch.cat(sp_list, dim=1)
    return output

In [None]:
def spectrogram(input):
    (real, imag) = stft(input)
    return (real ** 2 + imag ** 2) ** 0.5

In [None]:
bn0 = nn.BatchNorm2d(window_size // 2 + 1, momentum=momentum)

In [None]:
encoder_block1 = EncoderBlock(in_channels=channels, out_channels=32, 
            downsample=(2, 2), activation=activation, momentum=momentum, classes_num = config.latent_dim)

In [None]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)
 
    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_emb(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.uniform_(layer.weight, -0.1, 0.1)
 
    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)

def act(x, activation):
    if activation == 'relu':
        return F.relu_(x)

    elif activation == 'leaky_relu':
        return F.leaky_relu_(x, negative_slope=0.2)

    elif activation == 'swish':
        return x * torch.sigmoid(x)

    else:
        raise Exception('Incorrect activation!')

def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)


In [None]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, size, activation, momentum, classes_num = 527):
        super(ConvBlock, self).__init__()

        self.activation = activation
        pad = size // 2

        self.conv1 = nn.Conv2d(in_channels=in_channels, 
                              out_channels=out_channels,
                              kernel_size=(size, size), stride=(1, 1), 
                              dilation=(1, 1), padding=(pad, pad), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum)

        self.conv2 = nn.Conv2d(in_channels=out_channels, 
                              out_channels=out_channels,
                              kernel_size=(size, size), stride=(1, 1), 
                              dilation=(1, 1), padding=(pad, pad), bias=False)

        self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum)
        # change autotagging size
        #####
        self.emb1 = nn.Linear(classes_num, out_channels, bias=True)
        self.emb2 = nn.Linear(classes_num, out_channels, bias=True)
        ####
        # self.emb1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=size,
        #                       stride=1, dilation=1, padding=pad, bias=False)
        # self.emb2 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=size,
        #                       stride=1, dilation=1, padding=pad, bias=False)
        self.init_weights()
        
    def init_weights(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)
        init_emb(self.emb1)
        init_emb(self.emb2)

    # latent query embedded 
    def forward(self, x, condition):
        c1 = self.emb1(condition)
        c2 = self.emb2(condition)
        x = act(self.bn1(self.conv1(x)), self.activation) + c1[:, :, None, None]
        x = act(self.bn2(self.conv2(x)), self.activation) + c2[:, :, None, None]
        return x

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, in_channels, out_channels, downsample, activation, momentum, classes_num = 527):
        super(EncoderBlock, self).__init__()
        size = 3

        self.conv_block = ConvBlock(in_channels, out_channels, size, activation, momentum, classes_num)
        self.downsample = downsample

    def forward(self, x, condition):
        encoder = self.conv_block(x, condition)
        encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
        return encoder_pool, encoder

---

In [None]:
import librosa

y, _ = librosa.load("/Users/cooky/HDD/Drum/tactspack/drum loops/drumroll 12_sel.wav", mono=True)

In [None]:
# input = y.reshape(1,y.shape[0],1)
input = np.array([y,y,y,y,y]).reshape(5,y.shape[0],1)

In [None]:
input.shape

(5, 18462, 1)

In [None]:
input = torch.tensor(input, dtype=torch.float32)

In [None]:
input.shape

torch.Size([5, 18462, 1])

In [None]:
sp = wav_to_spectrogram(input)

In [None]:
x = sp.transpose(1,3)
x = bn0(x)
x = x.transpose(1,3)

In [None]:
x.shape

torch.Size([5, 1, 58, 1025])

In [None]:
origin_len = x.shape[2]
pad_len = int(np.ceil(x.shape[2] / downsample_ratio)) \
    * downsample_ratio - origin_len
x = F.pad(x, pad=(0, 0, 0, pad_len))


In [None]:
x.shape

torch.Size([5, 1, 64, 1025])

In [None]:
x = x[..., 0 : x.shape[-1] - 1]

In [None]:
x.shape

torch.Size([5, 1, 64, 1024])

In [None]:
encoder_block1 = EncoderBlock(in_channels=channels, out_channels=32, 
            downsample=(2, 2), activation=activation, momentum=momentum, classes_num = 99)

In [None]:
condition = torch.tensor(np.zeros((1,99)),dtype=torch.float32)

In [None]:
encoder_block1(x, condition)

RuntimeError: Given groups=1, weight of size [32, 5, 3, 3], expected input[1, 1, 64, 1024] to have 5 channels, but got 1 channels instead

In [None]:

size = 3
pad = size//2
conv_block = ConvBlock(channels, 32, size, activation, momentum, 99)


In [None]:
conv1 = nn.Conv2d(in_channels=1, 
                              out_channels=5,
                              kernel_size=(size, size), stride=(1, 1), 
                              dilation=(1, 1), padding=(pad, pad), bias=False)

In [None]:
conv1(x).shape

torch.Size([5, 5, 64, 1024])

In [None]:
bn1 = nn.BatchNorm2d(5, momentum=momentum)

In [None]:
x1 = act(bn1(conv1(x)), activation)

In [None]:
x1.shape

torch.Size([5, 5, 64, 1024])

In [None]:
emb1 = nn.Linear(99, 5, bias=True)

In [None]:
emb_imsi = nn.Conv1d(in_channels=channels, 
                              out_channels=5*5,
                              kernel_size=size, stride=1,
                              dilation=1, padding=pad, bias=False)

In [None]:
condition = torch.tensor(np.zeros((5,99)),dtype=torch.float32)

In [None]:
c1 = emb1(condition)
# c1 = emb_imsi(condition)


In [None]:
c1.shape

torch.Size([5, 5])

In [None]:
c11 = c1[:,:,None,None]
c11.shape

torch.Size([5, 5, 1, 1])

In [None]:
(x1 + c11).shape

torch.Size([5, 5, 64, 1024])

In [None]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, size, activation, momentum, classes_num = 527):
        super(ConvBlock, self).__init__()

        self.activation = activation
        pad = size // 2

        self.conv1 = nn.Conv2d(in_channels=in_channels, 
                              out_channels=out_channels,
                              kernel_size=(size, size), stride=(1, 1), 
                              dilation=(1, 1), padding=(pad, pad), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum)

        self.conv2 = nn.Conv2d(in_channels=out_channels, 
                              out_channels=out_channels,
                              kernel_size=(size, size), stride=(1, 1), 
                              dilation=(1, 1), padding=(pad, pad), bias=False)

        self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum)
        # change autotagging size
        #####
        self.emb1 = nn.Linear(classes_num, out_channels, bias=True)
        self.emb2 = nn.Linear(classes_num, out_channels, bias=True)
        ####
        # self.emb1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=size,
        #                       stride=1, dilation=1, padding=pad, bias=False)
        # self.emb2 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=size,
        #                       stride=1, dilation=1, padding=pad, bias=False)
        self.init_weights()
        
    def init_weights(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)
        init_emb(self.emb1)
        init_emb(self.emb2)

    # latent query embedded 
    def forward(self, x, condition):
        c1 = self.emb1(condition)
        c2 = self.emb2(condition)
        x = act(self.bn1(self.conv1(x)), self.activation) + c1[:, :, None, None]
        x = act(self.bn2(self.conv2(x)), self.activation) + c2[:, :, None, None]
        return x

In [None]:

encoder = conv_block(x, condition)


RuntimeError: The size of tensor a (1024) must match the size of tensor b (32) at non-singleton dimension 4

In [None]:
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder

NameError: name 'encoder' is not defined

----

In [1]:
import torchvggish.vggish as vggish

model_urls = {
    'vggish': 'https://github.com/harritaylor/torchvggish/'
              'releases/download/v0.1/vggish-10086976.pth',
    'pca': 'https://github.com/harritaylor/torchvggish/'
           'releases/download/v0.1/vggish_pca_params-970ea276.pth'
}

In [2]:
model = vggish.VGGish(model_urls)

Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" to /Users/cooky/.cache/torch/hub/checkpoints/vggish-10086976.pth
100.0%
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish_pca_params-970ea276.pth" to /Users/cooky/.cache/torch/hub/checkpoints/vggish_pca_params-970ea276.pth
100.0%


In [None]:
import torchaudio.prototype.pipelines.VGGishBundle.VGGish as VGGish

OSError: dlopen(/Users/cooky/miniforge3/envs/cid2rch/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so, 0x0006): Symbol not found: __ZN2at14RecordFunctionC1ENS_11RecordScopeEb
  Referenced from: <E741B6D5-E348-3601-ACC9-BC3101AD112C> /Users/cooky/miniforge3/envs/cid2rch/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so
  Expected in:     <AAE88793-2D9D-3CCA-96C4-EAC30CEA4202> /Users/cooky/miniforge3/envs/cid2rch/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib