# samples
This notebook learns directly from audio sample data, in chunks.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio

In [2]:
torch.cuda.is_available()

True

In [56]:
class Featurizer(nn.Module):
    def __init__(self, block_size):
        super().__init__()
        self.block_size = block_size

    def forward(self, x, batches=10) -> torch.Tensor:
        trim = x.numel() % self.block_size
        blocks = (x.numel() - trim) // self.block_size
        block_trim = blocks % batches
        return x[:-(trim+block_trim*self.block_size)].reshape(-1, batches, self.block_size)


In [48]:
def load(path, device="cpu"):
    audio, sr = torchaudio.load(path)
    audio = audio.to(device)
    if audio.ndim == 2:
        return audio.sum(axis=0), sr
    else:
        return audio, sr

In [49]:
audio, sr = load("../data/grains.wav", "cuda")
audio.numel()

728102

In [69]:
BLOCK_SIZE=64
BATCH_SIZE=40
featurizer = Featurizer(BLOCK_SIZE).to("cuda")

In [55]:
audio.numel()

728102

In [70]:
featurized = featurizer(audio, BATCH_SIZE)
featurized.shape

torch.Size([284, 40, 64])

In [33]:
class Model(nn.Module):
    def __init__(self, block_size, num_layers, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(block_size, hidden_size, num_layers)
        self.output = nn.Linear(hidden_size, block_size)
    def forward(self, x, hidden):
        output, hidden = self.lstm(x, hidden)
        return self.output(output), hidden
    def init_hidden(self, batch_size=1, device="cpu"):
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        return torch.zeros(shape, dtype=torch.float32, device=device), torch.zeros(shape, dtype=torch.float32, device=device)

In [91]:
model = Model(BLOCK_SIZE, 8, 128).to("cuda")

In [92]:
loss = nn.MSELoss()

In [95]:
optimizer = optim.Adam(model.parameters(), lr=5e-5)

In [97]:
num_epochs=100000
for epoch in range(num_epochs):
    init_hidden = model.init_hidden(BATCH_SIZE, "cuda")
    output, _ = model(featurized, init_hidden)
    epoch_loss = loss(output, featurized)
    epoch_loss.backward()
    optimizer.step()
    if (epoch + 1) % 5000 == 0:
        print(f"Epoch {epoch+1}, loss: {epoch_loss}")

Epoch 5000, loss: 0.010262319818139076
Epoch 10000, loss: 0.010068058036267757
Epoch 15000, loss: 0.00972263514995575
Epoch 20000, loss: 0.009172793477773666
Epoch 25000, loss: 0.008535410277545452
Epoch 30000, loss: 0.00782046653330326
Epoch 35000, loss: 0.007376702036708593
Epoch 40000, loss: 0.0069656274281442165
Epoch 45000, loss: 0.00668838107958436
Epoch 50000, loss: 0.006453242618590593
Epoch 55000, loss: 0.00618392089381814
Epoch 60000, loss: 0.0059726485051214695
Epoch 65000, loss: 0.005835735704749823
Epoch 70000, loss: 0.005791875999420881
Epoch 75000, loss: 0.005632910877466202
Epoch 80000, loss: 0.0055466145277023315
Epoch 85000, loss: 0.005518725607544184
Epoch 90000, loss: 0.00542448740452528
Epoch 95000, loss: 0.005484931170940399
Epoch 100000, loss: 0.0053152828477323055


## Generation
Given an input prompt, generate a continuation.

In [99]:
prompt = featurizer(audio, 1)
prompt.shape

torch.Size([11376, 1, 64])

In [104]:
output, hidden = model(prompt, model.init_hidden(1, "cuda"))
output.shape

torch.Size([11376, 1, 64])

In [121]:
num_blocks=512
hidden = model.init_hidden(1, "cuda")
output, hidden = model(prompt, hidden)
new_audio = torch.cat((prompt, output[-1:]))
for _ in range(num_blocks):
    output, hidden = model(output[-1:], hidden)
    new_audio = torch.cat((new_audio, output[-1:]))
torchaudio.save("../data/output.wav", new_audio.reshape(-1).unsqueeze(0).to("cpu"), sr)

In [115]:
new_audio.reshape(-1).unsqueeze(0).shape

torch.Size([1, 736320])

## A more sophisticated approach

In [120]:
source_audio, sr = load("../data/687984__girlwithsoundrecorder__stefan-a-frog-from-poland.wav", "cuda")

In [138]:
BLOCK_SIZE=128
BATCH_SIZE=500
featurizer = Featurizer(BLOCK_SIZE).to("cuda")

In [139]:
training = featurizer(source_audio, BATCH_SIZE)
training.shape

torch.Size([53, 500, 128])

In [146]:
model = Model(BLOCK_SIZE, 16, 128).to("cuda")
loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), 5e-5)

In [None]:
num_epochs=10000
for epoch in range(num_epochs):
    init_hidden = model.init_hidden(BATCH_SIZE, "cuda")
    output, _ = model(training, init_hidden)
    epoch_loss = loss(output, training)
    epoch_loss.backward()
    optimizer.step()
    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1}, loss: {epoch_loss}")