In [None]:
from platform import python_version

# Record which interpreter version this notebook was executed with.
interpreter_version = python_version()
print(interpreter_version)

In [1]:
import torchaudio
import torch

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU instead
# of hard-coding "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [3]:
# Paths of the three training clips.
aud_1, aud_2, aud_3 = (
    "data/Audio/One_Punch_Man_1.wav",
    "data/Audio/One_Punch_Man_5.wav",
    "data/Audio/One_Punch_Man_6.wav",
)

# torchaudio.load returns a (waveform, sample rate) pair for each file; the
# clips are loaded in order and unpacked into the same names as before.
(
    (waveform_1, sample_freq_1),
    (waveform_2, sample_freq_2),
    (waveform_3, sample_freq_3),
) = [torchaudio.load(path) for path in (aud_1, aud_2, aud_3)]

# Waveforms only, in file order.
data = [waveform_1, waveform_2, waveform_3]

In [4]:
def pad_audio(data):
    """Zero-pad every waveform to the longest one and stack them into a batch.

    Args:
        data: non-empty list of 2-D tensors shaped (channels, samples). All
            entries must share the same channel count so they can be stacked.

    Returns:
        A tensor of shape (len(data), channels, longest_sample_count).

    Raises:
        ValueError: if ``data`` is empty (raised by ``max``).

    Note:
        ``data`` is modified in place: each entry is replaced by its padded
        version, so callers that index ``data`` afterwards see padded clips.
    """
    longest = max(clip.shape[1] for clip in data)
    for i, clip in enumerate(data):
        # new_zeros inherits dtype and device from the clip, and the channel
        # count comes from the clip itself rather than a hard-coded 2.
        padding = clip.new_zeros((clip.shape[0], longest - clip.shape[1]))
        data[i] = torch.cat((clip, padding), dim=1)
    return torch.stack(data)

# Batch the waveforms into one tensor. Note that pad_audio also replaces the
# entries of `data` with their zero-padded versions.
tensor_data = pad_audio(data)

In [5]:
label_1 = "data/Labels/nick/One_Punch_Man_1.label"
label_2 = "data/Labels/nick/One_Punch_Man_5.label"
label_3 = "data/Labels/nick/One_Punch_Man_6.label"

# Read the first four lines of every label file as integers.
labels = []
for filename in [label_1, label_2, label_3]:
    # The context manager closes the handle even if a line fails to parse.
    with open(filename, "r") as f:
        label = [int(f.readline()) for _ in range(4)]
    labels.append(label)
labels = torch.Tensor(labels)
# NOTE(review): presumably converts milliseconds to seconds -- confirm the
# unit of the label files.
labels = labels / 1000
print(labels)
print(labels.mean(dim=0))
# Centre each of the four targets on its mean across the files.
labels = labels - labels.mean(dim=0)
print(labels)

tensor([[ 196.2080,  285.5490, 1335.5129, 1423.5129],
        [  75.5830,  164.4650, 1327.3361, 1414.3390],
        [ 194.4970,  283.6300, 1283.3770, 1370.8800]])
tensor([ 155.4293,  244.5480, 1315.4087, 1402.9106])
tensor([[ 40.7787,  41.0010,  20.1042,  20.6023],
        [-79.8463, -80.0830,  11.9274,  11.4283],
        [ 39.0677,  39.0820, -32.0317, -32.0306]])


#### PyTorch Dataset

In [6]:
# Not being used yet
class AnimeAudioDataset(Dataset):
    """Map-style dataset that holds the waveforms of a set of audio files.

    Args:
        audio_paths: optional iterable of audio file paths to load. When
            omitted, the three One Punch Man clips in DEFAULT_AUDIO_PATHS are
            loaded.

    Every file is loaded in ``__init__``; only the waveform returned by
    ``torchaudio.load`` is kept, the sample rate is discarded.
    """

    DEFAULT_AUDIO_PATHS = (
        "data/Audio/One_Punch_Man_1.wav",
        "data/Audio/One_Punch_Man_5.wav",
        "data/Audio/One_Punch_Man_6.wav",
    )

    def __init__(self, audio_paths=None):
        if audio_paths is None:
            audio_paths = self.DEFAULT_AUDIO_PATHS
        # torchaudio.load returns (waveform, sample_rate); keep the waveform.
        self.data = [torchaudio.load(path)[0] for path in audio_paths]

    def __len__(self):
        """Return the number of loaded waveforms."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the waveform stored at position ``idx``.

        This was previously spelled ``__get__item``, which Python does not
        treat as the indexing hook, so ``dataset[i]`` never reached it.
        """
        # Original note: idx can be a tensor. Since self.data is a plain
        # list, that presumably means a single-element integer tensor --
        # TODO confirm.
        return self.data[idx]

# Constructing the dataset loads the audio files immediately (see __init__).
anime_audio_data = AnimeAudioDataset() 

## Define Network

In [9]:
class Net(nn.Module):
    """One Conv1d + max-pool stage followed by a linear head with 4 outputs.

    Args:
        magic_num: number of features per sample once the pooled conv output
            is flattened, i.e. 4 output channels times the pooled length. The
            default, 234395 * 4, matches the pooled shape [1, 4, 234395]
            printed for the training clips in this notebook.

    Earlier experiments with a second conv layer (Conv1d(4, 8, 400,
    stride=10)) and a three-layer head (fc2/fc3) are currently disabled.
    """

    def __init__(self, magic_num=234395 * 4):
        super().__init__()
        self.magic_num = magic_num
        self.conv1 = nn.Conv1d(2, 4, 1600, stride=10)
        self.pool = nn.MaxPool1d(5)
        self.fc1 = nn.Linear(self.magic_num, 4)

    def forward(self, x):
        """Map a (batch, 2, samples) tensor to a (batch, 4) prediction."""
        x = self.pool(F.relu(self.conv1(x)))
        # Flatten per sample. Unlike view(-1, magic_num), this cannot fold a
        # length mismatch into the batch dimension; a wrong magic_num shows
        # up as a shape error in fc1 instead.
        x = x.flatten(start_dim=1)
        return self.fc1(x)

net = Net()

In [10]:
# Push one clip through the conv/pool stage by hand and show the pooled shape.
x = net.pool(F.relu(net.conv1(data[0].unsqueeze(0))))
print(x.shape)
# Flatten to the width fc1 expects, then run the linear head.
x = net.fc1(x.view(-1, net.magic_num))
print(x)

torch.Size([1, 4, 234395])
tensor([[-0.0022, -0.0142, -0.0081, -0.0295]], grad_fn=<AddmmBackward>)


In [30]:
# Start from freshly initialised weights and place the model on the target
# device before building the optimizer, as the torch.optim docs recommend.
net = Net().to(device)
criterion = nn.MSELoss().to(device)
# Plain SGD with no momentum.
optimizer = optim.SGD(net.parameters(), lr=0.0003, momentum=0)

In [31]:
# Put inputs, targets and model on the same device.
tensor_data = tensor_data.to(device)
labels = labels.to(device)
# .to(device) instead of .cuda() so the model follows whatever `device` is
# set to; as the last expression it still displays the module summary.
net.to(device)

Net(
  (conv1): Conv1d(2, 4, kernel_size=(1600,), stride=(10,))
  (pool): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=937580, out_features=4, bias=True)
)

In [32]:
# Full-batch training: every step uses all clips at once.
# NOTE(review): in the recorded run the loss falls to about 3 and then climbs
# again, which suggests the step size may be too large -- TODO confirm.
n_epochs = 10
for epoch in range(n_epochs):
    # Forward pass and loss on the whole batch.
    outputs = net(tensor_data)
    loss = criterion(outputs, labels)
    print('loss', loss.item())

    # Clear stale gradients, backpropagate, and take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

loss 1861.9652099609375
loss 1712.8060302734375
loss 1495.333740234375
loss 1058.256591796875
loss 354.7622985839844
loss 13.10226821899414
loss 3.1527061462402344
loss 27.252883911132812
loss 274.84857177734375
loss 675.4430541992188


In [None]:
# util to find magic numbers
# Runs one clip through conv1 + pool only and reports the flattened size that
# Net.magic_num (and therefore fc1.in_features) must equal. The linear layers
# are deliberately not applied: Net defines no fc2/fc3, and fc1 would fail in
# exactly the case this cell exists for, a wrong magic number.
with torch.no_grad():
    # Place the probe on whichever device the network's weights live on.
    x = data[0].unsqueeze(0).to(net.conv1.weight.device)
    x = net.pool(F.relu(net.conv1(x)))
print(x.shape)
# channels * pooled length = features per sample after flattening
print(x.shape[1] * x.shape[2])

In [None]:
test_audio = "data/Audio/Haikyuu!!_14.wav"
waveform, sample_rate = torchaudio.load(test_audio)
# Optional inspection of the raw clip, currently disabled. These are real
# comments rather than a no-op string literal, so no trailing `pass` is needed
# to hide the cell output.
# print("Shape of waveform: {}".format(waveform.size()))
# print("Sample rate of waveform: {}".format(sample_rate))
#
# plt.figure()
# plt.plot(waveform.t().numpy())

In [None]:
# Spectrogram of the test clip, using the transform's default settings
# (no arguments are passed to Spectrogram()).
specgram = torchaudio.transforms.Spectrogram()(waveform)

In [None]:
# Report the spectrogram dimensions.
print("Shape of spectrogram: {}".format(specgram.size()))

# Show the log2-scaled values of the index-0 slice (presumably the first
# audio channel -- TODO confirm) as a grayscale image.
plt.figure()
log_spec_slice = specgram.log2()[0, :, :].numpy()
plt.imshow(log_spec_slice, cmap='gray')

In [None]:
# Peek at the raw log2 values in columns 1000-1009 of the index-0 slice.
specgram.log2()[0,:,1000:1010].numpy()