In [None]:
# Record the interpreter version this notebook is being run with.
from platform import python_version
print(python_version())

In [1]:
import torchaudio
import torch

import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset

# Prefer the GPU, but fall back to the CPU so the notebook can still be
# executed on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [2]:
# Paths of the three clips used for training.
aud_1, aud_2, aud_3 = (
    "data/Audio/One_Punch_Man_1.wav",
    "data/Audio/One_Punch_Man_5.wav",
    "data/Audio/One_Punch_Man_6.wav",
)

# Each torchaudio.load call yields a (waveform, sample rate) pair; the files
# are read in the same order as the paths above.
(waveform_1, sample_freq_1), (waveform_2, sample_freq_2), (waveform_3, sample_freq_3) = (
    torchaudio.load(path) for path in (aud_1, aud_2, aud_3)
)

# Only the waveforms are collected into `data`.
data = [waveform_1, waveform_2, waveform_3]

In [3]:
def pad_audio(data):
    """Zero-pad every clip to the longest one and stack them into one tensor.

    Args:
        data: non-empty list of 2-D tensors indexed as (channels, samples).
            All clips must share the same channel count. The list is updated
            in place: each entry is replaced by its padded version.

    Returns:
        Tensor of shape (len(data), channels, longest), where the shorter
        clips have zeros appended after their last sample.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("pad_audio() needs at least one waveform")

    longest = max(clip.shape[1] for clip in data)
    for i, clip in enumerate(data):
        # Derive the padding from the clip itself so that any channel count,
        # dtype and device works (the channel count used to be fixed at 2).
        zeros = torch.zeros(
            clip.shape[0],
            longest - clip.shape[1],
            dtype=clip.dtype,
            device=clip.device,
        )
        data[i] = torch.cat((clip, zeros), dim=1)
    return torch.stack(data)

# Batch the clips into one tensor: pad_audio pads each clip to the longest
# one and stacks them along a new leading dimension.
tensor_data = pad_audio(data)

In [4]:
label_1 = "data/Labels/nick/One_Punch_Man_1.label"
label_2 = "data/Labels/nick/One_Punch_Man_5.label"
label_3 = "data/Labels/nick/One_Punch_Man_6.label"

# The first four lines of every label file are parsed as integers.
labels = []
for filename in [label_1, label_2, label_3]:
    # The context manager closes the file even if a line fails to parse.
    with open(filename, "r") as f:
        label = [int(f.readline()) for _ in range(4)]
    labels.append(label)
labels = torch.tensor(labels, dtype=torch.float32)

# Standardise each of the four targets across the files (zero mean, unit
# standard deviation). NOTE: a target that is identical in every file has a
# standard deviation of 0 and would turn into NaN here.
l_mean = labels.mean(dim=0)
l_std = labels.std(dim=0)
labels = (labels - l_mean) / l_std
print(labels)

tensor([[ 0.5897,  0.5911,  0.7170,  0.7328],
        [-1.1546, -1.1546,  0.4254,  0.4065],
        [ 0.5649,  0.5635, -1.1424, -1.1392]])


#### PyTorch Dataset

In [None]:
# Not being used yet
class AnimeAudioDataset(Dataset):
    """Map-style dataset serving raw waveforms loaded with torchaudio.

    Every file is read eagerly in the constructor. Only the waveform is kept;
    the sample rate returned by ``torchaudio.load`` is discarded. Items are
    returned exactly as loaded (no padding is applied).
    """

    # Clips that are loaded when no explicit list of paths is given.
    DEFAULT_AUDIO_PATHS = (
        "data/Audio/One_Punch_Man_1.wav",
        "data/Audio/One_Punch_Man_5.wav",
        "data/Audio/One_Punch_Man_6.wav",
    )

    def __init__(self, audio_paths=None):
        """Load every clip up front.

        Args:
            audio_paths: optional iterable of audio file paths. When omitted,
                ``DEFAULT_AUDIO_PATHS`` is used.
        """
        if audio_paths is None:
            audio_paths = self.DEFAULT_AUDIO_PATHS
        self.data = [torchaudio.load(path)[0] for path in audio_paths]

    def __len__(self):
        """Return the number of loaded clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the waveform stored at position ``idx``."""
        # idx can be a tensor
        return self.data[idx]

    # The accessor used to be spelled ``__get__item``, which Python never
    # calls for ``dataset[idx]``. The old spelling is kept as an alias so any
    # existing reference to it keeps working.
    __get__item = __getitem__

# Build the dataset; its constructor reads the audio files from disk right away.
anime_audio_data = AnimeAudioDataset()

## Define Network

In [5]:
class Net(nn.Module):
    """Small 1-D CNN regressor: Conv1d -> ReLU -> MaxPool1d -> Linear.

    Takes waveforms with two channels and produces four outputs per clip.
    """

    def __init__(self, magic_num=234395 * 4):
        """Build the layers.

        Args:
            magic_num: number of features every sample has once the conv/pool
                output is flattened (4 conv channels * pooled length). It
                depends on the length of the input waveforms; the default is
                the value this notebook was written for.
        """
        super(Net, self).__init__()
        self.magic_num = magic_num
        self.conv1 = nn.Conv1d(2, 4, 1600, stride=10)
        self.pool = nn.MaxPool1d(5)
        self.fc1 = nn.Linear(self.magic_num, 4)

    def forward(self, x):
        """Run the network.

        Args:
            x: tensor of shape (batch, 2, samples); a single unbatched clip
                of shape (2, samples) is treated as a batch of one.

        Returns:
            Tensor of shape (batch, 4).

        Raises:
            RuntimeError: if the flattened feature count per sample differs
                from ``magic_num``.
        """
        if x.dim() == 2:
            x = x.unsqueeze(0)
        x = self.pool(F.relu(self.conv1(x)))
        # Flatten per sample. Reshaping with (-1, magic_num) could silently
        # fold several samples into each other when magic_num does not match
        # the input length.
        x = x.view(x.size(0), -1)
        if x.size(1) != self.magic_num:
            raise RuntimeError(
                "flattened feature size {} does not match magic_num {}".format(
                    x.size(1), self.magic_num
                )
            )
        x = self.fc1(x)
        return x
    
# Instantiate the model once here; the next cell rebinds `net` to a fresh
# instance before training.
net = Net()

In [6]:
# Fresh model plus the loss and optimiser used by the training loop.
net = Net()

# Mean-squared-error loss, placed on the training device.
criterion = nn.MSELoss()
criterion = criterion.to(device)

# SGD without momentum.
optimizer = optim.SGD(params=net.parameters(), lr=0.003, momentum=0)

In [7]:
# Put the inputs, the targets and the model on the same device.
tensor_data = tensor_data.to(device)
labels = labels.to(device)
# Follow `device` for the model too instead of hard-coding CUDA. Assigning
# the result also keeps the cell from echoing the module repr.
net = net.to(device)

In [8]:
# Full-batch training: every step uses all clips at once.
for epoch in range(20):

    optimizer.zero_grad()

    outputs = net(tensor_data)
    loss = criterion(outputs, labels)
    # print('loss', loss.item())
    loss.backward()
    optimizer.step()

# Compare the per-clip predictions with the targets. This is inference only,
# so autograd is switched off to avoid building a graph for each forward pass.
with torch.no_grad():
    for d in tensor_data:
        print(net(d.unsqueeze(0)))
print(labels)

tensor([[0.5897, 0.5911, 0.7170, 0.7328]], device='cuda:0',
       grad_fn=<AddmmBackward>)
tensor([[-1.1546, -1.1546,  0.4253,  0.4064]], device='cuda:0',
       grad_fn=<AddmmBackward>)
tensor([[ 0.5649,  0.5634, -1.1423, -1.1392]], device='cuda:0',
       grad_fn=<AddmmBackward>)
tensor([[ 0.5897,  0.5911,  0.7170,  0.7328],
        [-1.1546, -1.1546,  0.4254,  0.4065],
        [ 0.5649,  0.5635, -1.1424, -1.1392]], device='cuda:0')


In [None]:
# util to find magic numbers
# Push one clip through the conv/pool front end and report how many features
# each sample has once flattened; `net.magic_num` has to equal that count.
# (The earlier version of this cell called net.fc2 / net.fc3, which the
# current Net does not define, and reshaped with a stale size.)
probe_device = next(net.parameters()).device  # the model may already be on the GPU
with torch.no_grad():
    x = data[0].unsqueeze(0).to(probe_device)
    x = net.pool(F.relu(net.conv1(x)))
print(x.shape)
x = x.view(x.size(0), -1)
print(x.shape[1])

In [None]:
# find magic num
# The model may already have been moved to the GPU, so the probe clip is
# placed on whatever device the model's parameters live on.
model_device = next(net.parameters()).device
# Shape probing only: no gradients are needed.
with torch.no_grad():
    x = data[0].unsqueeze(0).to(model_device)
    x = net.pool(F.relu(net.conv1(x)))
    # x = net.pool(F.relu(net.conv2(x)))
    print(x.shape)
    x = x.view(-1, net.magic_num)
    x = net.fc1(x)
print(x)

In [None]:
# Load another clip; `waveform` is what the spectrogram cell below consumes.
test_audio = "data/Audio/Haikyuu!!_14.wav"
waveform, sample_rate = torchaudio.load(test_audio)
# The triple-quoted string below is disabled exploration code (it would print
# the waveform shape and sample rate and plot the waveform). As a bare string
# literal it has no effect when the cell runs.
'''
print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

plt.figure()
plt.plot(waveform.t().numpy())
'''
# Ending on `pass` keeps the string above from being the cell's last expression.
pass

In [None]:
# Spectrogram of the loaded clip; the transform is built with no arguments,
# i.e. with torchaudio's default settings.
specgram = torchaudio.transforms.Spectrogram()(waveform)

In [None]:
# Report the dimensions of the spectrogram tensor.
spec_shape = specgram.size()
print("Shape of spectrogram: {}".format(spec_shape))

# Show the log2 spectrogram at index 0 of the first dimension as a grayscale image.
first_slice_log = specgram.log2()[0, :, :]
plt.figure()
plt.imshow(first_slice_log.numpy(), cmap='gray')

In [None]:
# Inspect the raw log2 values for ten columns (indices 1000-1009 of the last
# axis) at index 0 of the first axis.
specgram.log2()[0,:,1000:1010].numpy()