In [1]:
# Print the interpreter version so the recorded output documents which Python
# this notebook was run with.
from platform import python_version
print(python_version())

3.6.9


In [2]:
import torchaudio
import torch

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset

import matplotlib.pyplot as plt
import numpy as np

## Load Data

In [3]:
# Paths of the three training episodes.
aud_1, aud_2, aud_3 = (
    "data/Audio/One_Punch_Man_1.wav",
    "data/Audio/One_Punch_Man_5.wav",
    "data/Audio/One_Punch_Man_6.wav",
)

# Each torchaudio.load call is unpacked into a (waveform, sample frequency)
# pair; the files are loaded in order 1, 2, 3.
(
    (waveform_1, sample_freq_1),
    (waveform_2, sample_freq_2),
    (waveform_3, sample_freq_3),
) = (torchaudio.load(path) for path in (aud_1, aud_2, aud_3))

# Only the waveforms are collected; the sample frequencies are not used here.
data = [waveform_1, waveform_2, waveform_3]

In [4]:
def pad_audio(data):
    """Zero-pad every waveform in ``data`` to the length of the longest one.

    Padding is appended at the end of the last dimension. The list is
    modified in place (each entry is replaced by its padded copy) and the
    same list object is returned, so code that later reads ``data`` sees the
    padded tensors.

    Args:
        data: list of tensors that agree on every dimension except the last.
            In this notebook they come from ``torchaudio.load`` (presumably
            shaped ``(channels, samples)`` -- TODO confirm).

    Returns:
        The same list object, with all entries sharing one last-dimension size.
        An empty list is returned unchanged.
    """
    if not data:
        # max() over an empty sequence would raise ValueError.
        return data

    longest = max(waveform.shape[-1] for waveform in data)
    for i, waveform in enumerate(data):
        missing = longest - waveform.shape[-1]
        # Build the padding from the waveform itself so the channel count,
        # dtype and device always match (a fixed 2-channel float32 block only
        # works for stereo float32 input).
        zeros = waveform.new_zeros(tuple(waveform.shape[:-1]) + (missing,))
        data[i] = torch.cat((waveform, zeros), dim=-1)
    return data

#tensor_data = torch.stack(pad_audio(data))

def normalize(data, eps=1e-8):
    """Standardise each tensor in ``data`` to zero mean and unit std.

    The mean and standard deviation are computed over all elements of each
    tensor (not per channel). The list is modified in place and the same list
    object is returned.

    Args:
        data: list of floating-point tensors.
        eps: lower bound applied to the standard deviation before dividing.
            It only takes effect when the std is smaller than ``eps`` (e.g. a
            constant/silent waveform), where it turns the 0/0 = NaN result
            into zeros.

    Returns:
        The same list object, with every entry replaced by its standardised
        version.
    """
    for i, waveform in enumerate(data):
        # Flooring the std leaves ordinary inputs untouched but keeps a
        # constant waveform from producing NaNs that would poison training.
        std = waveform.std().clamp(min=eps)
        data[i] = (waveform - waveform.mean()) / std
    return data

# Pad, then standardise, then stack the waveforms along a new leading axis.
# Both helpers assign back into the list they are given, so `data` itself
# holds the padded + normalised tensors after this line runs.
tensor_data = torch.stack(normalize(pad_audio(data)))

In [5]:
label_1 = "data/Labels/One_Punch_Man_1.label"
label_2 = "data/Labels/One_Punch_Man_5.label"
label_3 = "data/Labels/One_Punch_Man_6.label"

# Read the first four lines of every label file as integers.
labels = []
for filename in [label_1, label_2, label_3]:
    # `with` closes the file handle deterministically, even if a line fails
    # to parse as an int.
    with open(filename, "r") as f:
        labels.append([int(f.readline()) for _ in range(4)])

# normalize: standardise each file's four values to zero mean / unit std.
# NOTE: a file whose four values are all identical has std 0 and would
# produce NaN targets here.
labels = np.asarray(labels, dtype=np.float64)
labels = (labels - labels.mean(axis=1, keepdims=True)) / labels.std(axis=1, keepdims=True)

# Explicit float32 tensor of shape (number of files, 4); float32 is the dtype
# torch.Tensor(...) would produce as well.
labels = torch.tensor(labels, dtype=torch.float32)

### PyTorch Dataset

In [6]:
# Not being used yet
class AnimeAudioDataset(Dataset):
    """Map-style dataset holding one waveform tensor per audio file.

    Args:
        paths: optional iterable of audio file paths to load. When omitted,
            the three One Punch Man episodes in ``DEFAULT_PATHS`` are loaded.

    Attributes:
        data: list of waveform tensors, in the same order as ``paths``.
    """

    # Files loaded when no explicit paths are given.
    DEFAULT_PATHS = (
        "data/Audio/One_Punch_Man_1.wav",
        "data/Audio/One_Punch_Man_5.wav",
        "data/Audio/One_Punch_Man_6.wav",
    )

    def __init__(self, paths=None):
        if paths is None:
            paths = self.DEFAULT_PATHS
        # torchaudio.load returns a (waveform, sample frequency) pair; only
        # the waveform is kept.
        self.data = [torchaudio.load(path)[0] for path in paths]

    def __len__(self):
        """Return the number of loaded waveforms."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the waveform stored at position ``idx``.

        The method has to be spelled exactly ``__getitem__``: that is the name
        Python looks up for ``dataset[idx]``. A name such as ``__get__item``
        is never called by indexing and, because it does not end in two
        underscores, is name-mangled inside the class.
        """
        # idx can be a tensor (presumably a 0-dim integer tensor, which list
        # indexing accepts -- TODO confirm for the samplers actually used).
        return self.data[idx]

# Constructing the dataset loads the audio files from disk inside __init__.
anime_audio_data = AnimeAudioDataset() 

## Define Network

In [7]:
class Net(nn.Module):
    """Conv1d -> ReLU -> MaxPool1d -> three fully connected layers.

    The network takes 2-channel input of shape ``(batch, 2, length)`` and
    produces 4 values per sample.

    Args:
        flat_features: number of values per sample after the pooled
            convolution output is flattened (16 channels times the pooled
            length). It depends on the input length, so it must be changed
            whenever the audio length or the conv/pool settings change. The
            default ``585 * 16`` is the value this notebook was written with.
    """

    def __init__(self, flat_features=585 * 16):
        super(Net, self).__init__()
        # Kept under its historical name so existing references still work.
        self.magic_num = flat_features
        self.conv1 = nn.Conv1d(2, 16, 1600, stride=200)
        self.pool = nn.MaxPool1d(100)
        self.fc1 = nn.Linear(self.magic_num, 10000)
        self.fc2 = nn.Linear(10000, 100)
        self.fc3 = nn.Linear(100, 4)

    def forward(self, x):
        """Map a ``(batch, 2, length)`` tensor to a ``(batch, 4)`` tensor."""
        x = self.pool(F.relu(self.conv1(x)))
        # Flatten per sample. Reshaping with a fixed width and a free batch
        # axis could silently fold several samples together when the width is
        # wrong; this way a mismatch surfaces as a shape error in fc1.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
# Build the model with its default sizes. Note that fc1 alone is a
# (585 * 16) x 10000 weight matrix, so this allocation is large.
net = Net()

In [8]:
# Plain SGD over every parameter of `net`, with a fixed learning rate.
optimizer = optim.SGD(net.parameters(), lr=1e-3)
# Mean squared error between the network outputs and the label tensor.
criterion = nn.MSELoss()

In [None]:
# Full-batch training: every epoch is one forward/backward pass over the whole
# stacked tensor followed by a single optimizer step.
for epoch in range(10):
    # Drop the gradients accumulated by the previous step before computing
    # new ones.
    optimizer.zero_grad()

    outputs = net(tensor_data)
    #print(outputs, labels)
    loss = criterion(outputs, labels)
    print('loss', loss.item())

    loss.backward()
    optimizer.step()

loss 0.9282853007316589
loss 0.9236893057823181
loss 0.919090747833252
loss 0.914525032043457
loss 0.9099622368812561
loss 0.9054682850837708
loss 0.901046097278595
loss 0.8967908024787903
loss 0.8925991058349609
loss 0.8885936737060547


In [None]:
# util to find magic numbers
# Pushes one waveform through conv1 + pool and prints the flattened feature
# count, i.e. the in_features value fc1 needs for inputs of this length.
# data[0] is the padded and normalised waveform, because pad_audio/normalize
# assign their results back into the `data` list.
with torch.no_grad():  # shape probing only; no autograd graph is needed
    x = data[0].unsqueeze(0)
    x = net.pool(F.relu(net.conv1(x)))
    print(x.shape)
    # Flatten per sample instead of hard-coding the width: a literal here goes
    # stale as soon as the conv/pool settings or the audio length change.
    x = x.view(x.size(0), -1)
    print(x.shape)
    if x.size(1) == net.fc1.in_features:
        # The sizes agree, so the rest of the forward pass can be exercised.
        x = F.relu(net.fc1(x))
        x = F.relu(net.fc2(x))
        x = net.fc3(x)
        print(x)
    else:
        print("flattened size {} does not match fc1.in_features {}".format(
            x.size(1), net.fc1.in_features))

In [None]:
# Load one more clip (presumably held out for testing, going by the variable
# name); the load result is unpacked into the waveform and its sample rate.
test_audio = "data/Audio/Haikyuu!!_14.wav"
waveform, sample_rate = torchaudio.load(test_audio)
# The triple-quoted block below is a bare string expression used to disable
# the shape/sample-rate printout and the waveform plot; it is evaluated and
# discarded.
'''
print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

plt.figure()
plt.plot(waveform.t().numpy())
'''
# Ending on a statement keeps the cell from echoing the string above as its
# output value.
pass

In [None]:
# Spectrogram transform constructed with its default parameters and applied
# to the waveform loaded in the previous cell.
specgram = torchaudio.transforms.Spectrogram()(waveform)

In [None]:
print("Shape of spectrogram: {}".format(specgram.size()))

# Show the log2 spectrogram at index 0 of the first axis (presumably the first
# audio channel -- TODO confirm) as a grayscale image.
# NOTE(review): log2 maps exact zeros to -inf; confirm that is acceptable for
# the plot.
plt.figure()
plt.imshow(specgram[0].log2().numpy(), cmap='gray')

In [None]:
# Inspect raw values: index 0 of the first axis, everything along the second,
# and positions 1000-1009 along the last axis of the log2 spectrogram.
specgram.log2()[0,:,1000:1010].numpy()