In [1]:
import librosa
import librosa.display
import torch
import sys
import glob
import time
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch import optim
from pytorch_model_summary import summary

sys.path.insert(1, '../utils')
sys.path.insert(1, '../models')

from data import StutterData, load_data
from audioCNN import AudioCNN


In [2]:
# Instantiate the network; this is the instance the later cells optimise and evaluate.
model = AudioCNN()

In [3]:
# Print a per-layer table (input shapes and parameter counts) for a freshly
# constructed AudioCNN, traced with an all-zero input of shape (1, 1, 20, 50).
print(
    summary(
        AudioCNN(),
        torch.zeros(1, 1, 20, 50),
        show_input=True,
    )
)

---------------------------------------------------------------------------
          Layer (type)         Input Shape         Param #     Tr. Param #
   AdaptiveAvgPool2d-1      [1, 1, 20, 50]               0               0
              Conv2d-2      [1, 1, 32, 32]              80              80
   AdaptiveAvgPool2d-3      [1, 8, 30, 30]               0               0
             Dropout-4      [1, 8, 16, 16]               0               0
              Conv2d-5      [1, 8, 16, 16]           1,168           1,168
   AdaptiveAvgPool2d-6     [1, 16, 14, 14]               0               0
              Linear-7           [1, 1024]         524,800         524,800
              Linear-8            [1, 512]         131,328         131,328
              Linear-9            [1, 256]             257             257
Total params: 657,633
Trainable params: 657,633
Non-trainable params: 0
---------------------------------------------------------------------------


In [4]:
# Smoke-test the forward pass on an all-zero (1, 1, 20, 50) input.
# The module is called directly instead of via .forward() so the call goes
# through nn.Module.__call__, which also runs any registered hooks.
model(torch.zeros((1, 1, 20, 50)))

tensor([[-0.0007]], grad_fn=<AddmmBackward>)

In [5]:
# --- Training configuration -------------------------------------------------
lr = 0.001
batch_size = 1
epochs = 25
validation_split = 0.2
shuffle_dataset = True
random_seed = 42

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG so stochastic parts of training (e.g. dropout) repeat
# across runs. NOTE: the model weights were initialised in an earlier cell,
# before this seed is set, so weight initialisation itself is not covered.
torch.manual_seed(random_seed)

# Move the model to the target device *before* constructing the optimizer, as
# recommended by the torch.optim documentation, so the optimizer is built over
# parameters that already live where training will happen.
model.to(device)

# Alternative SGD + step-decay schedule, kept for reference:
# optimizer = optim.SGD(model.parameters(), lr = 0.00001, momentum=0.9, weight_decay=5e-4)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150, 200], gamma=0.1)

# Binary classification on a single raw logit: BCEWithLogitsLoss applies the
# sigmoid internally, so the model output is passed to it unactivated.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [6]:
# Build the dataset from the '../data/*' pattern and split it into training and
# validation loaders. The split/shuffle/seed settings come from the
# configuration variables instead of repeated literals, so editing the
# configuration cell actually changes the split.
dataset = StutterData('../data/*')
train_loader, validation_loader = load_data(
    dataset,
    batch_size,
    validation_split=validation_split,
    shuffle_dataset=shuffle_dataset,
    random_seed=random_seed,
)

In [7]:
# Move the model's parameters to the selected device. As the last expression in
# the cell, the returned module is echoed, which prints the layer listing below.
model.to(device)

AudioCNN(
  (maxpool): AdaptiveAvgPool2d(output_size=32)
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
  (maxpool1): AdaptiveAvgPool2d(output_size=16)
  (dropout): Dropout(p=0.2, inplace=False)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1))
  (maxpool2): AdaptiveAvgPool2d(output_size=8)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)

In [8]:
# Train for `epochs` passes over train_loader. Every 100 batches the running
# mean loss and elapsed time for the epoch are printed; after each epoch the
# accuracy on validation_loader is reported.
for epoch in range(epochs):
    # Switch to training mode at the *start* of every epoch, so an epoch that
    # was interrupted during validation cannot leave the model in eval mode.
    model.train()
    losses = []
    start = time.time()

    for b_idx, batch in enumerate(train_loader):
        inputs, targets = batch['mfcc'].to(device), batch['label'].to(device)

        optimizer.zero_grad()

        # The model emits one logit per sample with shape (batch, 1); flatten
        # it so it has the same shape as `targets`, as BCEWithLogitsLoss needs.
        logits = model(inputs).view(-1)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        if b_idx % 100 == 0:
            elapsed = time.time() - start
            print('Batch Index : %d Loss : %.10f Time : %.3f seconds ' % (b_idx, np.mean(losses), elapsed))

    # scheduler.step()  # re-enable together with the MultiStepLR scheduler

    # ---- Validation --------------------------------------------------------
    model.eval()  # disables dropout while measuring accuracy
    total = 0
    correct = 0

    with torch.no_grad():
        for batch in validation_loader:
            inputs, targets = batch['mfcc'].to(device), batch['label'].to(device)

            # Flatten exactly as in training. Comparing (batch, 1) predictions
            # with (batch,) targets would broadcast to (batch, batch) and
            # over-count matches whenever batch_size > 1.
            probabilities = torch.sigmoid(model(inputs).view(-1))
            predicted = torch.round(probabilities)  # threshold at 0.5

            total += targets.size(0)
            # .item() keeps `correct` a plain Python int instead of a tensor.
            correct += predicted.eq(targets).sum().item()

    # Guard against an empty validation loader instead of dividing by zero.
    accuracy = 100. * correct / total if total else float('nan')
    print('Epoch : %d Test Acc : %.3f' % (epoch, accuracy))
    print('--------------------------------------------------------------')

Batch Index : 0 Loss : 2.0106799901 Time : 0.039 seconds 
Batch Index : 100 Loss : 2.0030731291 Time : 1.694 seconds 




Batch Index : 200 Loss : 1.3438789916 Time : 3.298 seconds 




Batch Index : 300 Loss : 1.1225144125 Time : 4.810 seconds 




Batch Index : 400 Loss : 1.0062614300 Time : 6.274 seconds 




Batch Index : 500 Loss : 0.9304817275 Time : 7.809 seconds 




Batch Index : 600 Loss : 0.8717307593 Time : 9.339 seconds 


KeyboardInterrupt: 