In [1]:
import os

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataloader import KeyDataset
from torch.utils.data import DataLoader
from signal_process import signal_process
import warnings

In [3]:
# Restrict this process to GPU index 3. CUDA reads this variable when it is
# first initialised, so it has to be set before any CUDA call is made.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)


In [5]:
# Data paths
metadata_path = 'metadata.csv'
audio_dir = 'audio'

# Checkpoint location, e.g. ckpt/best_model.pth
ckpt_dir = 'ckpt'
best_saved_model = 'best_model.pth'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race, is a no-op when the directory is already there,
# and also works if ckpt_dir is later changed to a nested path.
os.makedirs(ckpt_dir, exist_ok=True)
restore_path = os.path.join(ckpt_dir, best_saved_model)

In [6]:
# Hyper-parameters
n_epochs, batch_size = 300, 16      # passes over the training set / samples per mini-batch
num_label = 24                      # number of target labels (the classifier has 24 outputs)
method = 'logmelspectrogram'        # feature type requested from signal_process()
sr = 22050                          # sampling rate handed to KeyDataset and signal_process()

# SGD settings (the same values the training cell gives to torch.optim.SGD)
learning_rate = 1e-3
momentum = 0.9
weight_decay = 1e-4

In [7]:
# TODO : Build your model here
class View(nn.Module):
    """Reshape layer that can be dropped into an ``nn.Sequential``.

    The first (batch) dimension of the input is kept; the remaining
    dimensions are viewed as ``shape``. ``View(-1)`` therefore flattens
    every sample into a vector.
    """

    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        # Keep dimension 0 as-is and view everything after it as self.shape.
        batch = x.size(0)
        target_shape = (batch,) + tuple(self.shape)
        return x.view(target_shape)
    
class Residual_Block(nn.Module):
    """Residual block in the pre-activation layout (BN -> ReLU -> conv, twice).

    The original note refers to figure 5 of the second paper for this layout.
    Every convolution is 3x3 with stride 1 and padding 1 and keeps ``n_ch``
    channels, so the block output has the same shape as its input and can be
    added to it directly.
    """

    def __init__(self, n_ch):
        super().__init__()
        stages = []
        # Two identical BN -> ReLU -> 3x3 conv stages, appended in order so the
        # Sequential indices (0..5) match the original layout.
        for _ in range(2):
            stages.extend([
                nn.BatchNorm2d(num_features=n_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels=n_ch, out_channels=n_ch, kernel_size=3,
                          stride=1, padding=1, bias=False),
            ])
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        # Skip connection: add the untouched input to the transformed branch.
        branch = self.layers(x)
        return x + branch

class ResNet(nn.Module):
    """Small ResNet-style classifier built from pre-activation residual blocks.

    Compared with the paper this network uses two residual blocks per stage
    (the paper uses three) and only two channel widths, 64 and 256.

    Args:
        num_classes: number of output logits. Defaults to 24, the value that
            was previously hard-coded.
        in_channels: number of channels of the input feature map. Defaults to
            1, the value that was previously hard-coded.

    ``forward`` takes a ``(batch, in_channels, H, W)`` tensor and returns
    ``(batch, num_classes)`` logits. The adaptive average pooling makes the
    network independent of the input's spatial size.
    """

    def __init__(self, num_classes=24, in_channels=1):
        super().__init__()

        # The layer order is kept exactly as before so the keys of
        # state_dict() ("layers.0.weight", ...) stay compatible with
        # checkpoints that were saved earlier.
        self.layers = nn.Sequential(
            # Stem: 7x7 conv, 64 filters, stride 2 as in the paper. The stride
            # halves each spatial dimension; padding = (kernel_size - 1) / 2
            # is the padding that would otherwise keep the size unchanged.
            nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=7, stride=2, padding=3),
            # Stride-2 max pooling halves the spatial size once more.
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            Residual_Block(n_ch=64),
            Residual_Block(n_ch=64),
            # A residual block ends with the raw sum x + branch, so that sum is
            # normalised here before the widening conv (not done in the paper).
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=3, padding=1),
            Residual_Block(n_ch=256),
            Residual_Block(n_ch=256),
            # (1, 1) is the requested *output* size, not a kernel size:
            # the result is batch x 256 x 1 x 1.
            nn.AdaptiveAvgPool2d((1, 1)),
            # batch x 256 x 1 x 1 is still a 4-D tensor; flatten it to
            # batch x 256 so the linear layer can consume it.
            View(-1),
            nn.Linear(in_features=256, out_features=num_classes),
        )

    def forward(self, x):
        return self.layers(x)
 

In [8]:
is_test_mode = False


def _run_epoch(model, loader, criterion, optimizer=None, log_every=100):
    """Run one pass over ``loader``; return (mean batch loss, number of correct predictions).

    With an ``optimizer`` the model is put in training mode and updated after
    every batch, and the batch index is printed every ``log_every`` batches.
    Without one the model is put in eval mode and gradients are disabled, so
    validation does not build autograd graphs.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    n_correct = 0
    with torch.set_grad_enabled(training):
        for idx, (features, labels) in enumerate(loader):
            features = signal_process(features, sr=sr, method=method).to(device)
            features = features.unsqueeze(1)  # add the channel axis expected by Conv2d
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()

            output = model(features)
            loss = criterion(output, labels)

            if training:
                loss.backward()
                optimizer.step()
                if idx % log_every == 0:
                    print(idx)

            total_loss += loss.item()
            # .item() keeps the running count a plain Python int rather than a device tensor.
            n_correct += (output.argmax(dim=-1) == labels).sum().item()

    # max(..., 1) avoids a ZeroDivisionError for an empty loader.
    return total_loss / max(len(loader), 1), n_correct


if not is_test_mode:

    # Load Dataset and Dataloader
    train_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='training')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    valid_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='validation')
    # Shuffling has no effect on the validation metrics, so keep the order fixed.
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    model = ResNet().to(device)
    criterion = nn.CrossEntropyLoss()
    # Use the hyper-parameters from the configuration cell instead of repeating the literals.
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                momentum=momentum, weight_decay=weight_decay)
    best_accuracy = 0.0

    # The best weights go to restore_path (inside ckpt_dir), which is the file
    # the restore code reads. Periodic snapshots go to their own file so they
    # can never overwrite the best checkpoint.
    os.makedirs(ckpt_dir, exist_ok=True)
    snapshot_path = os.path.join(ckpt_dir, 'snapshot_model.pth')

    # (validation-accuracy threshold, learning rate), highest threshold first so
    # that the 0.83 step is reachable (it was previously shadowed by the 0.73 test).
    lr_schedule = ((0.83, 1e-5), (0.73, 1e-4))

    # Training and Validation
    for epoch in range(n_epochs):

        train_loss, train_correct = _run_epoch(model, train_loader, criterion, optimizer)
        print("==== Epoch: %d, Train Loss: %.2f, Train Accuracy: %.3f" % (
            epoch, train_loss, train_correct / len(train_dataset)))

        valid_loss, valid_correct = _run_epoch(model, valid_loader, criterion)
        valid_accuracy = valid_correct / len(valid_dataset)
        print("==== Epoch: %d, Valid Loss: %.2f, Valid Accuracy: %.3f" % (
            epoch, valid_loss, valid_accuracy))

        # Step the learning rate down once an accuracy threshold is passed.
        # min() keeps the decay one-way: a later dip in accuracy never raises it again.
        for threshold, stepped_lr in lr_schedule:
            if valid_accuracy > threshold:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = min(param_group['lr'], stepped_lr)
                break

        if valid_accuracy > best_accuracy:
            best_accuracy = valid_accuracy
            torch.save(model.state_dict(), restore_path)  # official recommended
            print('updated')
        elif epoch % 5 == 0:
            torch.save(model.state_dict(), snapshot_path)  # official recommended
            print('snapshot saved')
            

# elif is_test_mode:

#     # Load Dataset and Dataloader
#     test_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='test')
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

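#     # Restore model: the checkpoint holds a state_dict (see torch.save above),
#     # so rebuild the network first and then load the weights into it
#     model = ResNet().to(device)
#     model.load_state_dict(torch.load(restore_path, map_location=device))
#     model.eval()
#     print('==== Model restored : %s' % restore_path)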

#     # TODO: IMPORTANT!! MUST CALCULATE ACCURACY ! You may change this part, but must print accuracy in the right manner

#     test_correct = 0

#     for features, labels in test_loader:
#         features = signal_process(features, sr=sr, method=method).to(device)
#         features = features.unsqueeze(1)  # add the channel axis, as in the training/validation loops
#         labels = labels.to(device)

#         output = model(features)

#         preds = output.argmax(dim=-1, keepdim=True)
#         test_correct += (preds.squeeze() == labels).float().sum()

#     print("=== Test accuracy: %.3f" % (test_correct / len(test_dataset)))


0
100
200
==== Epoch: 0, Train Loss: 3.10, Train Accuracy: 0.087
==== Epoch: 0, Valid Loss: 3.09, Valid Accuracy: 0.105
updated
0
100
200
==== Epoch: 1, Train Loss: 3.04, Train Accuracy: 0.096
==== Epoch: 1, Valid Loss: 3.06, Valid Accuracy: 0.096
0
100
200
==== Epoch: 2, Train Loss: 2.95, Train Accuracy: 0.125
==== Epoch: 2, Valid Loss: 2.92, Valid Accuracy: 0.133
updated
0
100
200
==== Epoch: 3, Train Loss: 2.78, Train Accuracy: 0.167
==== Epoch: 3, Valid Loss: 4.18, Valid Accuracy: 0.051
0
100
200
==== Epoch: 4, Train Loss: 2.59, Train Accuracy: 0.220
==== Epoch: 4, Valid Loss: 2.85, Valid Accuracy: 0.194
updated
0
100
200
==== Epoch: 5, Train Loss: 2.44, Train Accuracy: 0.263
==== Epoch: 5, Valid Loss: 2.50, Valid Accuracy: 0.223
updated
0
100
200
==== Epoch: 6, Train Loss: 2.29, Train Accuracy: 0.308
==== Epoch: 6, Valid Loss: 2.42, Valid Accuracy: 0.240
updated
0
100
200
==== Epoch: 7, Train Loss: 2.15, Train Accuracy: 0.346
==== Epoch: 7, Valid Loss: 2.60, Valid Accuracy: 0.283


KeyboardInterrupt: 