In [1]:
import os

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataloader import KeyDataset
from torch.utils.data import DataLoader
from signal_process import signal_process
import warnings

In [3]:
# Pin this process to GPU 3 unless the caller (shell, scheduler, ...) has
# already chosen which devices are visible. An unconditional assignment would
# override that choice. This must run before the first CUDA call in the process.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

In [4]:
# Run on the GPU when one is visible to this process, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Data paths
metadata_path = '../metadata.csv'
audio_dir = '../audio'

# Checkpoint location, e.g. ckpt/model.pth
ckpt_dir = 'ckpt'
best_saved_model = 'ResNet_LrDecay.pth'
# makedirs(exist_ok=True) is a no-op when the directory already exists and
# also creates missing parent directories, so there is no check-then-create
# race and ckpt_dir may be a nested path.
os.makedirs(ckpt_dir, exist_ok=True)
restore_path = os.path.join(ckpt_dir, best_saved_model)

In [6]:
# Hyper-parameters
n_epochs = 300                  # number of passes over the training split
batch_size = 16                 # samples per DataLoader batch
num_label = 24                  # number of target classes (the model emits 24 logits)
method = 'logmelspectrogram'    # feature type requested from signal_process
sr = 22_050                     # sampling rate passed to KeyDataset / signal_process

# Optimiser settings
learning_rate = 0.001
momentum = 0.9
weight_decay = 0.0001

In [7]:
# TODO : Build your model here
class AllConv(nn.Module):
    """All-convolutional classifier for single-channel 2-D inputs.

    The network stacks eight unpadded convolutions (each preceded by batch
    normalisation and followed by ELU), three max-pooling stages with dropout,
    a 1x1 convolution that maps to ``num_classes`` channels, and a global
    average pool that collapses the remaining spatial extent.

    Because the convolutions are unpadded, each spatial dimension of the input
    must be at least 56 so that the feature map is still non-empty before the
    final 1x1 convolution.

    Args:
        num_classes: number of output channels / logits (default 24).
        dropout_p: drop probability of the shared dropout layer (default 0.2).

    Input:  tensor of shape ``(batch, 1, H, W)``.
    Output: tensor of shape ``(batch, num_classes)``.
    """

    def __init__(self, num_classes=24, dropout_p=0.2):
        super(AllConv, self).__init__()
        # Attribute names and registration order are kept stable so that
        # previously saved state_dicts keep loading.
        self.batch_norm1 = nn.BatchNorm2d(1)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, stride=1)
        self.batch_norm2 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=3, stride=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
        self.batch_norm3 = nn.BatchNorm2d(20)
        self.conv3 = nn.Conv2d(in_channels=20, out_channels=40, kernel_size=3, stride=1)
        self.batch_norm4 = nn.BatchNorm2d(40)
        self.conv4 = nn.Conv2d(in_channels=40, out_channels=40, kernel_size=3, stride=1)
        self.batch_norm5 = nn.BatchNorm2d(40)

        self.conv5 = nn.Conv2d(in_channels=40, out_channels=80, kernel_size=3, stride=1)
        self.batch_norm6 = nn.BatchNorm2d(80)
        self.conv6 = nn.Conv2d(in_channels=80, out_channels=80, kernel_size=3, stride=1)
        self.batch_norm7 = nn.BatchNorm2d(80)
        self.conv7 = nn.Conv2d(in_channels=80, out_channels=160, kernel_size=3, stride=1)
        self.batch_norm8 = nn.BatchNorm2d(160)
        self.conv8 = nn.Conv2d(in_channels=160, out_channels=160, kernel_size=3, stride=1)
        self.batch_norm9 = nn.BatchNorm2d(160)
        # 1x1 convolution acts as the per-location classifier.
        self.conv9 = nn.Conv2d(in_channels=160, out_channels=num_classes, kernel_size=1, stride=1)
        self.dropout = nn.Dropout(p=dropout_p)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return per-class scores of shape ``(batch, num_classes)`` for ``x``."""
        # Stage 1: 1 -> 20 channels
        x = self.batch_norm1(x)
        x = F.elu(self.conv1(x))
        x = self.batch_norm2(x)
        x = F.elu(self.conv2(x))
        x = self.pool(x)
        x = self.dropout(x)

        # Stage 2: 20 -> 40 channels
        x = self.batch_norm3(x)
        x = F.elu(self.conv3(x))
        x = self.batch_norm4(x)
        x = F.elu(self.conv4(x))
        x = self.pool(x)
        x = self.dropout(x)

        # Stage 3: 40 -> 80 channels
        x = self.batch_norm5(x)
        x = F.elu(self.conv5(x))
        x = self.batch_norm6(x)
        x = F.elu(self.conv6(x))
        x = self.pool(x)
        x = self.dropout(x)

        # Stage 4: 80 -> 160 channels, no further pooling
        x = self.batch_norm7(x)
        x = F.elu(self.conv7(x))
        x = self.dropout(x)

        x = self.batch_norm8(x)
        x = F.elu(self.conv8(x))
        x = self.dropout(x)

        # Classifier head.
        # NOTE(review): ELU is applied to the class scores, which bounds them
        # below at -1; kept as in the original design - confirm it is intended.
        x = self.batch_norm9(x)
        x = F.elu(self.conv9(x))
        x = self.avgpool(x)

        # Flatten everything except the batch axis. A bare squeeze() would
        # also drop the batch axis for a single-sample batch and hand a 1-D
        # tensor to the loss.
        x = x.flatten(1)

        return x

In [8]:
is_test_mode = False


def run_epoch(model, loader, criterion, optimizer=None, log_every=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and its
    parameters are updated after every batch; otherwise the model is put in
    evaluation mode and autograd is disabled for the whole pass.

    Args:
        model: network called on the processed feature batch.
        loader: DataLoader yielding ``(features, labels)`` pairs.
        criterion: loss called as ``criterion(output, labels)``.
        optimizer: optimiser to step, or ``None`` for an evaluation pass.
        log_every: if set, print the batch index every ``log_every`` batches.

    Returns:
        Tuple of the loss averaged over batches and the fraction of correctly
        classified samples of ``loader.dataset``.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    total_correct = 0

    # Gradients are only needed when the optimiser is going to be stepped.
    with torch.set_grad_enabled(training):
        for batch_idx, (features, labels) in enumerate(loader):
            features = signal_process(features, sr=sr, method=method).to(device)
            # Insert the channel axis that the model's first BatchNorm2d(1) /
            # Conv2d(in_channels=1) layers require.
            features = features.unsqueeze(1)
            labels = labels.to(device)

            output = model(features)
            # Keep the scores 2-D even if the model collapsed the batch axis
            # for a single-sample batch.
            # Assumes labels is 1-D with one class index per sample - TODO confirm.
            output = output.reshape(labels.size(0), -1)
            loss = criterion(output, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() moves the numbers to Python scalars so the running
            # totals do not stay on the device.
            total_loss += loss.item()
            total_correct += (output.argmax(dim=-1) == labels).sum().item()

            if log_every and batch_idx % log_every == 0:
                print(batch_idx)

    # max(..., 1) avoids a ZeroDivisionError on an empty split.
    mean_loss = total_loss / max(len(loader), 1)
    accuracy = total_correct / max(len(loader.dataset), 1)
    return mean_loss, accuracy


criterion = nn.CrossEntropyLoss()

if not is_test_mode:

    # Load Dataset and Dataloader
    train_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='training')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    valid_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='validation')
    # Evaluation does not benefit from shuffling; a fixed order keeps the
    # per-batch mean loss deterministic.
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    model = AllConv().to(device)
    # Use the values declared in the hyper-parameter cell instead of repeating
    # them as literals here.
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                momentum=momentum, weight_decay=weight_decay)
    best_accuracy = 0.0

    # Training and Validation
    for epoch in range(n_epochs):
        train_loss, train_accuracy = run_epoch(
            model, train_loader, criterion, optimizer=optimizer, log_every=100)
        print("==== Epoch: %d, Train Loss: %.2f, Train Accuracy: %.3f" % (
            epoch, train_loss, train_accuracy))

        valid_loss, valid_accuracy = run_epoch(model, valid_loader, criterion)
        print("==== Epoch: %d, Valid Loss: %.2f, Valid Accuracy: %.3f" % (
            epoch, valid_loss, valid_accuracy))

        if valid_accuracy > best_accuracy:
            best_accuracy = valid_accuracy
            # Save to the same path that test mode restores from.
            torch.save(model.state_dict(), restore_path)
            print('updated')

else:

    # Load Dataset and Dataloader
    test_dataset = KeyDataset(metadata_path=metadata_path, audio_dir=audio_dir, sr=sr, split='test')
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Restore model: the checkpoint holds a state_dict, so rebuild the network
    # and load the weights into it.
    model = AllConv().to(device)
    model.load_state_dict(torch.load(restore_path, map_location=device))
    print('==== Model restored : %s' % restore_path)

    _, test_accuracy = run_epoch(model, test_loader, criterion)

    print("=== Test accuracy: %.3f" % test_accuracy)


0
100
200
==== Epoch: 0, Train Loss: 3.16, Train Accuracy: 0.070
==== Epoch: 0, Valid Loss: 3.12, Valid Accuracy: 0.081
updated
0
100
200
==== Epoch: 1, Train Loss: 3.09, Train Accuracy: 0.092
==== Epoch: 1, Valid Loss: 3.14, Valid Accuracy: 0.095
updated
0
100
200
==== Epoch: 2, Train Loss: 3.06, Train Accuracy: 0.097
==== Epoch: 2, Valid Loss: 3.11, Valid Accuracy: 0.094
0
100
200
==== Epoch: 3, Train Loss: 3.04, Train Accuracy: 0.101
==== Epoch: 3, Valid Loss: 3.07, Valid Accuracy: 0.103
updated
0
100
200
==== Epoch: 4, Train Loss: 3.01, Train Accuracy: 0.118
==== Epoch: 4, Valid Loss: 3.02, Valid Accuracy: 0.099
0
100
200
==== Epoch: 5, Train Loss: 2.94, Train Accuracy: 0.142
==== Epoch: 5, Valid Loss: 2.89, Valid Accuracy: 0.143
updated
0
100
200
==== Epoch: 6, Train Loss: 2.83, Train Accuracy: 0.195
==== Epoch: 6, Valid Loss: 2.72, Valid Accuracy: 0.207
updated
0
100
200
==== Epoch: 7, Train Loss: 2.69, Train Accuracy: 0.245
==== Epoch: 7, Valid Loss: 2.57, Valid Accuracy: 0.261


KeyboardInterrupt: 

In [None]:
# resnet_lrdecay  (scratch note left in an unexecuted cell; presumably an experiment tag - confirm or delete)