In [1]:
import torch
import math
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torchvision
from torchvision import datasets, transforms
import torchvision.models as models
import torch.nn.functional as F
import time
import os
import random
import torch.backends.cudnn as cudnn
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-pdjn7dsd because the default path (/home/khc/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Reproducibility: choose the compute device and seed every RNG this notebook uses.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)
seed = 777
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# `device` is a torch.device object; comparing it with the plain string 'cuda'
# is never true, so test its `.type` to make sure the CUDA settings are applied.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(seed)  # seeds every visible GPU, including the single-GPU case
    #torch.backends.cudnn.enabled = False
    # Ask cuDNN for deterministic kernels and disable its autotuner, whose
    # per-run algorithm selection can vary between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device: cuda


In [3]:
# ---- Run configuration -------------------------------------------------------
os.environ["CUDA_VISIBLE_DEVICES"] = '0'                # GPU Number
# NOTE(review): the recorded run below still reports 4 GPUs, so this assignment
# apparently comes too late to restrict visibility -- it likely has to precede
# the first CUDA query. Confirm before relying on it.
#start_time = time.time()
batch_size = 128
learning_rate = 0.004
root_dir = 'drive/app/cifar10/'
default_directory = 'drive/app/torch/save_models'

# Per-channel (R, G, B) statistics used to normalize both splits.
cifar_mean = (0.4914, 0.4824, 0.4467)
cifar_std = (0.2471, 0.2436, 0.2616)

# ---- Transforms --------------------------------------------------------------
# Training: upscale, random 224x224 crop (with 16 px padding), random horizontal
# flip, then convert to a tensor and normalize.
transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224, padding=16),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=cifar_mean, std=cifar_std),
])

# Evaluation: deterministic resize only, followed by the same normalization.
transform_test = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=cifar_mean, std=cifar_std),
])

# ---- Datasets ----------------------------------------------------------------
# Only the training split requests a download.
train_dataset = datasets.CIFAR10(root=root_dir, train=True,
                                 transform=transform_train, download=True)
test_dataset = datasets.CIFAR10(root=root_dir, train=False,
                                transform=transform_test)

# ---- Loaders -----------------------------------------------------------------
# Both loaders share batch size and worker count; only training is shuffled.
loader_options = dict(batch_size=batch_size, num_workers=4)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True,
                                           **loader_options)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False,
                                          **loader_options)

Files already downloaded and verified


In [4]:
# Pretrained ResNeXt-50 (32x4d) fine-tuned with Nesterov SGD and cross-entropy.
# NOTE(review): the classifier head is not replaced in this cell, so the network
# keeps its original output width -- confirm that is intended for CIFAR-10.
model = models.resnext50_32x4d(pretrained=True)
optimizer = optim.SGD(model.parameters(), learning_rate,
                                momentum=0.9,
                                weight_decay=1e-4,
                                nesterov=True)
criterion = nn.CrossEntropyLoss()

if torch.cuda.device_count() > 0:
    print("USE", torch.cuda.device_count(), "GPUs!")
    # DataParallel splits each batch across all visible GPUs; note that it
    # prefixes state_dict keys with "module.".
    model = nn.DataParallel(model).cuda()
    # cudnn.benchmark is deliberately NOT switched on here: the reproducibility
    # cell sets it to False, and enabling the autotuner would undo that choice.
else:
    print("USE ONLY CPU!")

USE 4 GPUs!


In [None]:
# Histories appended to once per batch: running mean training loss (train)
# and running test accuracy (test).
loss_l, acc_l = [], []
def train(epoch):
    """Run one training epoch over `train_loader`, updating `model` in place.

    Relies on the notebook-level globals `model`, `train_loader`, `optimizer`,
    `criterion` and `loss_l`. After every batch the running mean loss of the
    epoch so far is appended to `loss_l`; every 100th batch a progress line
    with running loss and accuracy is printed.

    Args:
        epoch: epoch number, used only in the progress output.
    """
    model.train()
    use_cuda = torch.cuda.is_available()  # loop-invariant, so query it once
    train_loss = 0.0
    total = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Plain tensors already take part in autograd; the deprecated
        # Variable wrapper added nothing and is omitted.
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(output.detach(), 1)

        total += target.size(0)
        # .item() keeps the counter a Python int, so the accuracy below is
        # ordinary float division rather than tensor arithmetic.
        correct += predicted.eq(target).sum().item()
        loss_ = train_loss / (batch_idx + 1)
        loss_l.append(loss_)
        if batch_idx % 100 == 0:
            print('Epoch: {} | Batch_idx: {} |  Loss: ({:.4f}) | Acc: ({:.2f}%) ({}/{})'
                  .format(epoch, batch_idx, loss_, 100. * correct / total, correct, total))

def test():
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(test_loader):
        if torch.cuda.is_available():
            data, target = Variable(data.cuda()), Variable(target.cuda())
        else:
            data, target = Variable(data), Variable(target)

        outputs = model(data)
        loss = criterion(outputs, target)

        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += target.size(0)
        correct += predicted.eq(target.data).cpu().sum()
        acc_ = correct / total
        acc_l.append(acc_)
    print('# TEST : Loss: ({:.4f}) | Acc: ({:.2f}%) ({}/{})'
          .format(test_loss / (batch_idx + 1), 100. * acc_, correct, total))

def save_checkpoint(directory, state, filename=str(learning_rate)+'latest.tar.gz'):
    """Write `state` to `directory/filename` with torch.save.

    Args:
        directory: target folder; created (including parents) when missing.
        state: object handed to torch.save.
        filename: file name inside `directory`. The default is built once, at
            definition time, from the current `learning_rate`. Despite the
            suffix, the file is whatever torch.save produces.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    model_filename = os.path.join(directory, filename)
    # Save to a sibling temp file and rename it into place, so an interrupted
    # save cannot leave a truncated checkpoint where the resume logic looks.
    tmp_filename = model_filename + '.tmp'
    torch.save(state, tmp_filename)
    os.replace(tmp_filename, model_filename)
    print("=> saving checkpoint")

def load_checkpoint(directory, filename=str(learning_rate)+'latest.tar.gz'):
    """Return the object stored at `directory/filename`, or None if it is absent.

    Args:
        directory: folder that holds the checkpoint.
        filename: file name inside `directory`. The default is built once, at
            definition time, from the current `learning_rate`.

    Returns:
        Whatever torch.load yields for the file, or None when no such file
        exists. torch.load unpickles the file, so only load trusted checkpoints.
    """
    checkpoint_path = os.path.join(directory, filename)
    if not os.path.exists(checkpoint_path):
        return None

    print("=> loading checkpoint")
    return torch.load(checkpoint_path)

start_epoch = 0

# Resume from the saved checkpoint, if one exists.
checkpoint = load_checkpoint(default_directory)
if checkpoint:
    start_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

# Start the clock here: without an active assignment of start_time the final
# timing report raises NameError after all epochs have finished.
start_time = time.time()

for epoch in range(start_epoch, 80):

    # Step schedule: base rate up to epoch 49, x0.1 up to epoch 64, x0.01 afterwards.
    if epoch < 50:
        lr = learning_rate
    elif epoch < 65:
        lr = learning_rate * 0.1
    else:
        lr = learning_rate * 0.01
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    train(epoch)
    # The checkpoint is written before evaluation. It stores the whole model
    # object next to its state_dict and the optimizer state.
    save_checkpoint(default_directory, {
        'epoch': epoch,
        'model': model,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    })
    test()

# divmod keeps the hour count correct past 24 h, where time.gmtime().tm_hour wraps.
elapsed = int(time.time() - start_time)
hours, remainder = divmod(elapsed, 3600)
mins, secs = divmod(remainder, 60)
print('{} hours {} mins {} secs for training'.format(hours, mins, secs))

Epoch: 0 | Batch_idx: 0 |  Loss: (9.1844) | Acc: (0.00%) (0/128)
Epoch: 0 | Batch_idx: 100 |  Loss: (1.0313) | Acc: (74.31%) (9607/12928)
Epoch: 0 | Batch_idx: 200 |  Loss: (0.6773) | Acc: (81.62%) (20999/25728)
Epoch: 0 | Batch_idx: 300 |  Loss: (0.5334) | Acc: (85.01%) (32753/38528)
=> saving checkpoint
# TEST : Loss: (0.1582) | Acc: (94.62%) (9462/10000)
Epoch: 1 | Batch_idx: 0 |  Loss: (0.1223) | Acc: (96.09%) (123/128)
Epoch: 1 | Batch_idx: 100 |  Loss: (0.1476) | Acc: (94.87%) (12265/12928)
Epoch: 1 | Batch_idx: 200 |  Loss: (0.1460) | Acc: (94.99%) (24439/25728)
Epoch: 1 | Batch_idx: 300 |  Loss: (0.1469) | Acc: (94.95%) (36583/38528)
=> saving checkpoint
# TEST : Loss: (0.1259) | Acc: (95.49%) (9549/10000)
Epoch: 2 | Batch_idx: 0 |  Loss: (0.0974) | Acc: (95.31%) (122/128)
Epoch: 2 | Batch_idx: 100 |  Loss: (0.0846) | Acc: (97.01%) (12541/12928)
Epoch: 2 | Batch_idx: 200 |  Loss: (0.0907) | Acc: (96.78%) (24899/25728)
Epoch: 2 | Batch_idx: 300 |  Loss: (0.0892) | Acc: (96.92%) 