In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import time
import os
import argparse
import utils
import torchvision
from torchvision import datasets, models, transforms
from glob import glob
import apex.amp as amp

  from ._conv import register_converters as _register_converters


In [2]:
data_dir = "data/"

In [5]:
model_conv = torchvision.models.resnet101(pretrained="imagenet").cuda()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=F

In [4]:
freeze_layers = True
# n_class = 17143
n_class = len(glob(data_dir + "train/*"))
# Stage-1 Freezing all the layers 
if freeze_layers:
    for i, param in model_conv.named_parameters():
        param.requires_grad = False

# Since imagenet as 1000 classes , We need to change our last layer according to the number of classes we have,
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, n_class).cuda()

input_shape = 512
batch_size = 100
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
scale = 360
use_parallel = False
use_gpu = True
epochs = 100

data_transforms = {
        'train': transforms.Compose([
        transforms.Resize(scale),
        transforms.RandomResizedCrop(input_shape),
        transforms.RandomHorizontalFlip(),
#         transforms.RandomVerticalFlip(),
        transforms.RandomRotation(degrees=90),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)]),
        'val': transforms.Compose([
        transforms.Resize(scale),
        transforms.CenterCrop(input_shape),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)]),
        }

In [5]:
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                      data_transforms[x]) for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                         shuffle=True, num_workers=4) for x in ['train', 'val']}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

In [6]:
if use_parallel:
    print("[Using all the available GPUs]")
    model_conv = nn.DataParallel(model_conv, device_ids=[0, 1])

print("[Using CrossEntropyLoss...]")
criterion = nn.CrossEntropyLoss()

print("[Using small learning rate with momentum...]")
optimizer_conv = optim.SGD(list(filter(lambda p: p.requires_grad, model_conv.parameters())), lr=0.001, momentum=0.9)

model_conv, optimizer_conv = amp.initialize(model_conv, optimizer_conv, opt_level="O1")
print("[Creating Learning rate scheduler...]")
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

[Using CrossEntropyLoss...]
[Using small learning rate with momentum...]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
[Creating Learning rate scheduler...]


In [7]:
print("[Training the model begun ....]")
model_ft = utils.train_model(model_conv, dataloaders, dataset_sizes, criterion, optimizer_conv, exp_lr_scheduler, use_gpu,
                     num_epochs=epochs)


[Training the model begun ....]
MIXUP
randing over epochs
Epoch 0/99
----------


 47%|████▋     | 8002/16911 [3:34:51<2:46:17,  1.12s/it] 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 59%|█████▉    | 10006/16911 [4:25:55<2:33:55,  1.34s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 71%|███████   | 12008/16911 [5:16:41<2:28:03,  1.81s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 83%|████████▎ | 14013/16911 [6:07:29<1:03:43,  1.32s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 95%|█████████▍| 16015/16911 [6:58:34<17:42,  1.19s/it]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 16911/16911 [7:21:29<00:00,  1.45it/s]
  0%|          | 0/1879 [00:00<?, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0
train Loss: 0.0943 Acc: 0.0000


100%|██████████| 1879/1879 [49:11<00:00,  1.59s/it] 


val Loss: 0.0935 Acc: 0.0000

Epoch 1/99
----------


 24%|██▎       | 4002/16911 [1:11:14<4:08:12,  1.15s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 35%|███▌      | 6003/16911 [1:47:18<3:25:13,  1.13s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 47%|████▋     | 8008/16911 [2:23:17<2:40:04,  1.08s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 59%|█████▉    | 10021/16911 [2:59:08<2:21:26,  1.23s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 71%|███████   | 12025/16911 [3:34:34<1:21:23,  1.00it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 83%|████████▎ | 14035/16911 [4:10:10<50:13,  1.05s/it]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 95%|█████████▍| 16051/16911 [4:46:07<17:46,  1.24s/it]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 16911/16911 [5:01:36<00:00,  1.40it/s]
  0%|          | 0/1879 [00:00<?, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0
train Loss: 0.0919 Acc: 0.0000


100%|██████████| 1879/1879 [33:14<00:00,  1.11it/s]


val Loss: 0.0922 Acc: 0.0000

Epoch 2/99
----------


 24%|██▎       | 4008/16911 [1:11:53<4:26:54,  1.24s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 36%|███▌      | 6011/16911 [1:47:41<3:48:38,  1.26s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 47%|████▋     | 8015/16911 [2:24:16<2:36:35,  1.06s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 59%|█████▉    | 10046/16911 [3:01:07<2:17:44,  1.20s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 71%|███████   | 12047/16911 [3:37:34<1:24:20,  1.04s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 83%|████████▎ | 14063/16911 [4:14:35<1:00:18,  1.27s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 95%|█████████▌| 16067/16911 [4:51:23<17:48,  1.27s/it]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 16911/16911 [5:06:53<00:00,  1.39it/s]
  0%|          | 0/1879 [00:00<?, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0
train Loss: 0.0902 Acc: 0.0000


100%|██████████| 1879/1879 [34:37<00:00,  1.12it/s]


val Loss: 0.0912 Acc: 0.0000

Epoch 3/99
----------


 13%|█▎        | 2195/16911 [40:33<4:19:24,  1.06s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 37%|███▋      | 6201/16911 [1:55:41<3:28:35,  1.17s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 49%|████▊     | 8205/16911 [2:33:26<2:29:51,  1.03s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 60%|██████    | 10211/16911 [3:11:12<2:03:48,  1.11s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 72%|███████▏  | 12212/16911 [3:48:57<1:22:34,  1.05s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 84%|████████▍ | 14213/16911 [4:30:51<1:05:47,  1.46s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 96%|█████████▌| 16220/16911 [5:33:57<27:52,  2.42s/it]  

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 16911/16911 [6:05:34<00:00,  2.27s/it]
  0%|          | 0/1879 [00:00<?, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0
train Loss: 0.0888 Acc: 0.0000


100%|██████████| 1879/1879 [1:13:17<00:00,  1.74s/it]


val Loss: 0.0903 Acc: 0.0000

Epoch 4/99
----------


 24%|██▎       | 4005/16911 [2:41:02<11:41:06,  3.26s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 36%|███▌      | 6009/16911 [4:09:09<11:35:04,  3.83s/it]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 47%|████▋     | 8020/16911 [5:39:53<4:25:45,  1.79s/it] 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 53%|█████▎    | 9047/16911 [6:25:48<4:30:58,  2.07s/it] 

KeyboardInterrupt: 