<a href="https://colab.research.google.com/github/congbrian/pytorch-cifar-DAG/blob/master/DAG_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/congbrian/pytorch-cifar-DAG.git
!git config --global user.email "brian.cong229@gmail.com"
!git config --global user.name "congbrian"

fatal: destination path 'pytorch-cifar-DAG' already exists and is not an empty directory.


In [2]:
import os
os.chdir('/content/pytorch-cifar-DAG')

In [3]:
'''Train CIFAR10 with PyTorch.'''

import torch
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.cuda.empty_cache()

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

import os
import argparse

from models import *
from utils import progress_bar

import sys
import random
import numpy as np

In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
if __name__ == '__main__':
    if any(arg.startswith('-f') for arg in sys.argv):
        # We're in Jupyter/Colab
        class Args:
            lr = 0.1
            resume = False
        args = Args()
        COLLAB_ENVIRONMENT = True
    else:
        parser = argparse.ArgumentParser()
        parser.add_argument('--lr', type=float, default=0.1)
        parser.add_argument('--resume', action='store_true')
        args = parser.parse_args()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch


In [6]:
# Data
print('==> Preparing data..')
try:
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR100(
        root='./data', train=True, download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=128, shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR100(
        root='./data', train=False, download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=100, shuffle=False, num_workers=2)

    classes = trainset.classes + testset.classes
except Exception as e:
    print(e)
else:
    print('==> Done!')

==> Preparing data..
==> Done!


In [13]:
# Training
def train(epoch, model):
    print('\nEpoch: %d' % epoch)
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
    if COLLAB_ENVIRONMENT:
        # moving loss per epoch
        print('Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))

def test(epoch, model):
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                         % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    if COLLAB_ENVIRONMENT:
        print('Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'model': model.state_dict(),
            'name': {model_fn.__name__ if hasattr(model_fn, '__name__') else model_fn.__class__.__name__},
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.pth')
        best_acc = acc



In [14]:
models = [ResNet18, GoogLeNet, lambda num_classes: VGG('VGG19', num_classes)]
# Model
print('==> Building model..')
# net = VGG('VGG19')
# net = ResNet18()
# net = PreActResNet18()
# net = GoogLeNet()
# net = DenseNet121()
# net = ResNeXt29_2x64d()
# net = MobileNet()
# net = MobileNetV2()
# net = DPN92()
# net = ShuffleNetG2()
# net = SENet18()
# net = ShuffleNetV2(1)
# net = EfficientNetB0()
# net = RegNetX_200MF()
# net = SimpleDLA(num_classes=len(classes))
for model_fn in models:
  print(f"\n==> Training model: {model_fn.__name__ if hasattr(model_fn, '__name__') else model_fn.__class__.__name__}")
  model = model_fn(num_classes=len(classes))
  model = model.to(device)
  if device == 'cuda':
      model = torch.nn.DataParallel(model)
      cudnn.benchmark = True

  if args.resume:
      # Load checkpoint.
      print('==> Resuming from checkpoint..')
      assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
      checkpoint = torch.load('./checkpoint/ckpt.pth')
      model.load_state_dict(checkpoint['net'])
      best_acc = checkpoint['acc']
      start_epoch = checkpoint['epoch']

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=args.lr,
                        momentum=0.9, weight_decay=5e-4)
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)


  for epoch in range(start_epoch, start_epoch+200):
      train(epoch, model)
      test(epoch, model)
      scheduler.step()

==> Building model..

==> Training model: ResNet18

Epoch: 0
Loss: 3.981 | Acc: 9.144% (4572/50000)
Loss: 3.534 | Acc: 15.380% (1538/10000)
Saving..

Epoch: 1
Loss: 3.254 | Acc: 19.642% (9821/50000)
Loss: 3.095 | Acc: 23.090% (2309/10000)
Saving..

Epoch: 2
Loss: 2.670 | Acc: 30.734% (15367/50000)
Loss: 2.536 | Acc: 33.780% (3378/10000)
Saving..

Epoch: 3
Loss: 2.246 | Acc: 39.508% (19754/50000)
Loss: 2.231 | Acc: 40.850% (4085/10000)
Saving..

Epoch: 4
Loss: 1.963 | Acc: 46.104% (23052/50000)
Loss: 2.163 | Acc: 42.230% (4223/10000)
Saving..

Epoch: 5
Loss: 1.793 | Acc: 50.060% (25030/50000)
Loss: 2.045 | Acc: 46.070% (4607/10000)
Saving..

Epoch: 6


KeyboardInterrupt: 

In [15]:
!git add .
!git commit -m "Try-catch to fix popen with collab environment"

[master aebaece] Try-catch to fix popen with collab environment
 15 files changed, 24 insertions(+), 14 deletions(-)
 create mode 100644 checkpoint/ckpt.pth
 create mode 100644 data/cifar-100-python.tar.gz
 create mode 100644 data/cifar-100-python/file.txt~
 create mode 100644 data/cifar-100-python/meta
 create mode 100644 data/cifar-100-python/test
 create mode 100644 data/cifar-100-python/train


In [16]:
import os
from google.colab import userdata
token = userdata.get('GithubToken')
repo = "congbrian/pytorch-cifar-DAG"
os.system(f"git push https://{token}@github.com/{repo}.git master")

256