In [7]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

from ray import tune
from ray.tune.schedulers import ASHAScheduler

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # In this example, we don't change the model architecture
        # due to simplicity.
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 3))
        x = x.view(-1, 192)
        x = self.fc(x)
        out = F.log_softmax(x, dim=1)
        return out



# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 512
TEST_SIZE = 256

def train(model, optimizer, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # We set this just for the example to run quickly.
        if batch_idx * len(data) > EPOCH_SIZE:
            return
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()


def test(model, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            # We set this just for the example to run quickly.
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    return correct / total

def train_mnist(config):
    # Data Setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = DataLoader(
        datasets.MNIST("/Users/xichao.chen/Work/source/T-1000x/data", train=True, download=False, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)
    test_loader = DataLoader(
        datasets.MNIST("/Users/xichao.chen/Work/source/T-1000x/data", train=False, download=False, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(50):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)

        # Send the current training result back to Tune
        tune.report(mean_accuracy=acc, hello="world")

        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model.state_dict(), "/Users/xichao.chen/Work/source/T-1000x/example/model.pth")

search_space = {
    "lr": tune.sample_from(lambda spec: 10**(-10 * np.random.rand())),
    "momentum": tune.uniform(0.1, 0.9)
}

# Uncomment this to enable distributed execution
# `ray.init(address="auto")`

# Download the dataset first
datasets.MNIST("/Users/xichao.chen/Work/source/T-1000x/data", train=True, download=False)



Dataset MNIST
    Number of datapoints: 60000
    Root location: /Users/xichao.chen/Work/source/T-1000x/data
    Split: Train

In [None]:
from hyperopt import hp
from ray.tune.suggest.hyperopt import HyperOptSearch

space = {
    "lr": hp.loguniform("lr", 1e-10, 0.1),
    "momentum": hp.uniform("momentum", 0.1, 0.9),
}

hyperopt_search = HyperOptSearch(space, metric="mean_accuracy", mode="max")

analysis = tune.run(train_mnist, num_samples=10, search_alg=hyperopt_search)

# Obtain a trial dataframe from all run trials of this `tune.run` call.
dfs = analysis.trial_dataframes

# Plot by epoch
ax = None  # This plots everything on the same plot
for d in dfs.values():
    ax = d.mean_accuracy.plot(ax=ax, legend=False)

Trial name,status,loc,lr,momentum
train_mnist_49816ba6,RUNNING,,1.01796,0.763767


Result for train_mnist_4984ec04:
  date: 2021-03-23_18-34-05
  done: false
  experiment_id: ca7202039db748d98615f87f3abffeff
  hello: world
  hostname: MacBook
  iterations_since_restore: 1
  mean_accuracy: 0.0375
  node_ip: 192.168.139.23
  pid: 51519
  time_since_restore: 0.37584495544433594
  time_this_iter_s: 0.37584495544433594
  time_total_s: 0.37584495544433594
  timestamp: 1616495645
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 4984ec04
  
Result for train_mnist_49816ba6:
  date: 2021-03-23_18-34-05
  done: false
  experiment_id: 7fafd1f1520a47e7bb2c23f6c7a0f9f1
  hello: world
  hostname: MacBook
  iterations_since_restore: 1
  mean_accuracy: 0.0125
  node_ip: 192.168.139.23
  pid: 51518
  time_since_restore: 0.38965916633605957
  time_this_iter_s: 0.38965916633605957
  time_total_s: 0.38965916633605957
  timestamp: 1616495645
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 49816ba6
  
Result for train_mnist_49890320:
  date: 2021-03-23_18-3

Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_49816ba6,RUNNING,192.168.139.23:51518,1.01796,0.763767,0.096875,9,3.08857
train_mnist_4984ec04,RUNNING,192.168.139.23:51519,1.05951,0.479947,0.1,9,3.06063
train_mnist_498634c4,RUNNING,192.168.139.23:51551,1.06105,0.621933,0.125,7,2.47092
train_mnist_49878b76,RUNNING,192.168.139.23:51550,1.02017,0.274981,0.125,8,2.71146
train_mnist_49890320,RUNNING,192.168.139.23:51553,1.03939,0.223693,0.11875,8,2.70072
train_mnist_498ab21a,RUNNING,192.168.139.23:51561,1.09555,0.106037,0.0875,8,2.69388
train_mnist_498d9232,RUNNING,192.168.139.23:51562,1.10374,0.547219,0.1125,8,2.7034
train_mnist_498f3c54,RUNNING,192.168.139.23:51563,1.08723,0.350248,0.084375,7,2.43361
train_mnist_4991043a,RUNNING,192.168.139.23:51570,1.04296,0.710818,0.06875,7,2.35757
train_mnist_49a39c58,RUNNING,192.168.139.23:51571,1.00999,0.530386,0.084375,7,2.36994


Result for train_mnist_49816ba6:
  date: 2021-03-23_18-34-10
  done: false
  experiment_id: 7fafd1f1520a47e7bb2c23f6c7a0f9f1
  hello: world
  hostname: MacBook
  iterations_since_restore: 16
  mean_accuracy: 0.121875
  node_ip: 192.168.139.23
  pid: 51518
  time_since_restore: 5.418284177780151
  time_this_iter_s: 0.34658384323120117
  time_total_s: 5.418284177780151
  timestamp: 1616495650
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 49816ba6
  
Result for train_mnist_4984ec04:
  date: 2021-03-23_18-34-10
  done: false
  experiment_id: ca7202039db748d98615f87f3abffeff
  hello: world
  hostname: MacBook
  iterations_since_restore: 16
  mean_accuracy: 0.121875
  node_ip: 192.168.139.23
  pid: 51519
  time_since_restore: 5.426605939865112
  time_this_iter_s: 0.37284183502197266
  time_total_s: 5.426605939865112
  timestamp: 1616495650
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 4984ec04
  
Result for train_mnist_498d9232:
  date: 2021-03-23_18-3

Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_49816ba6,RUNNING,192.168.139.23:51518,1.01796,0.763767,0.075,24,8.12855
train_mnist_4984ec04,RUNNING,192.168.139.23:51519,1.05951,0.479947,0.096875,24,8.14848
train_mnist_498634c4,RUNNING,192.168.139.23:51551,1.06105,0.621933,0.11875,22,7.67705
train_mnist_49878b76,RUNNING,192.168.139.23:51550,1.02017,0.274981,0.103125,23,7.80779
train_mnist_49890320,RUNNING,192.168.139.23:51553,1.03939,0.223693,0.090625,23,7.73024
train_mnist_498ab21a,RUNNING,192.168.139.23:51561,1.09555,0.106037,0.096875,22,7.4491
train_mnist_498d9232,RUNNING,192.168.139.23:51562,1.10374,0.547219,0.0875,22,7.43252
train_mnist_498f3c54,RUNNING,192.168.139.23:51563,1.08723,0.350248,0.103125,22,7.553
train_mnist_4991043a,RUNNING,192.168.139.23:51570,1.04296,0.710818,0.09375,22,7.43307
train_mnist_49a39c58,RUNNING,192.168.139.23:51571,1.00999,0.530386,0.1375,22,7.48059


Result for train_mnist_49816ba6:
  date: 2021-03-23_18-34-15
  done: false
  experiment_id: 7fafd1f1520a47e7bb2c23f6c7a0f9f1
  hello: world
  hostname: MacBook
  iterations_since_restore: 31
  mean_accuracy: 0.11875
  node_ip: 192.168.139.23
  pid: 51518
  time_since_restore: 10.591087102890015
  time_this_iter_s: 0.3503279685974121
  time_total_s: 10.591087102890015
  timestamp: 1616495655
  timesteps_since_restore: 0
  training_iteration: 31
  trial_id: 49816ba6
  
Result for train_mnist_4984ec04:
  date: 2021-03-23_18-34-15
  done: false
  experiment_id: ca7202039db748d98615f87f3abffeff
  hello: world
  hostname: MacBook
  iterations_since_restore: 31
  mean_accuracy: 0.10625
  node_ip: 192.168.139.23
  pid: 51519
  time_since_restore: 10.650273084640503
  time_this_iter_s: 0.3529510498046875
  time_total_s: 10.650273084640503
  timestamp: 1616495655
  timesteps_since_restore: 0
  training_iteration: 31
  trial_id: 4984ec04
  
Result for train_mnist_498634c4:
  date: 2021-03-23_18-3

Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_49816ba6,RUNNING,192.168.139.23:51518,1.01796,0.763767,0.1125,39,13.4585
train_mnist_4984ec04,RUNNING,192.168.139.23:51519,1.05951,0.479947,0.146875,39,13.4936
train_mnist_498634c4,RUNNING,192.168.139.23:51551,1.06105,0.621933,0.1,36,12.7542
train_mnist_49878b76,RUNNING,192.168.139.23:51550,1.02017,0.274981,0.1,37,12.739
train_mnist_49890320,RUNNING,192.168.139.23:51553,1.03939,0.223693,0.109375,37,12.7103
train_mnist_498ab21a,RUNNING,192.168.139.23:51561,1.09555,0.106037,0.103125,37,12.8185
train_mnist_498d9232,RUNNING,192.168.139.23:51562,1.10374,0.547219,0.13125,37,12.7783
train_mnist_498f3c54,RUNNING,192.168.139.23:51563,1.08723,0.350248,0.11875,36,12.5665
train_mnist_4991043a,RUNNING,192.168.139.23:51570,1.04296,0.710818,0.11875,36,12.4406
train_mnist_49a39c58,RUNNING,192.168.139.23:51571,1.00999,0.530386,0.115625,36,12.5198
