<a href="https://colab.research.google.com/github/edwinsoftwaredev/ai-notebooks/blob/main/digit_recognizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb

In [None]:
!pip install "ray[tune]"

In [None]:
import os
import tempfile

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

import numpy as np
import pandas as pd

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
import wandb

# Authenticate with Weights & Biases up front; the training code later in the
# notebook calls wandb.init / wandb.log / wandb.finish to record each run.
wandb.login()

In [None]:
import ray
from ray import train, tune
from ray.tune.schedulers import ASHAScheduler

In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API credentials copied into ~/.kaggle by the
# next cell). files.upload() returns a {filename: bytes} dict; bind it to a
# name so the cell does not echo the file contents -- i.e. the API key -- into
# the notebook output, where it would be saved and shared with the notebook.
uploaded = files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c digit-recognizer
!unzip digit-recognizer.zip

In [None]:
# Load the labelled training data unpacked from digit-recognizer.zip. The
# frame has a 'label' column (the target) plus the feature columns.
df = pd.read_csv('train.csv')

# Summary statistics as a quick sanity check of the loaded frame.
df.describe()

In [None]:
# Shuffle the rows once before the train/hold-out split. A fixed random_state
# makes the split reproducible across kernel restarts, and writing the result
# to a new name (instead of rebinding `df`) keeps this cell idempotent:
# re-running it always yields the same permutation of the loaded frame.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Target vector and feature matrix as NumPy arrays.
y = df_shuffled['label'].values
X = df_shuffled.drop('label', axis=1).values

In [None]:
# Hold-out split: the first 80% of the rows are used for training, the
# remaining 20% for evaluation.
split_at = int(len(X) * 0.8)

# Features become float32 tensors, labels int64 (long) class indices as
# expected by nn.CrossEntropyLoss.
train_X = torch.tensor(X[:split_at], dtype=torch.float32)
train_y = torch.tensor(y[:split_at], dtype=torch.long)

test_X = torch.tensor(X[split_at:], dtype=torch.float32)
test_y = torch.tensor(y[split_at:], dtype=torch.long)

train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

# Put both datasets into the Ray object store; trials fetch them with
# ray.get(...) via these references.
train_dataset_ref = ray.put(train_dataset)
test_dataset_ref = ray.put(test_dataset)

In [None]:
# Unlabelled rows from test.csv, converted to a float32 tensor that is later
# passed to test_best_model to produce the submission file.
submission_frame = pd.read_csv('test.csv')
df_sub = submission_frame.values
submission_dataset = torch.tensor(df_sub, dtype=torch.float32)

In [None]:
class NN(nn.Module):
    """Fully connected classifier: 784 inputs -> two hidden ReLU layers -> 10 logits.

    Args:
        l1_units: Width of the first hidden layer.
        l2_units: Width of the second hidden layer.
        l1_dropout: Dropout probability applied after the first hidden layer.
    """

    def __init__(self, l1_units, l2_units, l1_dropout):
        super().__init__()

        # The order of the layers is significant: it determines the
        # state_dict keys (relu_stack.0, relu_stack.3, relu_stack.5).
        layers = [
            nn.Linear(784, l1_units),
            nn.ReLU(),
            nn.Dropout(l1_dropout),
            nn.Linear(l1_units, l2_units),
            nn.ReLU(),
            nn.Linear(l2_units, 10),
        ]
        self.relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for a batch `x`."""
        logits = self.relu_stack(x)
        return logits


In [None]:
def raytune_load_checkpoint(model: NN, optimizer: torch.optim.Adam):
    """Restore model and optimizer state from the trial's Ray Tune checkpoint.

    Does nothing when the current trial has no checkpoint. Otherwise loads
    'checkpoint.pt' (a ``(model_state, optimizer_state)`` tuple, as written by
    raytune_save_checkpoint) and applies both state dicts in place.

    Args:
        model: Network whose parameters are overwritten.
        optimizer: Optimizer whose state is overwritten.
    """
    # Query Ray once and reuse the handle.
    loaded_checkpoint = tune.get_checkpoint()
    if not loaded_checkpoint:
        return

    with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
        # map_location remaps the stored tensors onto the device this process
        # uses, so a checkpoint written on a GPU still loads on a CPU-only
        # worker (and vice versa).
        model_state, optimizer_state = torch.load(
            os.path.join(loaded_checkpoint_dir, 'checkpoint.pt'),
            map_location=device
        )

        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)


In [None]:
def raytune_save_checkpoint(model: NN, optimizer: torch.optim.Adam, train_metrics, test_metrics):
    """Write a checkpoint and report it to Ray Tune together with the metrics.

    The checkpoint is a single file, 'checkpoint.pt', holding the tuple
    ``(model.state_dict(), optimizer.state_dict())``.

    Args:
        model: Network whose parameters are saved.
        optimizer: Optimizer whose state is saved.
        train_metrics: Mapping of training metrics.
        test_metrics: Mapping of evaluation metrics; on a key collision these
            values win because they are merged last.
    """
    merged_metrics = {**train_metrics, **test_metrics}
    states = (model.state_dict(), optimizer.state_dict())

    # The temporary directory only has to live until tune.report returns.
    with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
        torch.save(states, os.path.join(temp_checkpoint_dir, 'checkpoint.pt'))
        tune.report(
            merged_metrics,
            checkpoint=tune.Checkpoint.from_directory(temp_checkpoint_dir)
        )


In [None]:
def train(dataloader: DataLoader, model: NN, loss_fn, optimizer, epoch):
    """Run one training epoch and log its metrics to Weights & Biases.

    Args:
        dataloader: Yields ``(X, y)`` batches of features and class indices.
        model: Network to optimise (switched to train mode here).
        loss_fn: Loss callable; assumed to return the *mean* loss of a batch
            (the default reduction of nn.CrossEntropyLoss).
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index, recorded alongside the metrics.

    Returns:
        dict with 'train_accuracy', 'train_loss' and 'epoch'.
    """
    correct = 0
    loss_sum = 0.0
    seen = 0

    model.train()
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)

        output = model(X)
        loss = loss_fn(output, y)

        # backprop
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # check results
        # softmax is monotonic, so the argmax of the logits is already the
        # predicted class; detach() keeps the bookkeeping out of autograd.
        n_batch = y.size(0)
        correct += (output.detach().argmax(1) == y).sum().item()

        # Weight each batch mean by its size so that a smaller final batch
        # does not count as much as a full one: the result is a true
        # per-sample average, comparable across batch sizes.
        loss_sum += loss.item() * n_batch
        seen += n_batch

    accuracy = correct / seen   # correct / samples actually processed
    avg_loss = loss_sum / seen  # per-sample mean loss

    metrics = { 'train_accuracy': accuracy, 'train_loss': avg_loss, 'epoch': epoch }

    wandb.log(metrics)

    return metrics


In [None]:
def test(dataloader: DataLoader, model: NN, loss_fn, epoch):
    """Evaluate the model on a held-out set and log the metrics to W&B.

    Args:
        dataloader: Yields ``(X, y)`` batches of features and class indices.
        model: Network to evaluate (switched to eval mode here).
        loss_fn: Loss callable; assumed to return the *mean* loss of a batch
            (the default reduction of nn.CrossEntropyLoss).
        epoch: Epoch index, recorded alongside the metrics.

    Returns:
        dict with 'test_accuracy', 'test_loss' and 'epoch'.
    """
    correct = 0
    loss_sum = 0.0
    seen = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            output = model(X)
            loss = loss_fn(output, y)

            # softmax is monotonic, so the argmax of the logits is already
            # the predicted class.
            n_batch = y.size(0)
            correct += (output.argmax(1) == y).sum().item()

            # Weight each batch mean by its size: 'test_loss' is compared
            # between runs that use different batch sizes, so it must be a
            # per-sample average rather than a mean of batch means in which
            # the smaller final batch is over-weighted.
            loss_sum += loss.item() * n_batch
            seen += n_batch

    test_correct = correct / seen   # correct / samples actually processed
    test_loss = loss_sum / seen     # per-sample mean loss

    metrics = { 'test_accuracy': test_correct, 'test_loss': test_loss, 'epoch': epoch }

    wandb.log(metrics)

    return metrics


In [None]:
def test_best_model(best_result, input):
    """Predict with the best trial's model and write 'submission.csv'.

    Rebuilds the network from the best trial's hyperparameters, loads the
    weights from that trial's checkpoint and writes one ``ImageId,Label`` row
    per input row (ImageId is 1-based).

    Args:
        best_result: Ray Tune result exposing ``config`` (keys 'l1', 'l2',
            'l1_dropout') and ``checkpoint``.
        input: Feature tensor with one row per sample to classify.
    """
    best_model = NN(
        best_result.config['l1'],
        best_result.config['l2'],
        best_result.config['l1_dropout']
    )

    best_model.to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), 'checkpoint.pt')
    # Only the model weights are needed for inference. map_location lets a
    # checkpoint written on a GPU be loaded when this process runs on CPU.
    model_state, _ = torch.load(checkpoint_path, map_location=device)
    best_model.load_state_dict(model_state)

    # A freshly built module is in training mode, where nn.Dropout randomly
    # zeroes activations. torch.no_grad() does NOT turn that off, so switch to
    # eval mode explicitly to get deterministic predictions.
    best_model.eval()
    with torch.no_grad():
        output = best_model(input.to(device))
        # argmax of the logits equals argmax of their softmax.
        y_pred = output.argmax(1).tolist()

    submission = [['ImageId', 'Label']]
    submission.extend([image_id, label] for image_id, label in enumerate(y_pred, start=1))

    np.savetxt('submission.csv', submission, delimiter=',', fmt='%s')


In [None]:
def train_tuned_model(config):
    """Ray Tune trainable: train and evaluate one hyperparameter configuration.

    Args:
        config: Sampled hyperparameters with the keys 'lr', 'epochs',
            'batch_size', 'l1', 'l2' and 'l1_dropout'.

    After every epoch the train/test metrics and a checkpoint are reported to
    Ray Tune. Reporting once per epoch is what advances 'training_iteration',
    the attribute an iteration-based scheduler such as ASHA uses to compare
    trials and stop weak ones early; a single report at the very end would
    leave the scheduler with nothing to act on.
    """
    lr = config['lr']
    epochs = config['epochs']
    batch_size = config['batch_size']

    model = NN(
        config['l1'],
        config['l2'],
        config['l1_dropout']
    )

    model.to(device)

    # loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # test and train data loaders
    train_dataloader = DataLoader(ray.get(train_dataset_ref), batch_size=batch_size, shuffle=True)
    # Evaluation does not benefit from shuffling; a fixed order keeps the
    # reported test metrics deterministic for a given model state.
    test_dataloader = DataLoader(ray.get(test_dataset_ref), batch_size=batch_size, shuffle=False)

    # ray tune load checkpoint
    raytune_load_checkpoint(model, optimizer)

    # init logging
    wandb.init(project='Digit Recognizer', group='experiment_3', config=config)

    try:
        # train/test iterations
        for t in range(epochs):
            train_metrics = train(train_dataloader, model, loss_fn, optimizer, t)
            test_metrics = test(test_dataloader, model, loss_fn, t)

            # ray tune save checkpoint + report metrics for this epoch
            raytune_save_checkpoint(model, optimizer, train_metrics, test_metrics)
    finally:
        # finish logging even when an epoch raises, so the W&B run is closed
        wandb.finish()

    print('Done!')

In [None]:
# Hyperparameter search space. 'epochs' and 'num_trials' are plain constants:
# 'epochs' is read by the trainable and by the scheduler (max_t), 'num_trials'
# is the number of sampled configurations.
config = {
    'l1': tune.choice([256,512]),
    'l2': tune.choice([32,64,128]),
    'l1_dropout': tune.uniform(0, 0.5),
    'lr': tune.loguniform(5e-4,1e-3),
    'batch_size': tune.choice([32,64,128]),
    'epochs': 20,
    'num_trials': 30
}

# ASHA compares trials by their reported 'training_iteration'.
scheduler = ASHAScheduler(
    time_attr='training_iteration',
    max_t=config['epochs'],
    grace_period=10,
    reduction_factor=2
)

# Request a GPU per trial only when one is actually available. The rest of the
# notebook already falls back to the CPU via `device`; asking for a GPU on a
# CPU-only runtime would leave the trials unschedulable.
trial_resources = { 'cpu': 2, 'gpu': 1 if torch.cuda.is_available() else 0 }

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_tuned_model),
        resources=trial_resources
    ),

    tune_config=tune.TuneConfig(
        metric='test_loss',
        mode='min',
        scheduler=scheduler,
        num_samples=config['num_trials']
    ),

    param_space=config
)

results = tuner.fit()

# Select the trial with the lowest reported held-out loss.
best_result = results.get_best_result('test_loss', 'min')

print(f'Best trial config: { best_result.config }')
print(f'Best trial final validation loss: { best_result.metrics["test_loss"] }')
print(f'Best trial final validation accuracy: { best_result.metrics["test_accuracy"] }')

# Write submission.csv with the best model's predictions for test.csv.
test_best_model(best_result, submission_dataset)

[Model visualization](https://wandb.ai/edwinsoftwaredev-personal/Digit%20Recognizer)