# Training a Model Using Kubeflow
Following [this](https://github.com/manceps/fashion-mnist-kfp-lab/blob/master/KF_Fashion_MNIST.ipynb) tutorial.
This notebook is designed to be run inside a Kubeflow notebook server.

In [1]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [2]:
# Check that the `dsl-compile` command-line tool is available on PATH.
!which dsl-compile

/opt/conda/bin/dsl-compile


In [3]:
def train(data_dir, model_file):
    """Fine-tune a pretrained ResNet-18 on the project dataset and save the best model.

    The final fully connected layer is replaced with one sized to the number
    of classes returned by ``get_datasets``. The network is trained for a
    fixed number of epochs, and the weights that achieved the highest
    validation accuracy are restored before the model is written to disk.

    Args:
        data_dir: Dataset location, forwarded unchanged to ``get_datasets``.
        model_file: Destination path for the saved model. Missing parent
            directories are created before training starts.

    Returns:
        None. Progress is printed and the model is written to ``model_file``.

    Raises:
        ValueError: If the 'train' or 'val' split is reported as empty.
    """
    # All imports stay inside the function: it is handed to
    # ``comp.func_to_container_op`` in this notebook, which needs the
    # function to be self-contained (see the KFP SDK documentation).
    import copy
    import os

    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.optim import lr_scheduler
    from torchvision import models

    from indoor_outdoor_pytorch.datasets import get_datasets

    # Make sure the output directory exists *before* training, so a bad
    # destination fails immediately instead of after every epoch has run.
    model_path = str(model_file)
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # get data
    dataloaders, dataset_sizes, class_names = get_datasets(data_dir)

    # An empty split would otherwise surface as an opaque division error
    # when the per-epoch statistics are computed.
    for phase in ('train', 'val'):
        if dataset_sizes[phase] <= 0:
            raise ValueError(
                "The '{}' split in {!r} is empty".format(phase, data_dir))

    # Download model and replace the FC layer with one sized to our classes
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, len(class_names))
    model_ft = model_ft.to(device)

    # Loss and optimizer; the learning rate is scaled by 0.1 every 7 epochs
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    # Train, remembering the weights with the best validation accuracy
    best_model_wts = copy.deepcopy(model_ft.state_dict())
    best_acc = 0.0

    num_epochs = 3
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model_ft.train()  # Set model to training mode
            else:
                model_ft.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0  # plain int; batches add Python ints below

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; only track gradient history while training
                with torch.set_grad_enabled(is_training):
                    outputs = model_ft(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is weighted by batch size so the
                # epoch average is per-sample
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            # Advance the learning-rate schedule once per training epoch
            if is_training:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model when validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model_ft.state_dict())

        print()

    # load best model weights
    model_ft.load_state_dict(best_model_wts)

    # Save the whole module object (not only its state_dict), as before
    torch.save(model_ft, model_path)

In [5]:
# Smoke-test the training function locally before turning it into a component
train(data_dir='../data', model_file='../models/resnet.pt')

Epoch 0/2
----------
train Loss: 0.7868 Acc: 0.6675
val Loss: 0.5649 Acc: 0.7796

Epoch 1/2
----------
train Loss: 0.7245 Acc: 0.7725
val Loss: 0.7030 Acc: 0.7312

Epoch 2/2
----------
train Loss: 0.6368 Acc: 0.7750
val Loss: 0.7016 Acc: 0.8065



## Build Container Components

In [6]:
# Turn the training function into a pipeline component that runs in the
# 'indoor_outdoor:latest' image.
train_op = comp.func_to_container_op(
    train,
    base_image='indoor_outdoor:latest',
)

## Build Pipeline

In [24]:
# Create the Kubeflow Pipelines client with no explicit endpoint.
# An earlier attempt passed host='pipelines-api.kubeflow.svc.cluster.local:8000'.
# NOTE(review): the recorded output is "Failed to load kube config.", so
# presumably no cluster configuration was found here; confirm this runs
# inside the Kubeflow notebook server or pass `host=` explicitly.
client = kfp.Client()

Failed to load kube config.


In [20]:
# Pipeline definition: create a shared volume, train, then list the volume.
@dsl.pipeline(
    name='Indoor Outdoor Pipeline',
    description='Train a classifier that can classify scenes',
)
def indoor_outdoor_pipeline(data_path: str, model_file: str):
    """Train the scene classifier and then list the contents of ``data_path``.

    Args:
        data_path: Passed to the training component as its data directory
            and used as the key under which the shared volume is attached
            to both steps.
        model_file: Passed to the training component as its output path.
    """
    # Volume shared between the pipeline steps
    shared_volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWM,
    )

    # Training step with the shared volume attached at data_path
    train_task = train_op(data_path, model_file)
    train_task = train_task.add_pvolumes({data_path: shared_volume_op.volume})

    # Final step: hands `ls <data_path>` to the image as arguments. It reuses
    # the training step's pvolume, which presumably orders it after training
    # (confirm against the KFP SDK docs).
    listing_task = dsl.ContainerOp(
        name='end_function',
        image='indoor_outdoor:latest',
        pvolumes={data_path: train_task.pvolume},
        arguments=['ls', f'{data_path}'],
    )

In [21]:
# Pipeline arguments (fed to `indoor_outdoor_pipeline` via `arguments`).
# NOTE(review): the pipeline also uses data_path as the key when attaching
# the shared volume, so this relative path presumably becomes a mount path
# inside the step containers; confirm it resolves as intended.
DATA_PATH = '../data'
# NOTE(review): this path is not under DATA_PATH, so the saved model may not
# land on the shared volume; verify where the training container writes it.
MODEL_FILE = 'models/resnet_kf.pt'

In [22]:
# Alias the pipeline function; the compile/submit cell refers only to this name.
pipeline_func = indoor_outdoor_pipeline

In [23]:
# Names for the experiment and this run
experiment_name = 'test_kubeflow'
run_name = pipeline_func.__name__ + ' run'

# Values bound to the pipeline's parameters
arguments = {
    "data_path": DATA_PATH,
    "model_file": MODEL_FILE,
}

# Compile the pipeline into a zipped package named after the experiment.
kfp.compiler.Compiler().compile(
    pipeline_func,
    '{}.zip'.format(experiment_name),
)

# Submit a run built directly from the pipeline function
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)



MaxRetryError: HTTPConnectionPool(host='pipelines-api.kubeflow.svc.cluster.local', port=8000): Max retries exceeded with url: /apis/v1beta1/experiments (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f98fe5f0410>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))