<a href="https://colab.research.google.com/github/edwinsoftwaredev/ai-notebooks/blob/main/spaceship-titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb

In [None]:
!pip install "ray[tune]"

In [None]:
import os
import tempfile

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset

import numpy as np
import pandas as pd

In [None]:
import wandb

# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

In [None]:
from ray import train, tune
from ray.tune.schedulers import ASHAScheduler

In [None]:
from google.colab import files

# Opens Colab's upload dialog. The following cell copies `kaggle.json` from the
# working directory, so the Kaggle API token is the file to upload here.
files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c spaceship-titanic
!unzip spaceship-titanic.zip

In [None]:
# Load the labelled competition data; the last expression displays summary statistics.
df = pd.read_csv('train.csv')

df.describe()

In [None]:
SPLIT_SEED = 42        # fixed so the train / hold-out split is reproducible
N_TRAIN_ROWS = 6000    # rows used for training; the remainder is the hold-out set

train_ids = df['PassengerId'].values

# removes PassengerId and Name
df = df.drop(columns=['PassengerId', 'Name'])

# Integer-encode text columns (pd.factorize maps missing values to -1) and
# zero-fill missing numeric values. This runs BEFORE the shuffle so the codes
# are assigned in the file's row order and do not depend on the seed.
for label, ser in df.items():
    if ser.dtype == 'object':
        df[label] = pd.factorize(ser)[0]
    else:
        df[label] = ser.fillna(0)

# Shuffle the rows so the positional split below is a random split.
# DataFrame.sample returns a new frame, so the result must be assigned;
# calling it without assignment leaves the frame in file order.
df = df.sample(frac=1, random_state=SPLIT_SEED)
# The index still holds the original row positions here; use it to keep the
# passenger ids aligned with the shuffled rows.
train_ids = train_ids[df.index.to_numpy()]
df = df.reset_index(drop=True)

X = df.drop(columns=['Transported']).values
y = df['Transported'].values

train_X = torch.tensor(X[:N_TRAIN_ROWS], dtype=torch.float32)
# unsqueeze(1) turns the label vector into a column so it matches the
# single-output shape produced by the network.
train_y = torch.tensor(y[:N_TRAIN_ROWS], dtype=torch.float32).unsqueeze(1)

test_X = torch.tensor(X[N_TRAIN_ROWS:], dtype=torch.float32)
test_y = torch.tensor(y[N_TRAIN_ROWS:], dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

In [None]:
class NN(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per row.

    Layout: input dropout -> three (Linear -> BatchNorm1d -> ReLU) stages, with
    a second dropout before the second stage -> final Linear to a single unit.
    The hidden Linear layers have no bias because the BatchNorm that follows
    each of them has its own learnable shift.

    Args:
        l1_units: width of the first hidden layer.
        l2_units: width of the second hidden layer.
        l3_units: width of the third hidden layer.
        l1_dropout: dropout probability applied to the input features.
        l2_dropout: dropout probability applied before the second hidden layer.
        in_features: number of input features per row. Defaults to 11, the
            width this notebook was written for, so existing callers and
            saved state dicts are unaffected.
    """

    def __init__(self, l1_units, l2_units, l3_units, l1_dropout, l2_dropout, in_features=11):
        super().__init__()

        # The position of every module inside this Sequential determines the
        # state_dict keys, so the order must stay stable for saved checkpoints.
        self.relu_stack = nn.Sequential(
            nn.Dropout(l1_dropout),

            nn.Linear(in_features, l1_units, bias=False),
            nn.BatchNorm1d(l1_units),
            nn.ReLU(),

            nn.Dropout(l2_dropout),

            nn.Linear(l1_units, l2_units, bias=False),
            nn.BatchNorm1d(l2_units),
            nn.ReLU(),

            nn.Linear(l2_units, l3_units, bias=False),
            nn.BatchNorm1d(l3_units),
            nn.ReLU(),

            nn.Linear(l3_units, 1)
        )

    def forward(self, x):
        """Return the raw (pre-sigmoid) logits for a batch of feature rows."""
        return self.relu_stack(x)


In [None]:
def train(dataloader: DataLoader, batch_size, model: NN, loss_fn, optimizer, lr):
    """Run one optimisation pass over ``dataloader`` and return epoch metrics.

    Every batch logs its accuracy and loss to wandb; every 100th batch also
    prints a progress line. At the end the epoch-level metrics are logged to
    wandb and returned.

    NOTE: this function's name shadows the ``train`` module imported from
    ``ray`` earlier in the notebook.

    Args:
        dataloader: batches of ``(features, targets)``.
        batch_size: nominal batch size, used only for the progress counter.
        model: network to optimise; it is put in training mode.
        loss_fn: loss applied to the model's raw outputs and the targets.
        optimizer: optimiser stepping the model's parameters.
        lr: learning rate, echoed into the returned metrics.

    Returns:
        dict with ``train_model_accuracy``, ``train_model_loss`` (mean of the
        per-batch losses) and ``learning_rate``.
    """
    n_samples = len(dataloader.dataset)

    hits = 0
    loss_total = 0

    model.train()
    for step, (features, targets) in enumerate(dataloader):
        logits = model(features)
        batch_loss = loss_fn(logits, targets)

        # backpropagation, then clear gradients for the next batch
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_loss_value = batch_loss.item()

        # Threshold the sigmoid of the logits at 0.5 to get hard predictions.
        predicted = (torch.sigmoid(logits) >= 0.5).float()
        batch_hits = (predicted == targets).sum().item()

        wandb.log({"accuracy": batch_hits / len(features), "loss": batch_loss_value})

        hits += batch_hits
        loss_total += batch_loss_value

        if step % 100 == 0:
            seen = step * batch_size + len(features)
            print(f'loss: {batch_loss_value:>7f} [{seen:>5d} /{n_samples:>5d}]')

    epoch_accuracy = hits / n_samples
    epoch_loss = loss_total / len(dataloader)

    metrics = {
        "train_model_accuracy": epoch_accuracy,
        "train_model_loss": epoch_loss,
        'learning_rate': lr,
    }

    wandb.log(metrics)

    return metrics


In [None]:
def test(dataloader: DataLoader, model, loss_fn, lr):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    test_loss, correct = 0, 0
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            probs = torch.sigmoid(model(X))
            pred = (probs >= 0.5).float()
            cur_loss = loss_fn(pred, y).item()
            cur_correct = (pred == y).sum().item()

            wandb.log({"test_accuracy": cur_correct / len(X), "test_loss": cur_loss})

            test_loss += cur_loss
            correct += cur_correct

    test_loss /= num_batches
    correct /= size

    metrics = { "test_model_accuracy": correct, "test_model_loss": test_loss, 'learning_rate':  lr }

    wandb.log(metrics)

    print(f'Test Error: \n Accuracy: {100*correct:0.1f}%, Avg loss: {test_loss:>8f} \n')

    return metrics

In [None]:
df_v = pd.read_csv('test.csv')

ids = df_v.PassengerId.values

df_v = df_v.drop(columns=['PassengerId', 'Name'])

# pd.factorize numbers categories in order of first appearance, so running it
# separately on this file would give the same category a different code than
# the one the model saw during training. Re-derive the training codes from
# train.csv (in file order, which is how the training cell assigns them) and
# apply that mapping here.
train_reference = pd.read_csv('train.csv')

for label, ser in df_v.items():
    if ser.dtype == 'object':
        _, train_categories = pd.factorize(train_reference[label])
        # Missing values and categories absent from train.csv become -1,
        # the same code factorize gives missing values in the training data.
        df_v[label] = pd.Categorical(ser, categories=train_categories).codes
    else:
        df_v[label] = ser.fillna(0)

X_val = df_v.values

X_val = torch.tensor(X_val, dtype=torch.float32)

In [None]:
def test_best_model(best_result):
    """Rebuild the best trial's network, predict on ``X_val`` and write ``submission.csv``.

    Args:
        best_result: Tune result whose ``config`` holds the architecture
            hyperparameters (``l1``, ``l2``, ``l3``, ``l1_dropout``,
            ``l2_dropout``) and whose ``checkpoint`` directory contains
            ``checkpoint.pt``, a ``(model_state, optimizer_state)`` tuple.

    Side effects:
        Writes ``submission.csv`` with columns ``PassengerId`` and
        ``Transported`` (``True``/``False``) to the working directory.
    """
    best_model = NN(
        best_result.config['l1'],
        best_result.config['l2'],
        best_result.config['l3'],
        best_result.config['l1_dropout'],
        best_result.config['l2_dropout']
    )
    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    # NOTE(review): torch.load unpickles the file; only load checkpoints produced by this notebook.
    model_state, _optimizer_state = torch.load(checkpoint_path)
    best_model.load_state_dict(model_state)

    # Switch to inference mode: otherwise Dropout keeps zeroing activations and
    # BatchNorm normalises with the statistics of the prediction batch instead
    # of the running statistics learned during training.
    best_model.eval()

    with torch.no_grad():
        probs = torch.sigmoid(best_model(X_val))
        # One boolean per passenger: drop the trailing single-output dimension.
        preds = (probs >= 0.5).squeeze(1)

    submission = pd.DataFrame({'PassengerId': ids, 'Transported': preds.numpy()})
    submission.to_csv('submission.csv', index=False)


In [None]:
def train_tuned_model(config):
    """Ray Tune trainable: train and evaluate one hyperparameter configuration.

    After every epoch the train and hold-out metrics are reported to Tune
    together with a checkpoint. Reporting per epoch is what advances
    ``training_iteration``, which a scheduler such as ASHA needs in order to
    compare trials and stop weak ones before the final epoch.

    Args:
        config: dict with ``l1``, ``l2``, ``l3`` (layer widths),
            ``l1_dropout``, ``l2_dropout``, ``lr``, ``batch_size`` and
            ``epochs``.

    Checkpoint format:
        ``checkpoint.pt`` holding the tuple
        ``(model.state_dict(), optimizer.state_dict())``. No epoch counter is
        stored, so a restored trial restarts the epoch loop from the restored
        weights.
    """
    model = NN(config['l1'], config['l2'], config['l3'], config['l1_dropout'], config['l2_dropout'])
    learning_rate = config['lr']
    batch_size = config['batch_size']
    epochs = config['epochs']

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # No shuffling for evaluation: a fixed batch composition keeps the
    # reported hold-out loss deterministic for a given model state.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    loss_fn = nn.BCEWithLogitsLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Resume model and optimiser state when Tune hands this trial a checkpoint.
    loaded_checkpoint = tune.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, 'checkpoint.pt')
            )

            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    wandb.init(project='Spaceship Titanic', config={
        "epochs": epochs,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "l1_units": config['l1'],
        "l2_units": config['l2'],
        "l3_units": config['l3'],
        "l1_dropout": config['l1_dropout'],
        "l2_dropout": config['l2_dropout']
    })

    try:
        for t in range(epochs):
            print(f'Epoch {t+1}\n---------------------')
            train_metrics = train(train_dataloader, batch_size, model, loss_fn, optimizer, learning_rate)
            print()
            test_metrics = test(test_dataloader, model, loss_fn, learning_rate)
            print()

            # The temporary directory only has to live until report() returns.
            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
                path = os.path.join(temp_checkpoint_dir, 'checkpoint.pt')
                torch.save(
                    (model.state_dict(), optimizer.state_dict()),
                    path
                )

                checkpoint = tune.Checkpoint.from_directory(temp_checkpoint_dir)
                tune.report({**train_metrics, **test_metrics}, checkpoint=checkpoint)
    finally:
        # Close the wandb run even when an epoch raises.
        wandb.finish()

    print('Done!')


In [None]:
# Search space handed to every trial. 'epochs' and 'num_trials' are plain
# constants that ride along in the same dict.
config = dict(
    l1=tune.choice([256, 512, 1024]),
    l2=tune.choice([64, 128, 256]),
    l3=tune.choice([8, 16, 32, 64]),
    l1_dropout=tune.uniform(0, 0.15),
    l2_dropout=tune.uniform(0, 0.01),
    lr=tune.loguniform(1e-4, 1e-2),
    batch_size=tune.choice([32, 64]),
    epochs=50,
    num_trials=200,
)

# ASHA measures trial progress in reported training iterations.
scheduler = ASHAScheduler(
    time_attr="training_iteration",
    grace_period=20,
    reduction_factor=2,
    max_t=config['epochs'],
)

# Wrap the trainable once, then describe the search separately.
trainable = tune.with_resources(
    tune.with_parameters(train_tuned_model),
    resources={},
)

search_settings = tune.TuneConfig(
    metric='test_model_loss',
    mode='min',
    scheduler=scheduler,
    num_samples=config['num_trials'],
)

tuner = tune.Tuner(trainable, tune_config=search_settings, param_space=config)

results = tuner.fit()

# Pick the trial with the lowest hold-out loss and summarise it.
best_result = results.get_best_result('test_model_loss', 'min')

print(f'Best trial config: { best_result.config }')
print(f'Best trial final validation loss: { best_result.metrics["test_model_loss"] }')
print(f'Best trial final validation accuracy: { best_result.metrics["test_model_accuracy"] }')

# Write the Kaggle submission from the winning trial's checkpoint.
test_best_model(best_result)