# Training
We need a `Dataset` class that loads the data, and a training script with the appropriate configuration.

In [22]:
import torch
from torch import nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from titanic_mlp.mlp import MLP

## Dataset and Preprocessing
Using the `Dataset` class makes it easier to load our data during training and validation.

In [23]:
class TitanicDataSet(Dataset):
    """Train or validation slice of a Titanic frame, exposed as float32 tensors.

    Args:
        train: If truthy, keep the rows before position ``idx``; otherwise keep
            the rows from position ``idx`` onwards.
        idx: Row position at which ``dataframe`` is split.
        dataframe: Frame holding a ``Survived`` label column; every other
            column is treated as a feature and must be castable to float.
        normalize_fn: Callable ``(features, stats) -> features``. Pass a falsy
            value to skip normalization.
        retrieve_stats: Callable ``(frame) -> stats``. Only called when
            ``normalize_fn`` is set and ``stats`` is not supplied.
        stats: Optional precomputed statistics handed to ``normalize_fn``.
            Supplying the training split's statistics here lets a validation
            split be normalized consistently with the training split. When
            omitted, the statistics are computed from this split's own rows.

    Attributes:
        data: The selected slice of ``dataframe``.
        labels: 1-D float32 tensor built from the ``Survived`` column.
        features: 2-D float32 tensor with one row per sample.
        stats: The statistics used for normalization, or ``None``.
    """

    def __init__(self, train, idx, dataframe, normalize_fn, retrieve_stats, stats=None):
        # Positional split: rows [0, idx) for training, [idx, end) for validation.
        self.data = dataframe.iloc[:idx] if train else dataframe.iloc[idx:]
        self.labels = torch.tensor(self.data['Survived'].values, dtype=torch.float32)

        # ``drop`` returns a new frame, so ``self.data`` itself is left untouched.
        features = self.data.drop('Survived', axis=1)

        self.stats = stats
        if normalize_fn:
            if self.stats is None:
                self.stats = retrieve_stats(self.data)
            features = normalize_fn(features, self.stats)

        # Both branches go through the same conversion. float32 matches the
        # dtype of ``labels`` and PyTorch's default parameter dtype.
        self.features = torch.tensor(features.astype(float).values, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

We also need to normalize the columns `Age` and `Fare` to a mean of 0 and a std of 1 using Z-score normalization.
$$
Z = \frac{X - \mu}{\sigma}
$$

In [24]:
def normalize_data(data, stat):
    """Return a copy of ``data`` with ``Age`` and ``Fare`` Z-score normalized.

    Args:
        data: DataFrame containing ``Age`` and ``Fare`` columns.
        stat: Sequence ``(age_mean, age_std, fare_mean, fare_std)``.

    Returns:
        A new DataFrame with the same columns in the same order. The input
        frame is not modified, so re-running a cell that calls this function
        does not normalize the same data twice.

    Note:
        There is no guard against a zero or NaN standard deviation; such a
        value propagates into the result as inf/NaN.
    """
    a_m, a_s, f_m, f_s = stat

    # ``assign`` builds a new frame instead of writing into the caller's one.
    return data.assign(
        Age=(data['Age'] - a_m) / a_s,
        Fare=(data['Fare'] - f_m) / f_s,
    )

def retrieve_stats(data):
    """Collect the mean and standard deviation of ``Age`` and ``Fare``.

    Args:
        data: DataFrame containing ``Age`` and ``Fare`` columns.

    Returns:
        A list ``[age_mean, age_std, fare_mean, fare_std]``.
    """
    stats = []
    for column in ('Age', 'Fare'):
        series = data[column]
        stats.extend((series.mean(), series.std()))
    return stats

We will need another function to process raw data during inference.

In [25]:
def process_raw_data(raw_data, stat):
    """Turn raw passenger records into the model's feature tensor.

    Args:
        raw_data: A DataFrame (one row per passenger) or a single passenger as
            a Series. Must provide ``Pclass``, ``Sex``, ``Age``, ``SibSp``,
            ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``. A ``Survived``
            column is not required and is ignored when present.
        stat: ``(age_mean, age_std, fare_mean, fare_std)`` passed on to
            ``normalize_data``.

    Returns:
        A float32 tensor of shape ``(n_rows, len(MASTER_COLUMNS))`` whose
        columns follow the order of ``MASTER_COLUMNS``.

    Raises:
        KeyError: If one of the required raw columns is missing.
    """
    MASTER_COLUMNS = [
        'Age', 'Fare', 'FamilySize', 'IsAlone_1',
        'Pclass_2', 'Pclass_3', 'Sex_male',
        'Deck_DE', 'Deck_FG', 'Deck_U',
        'Embarked_Q', 'Embarked_S']

    # Labels are not available at inference time, so 'Survived' is not
    # demanded here; it never reaches the output anyway.
    RAW_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

    # Deck letter -> deck group. 'T' is folded into the unknown group 'U'.
    # NOTE(review): the previous code also listed 'T' next to 'A' for the
    # 'ABC' group, but 'T' had already been turned into 'U' by then, so
    # 'T' -> 'U' is the effective behavior. Confirm it matches how the
    # training CSV was built.
    DECK_GROUPS = {
        'T': 'U',
        'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
        'D': 'DE', 'E': 'DE',
        'F': 'FG', 'G': 'FG',
    }

    if isinstance(raw_data, pd.Series):
        raw_data = raw_data.to_frame().T

    # Explicit copy so the column assignments below never touch the caller's frame.
    raw_data = raw_data[RAW_COLUMNS].copy()

    # normalize raw_data with stat
    raw_data = normalize_data(raw_data, stat)

    # process deck: first letter of the cabin, 'U' when the cabin is missing
    deck = raw_data['Cabin'].str.get(0).fillna('U')
    raw_data['Deck'] = deck.replace(DECK_GROUPS)
    del raw_data['Cabin']

    # process FamilySize and IsAlone
    raw_data['FamilySize'] = raw_data['SibSp'] + raw_data['Parch'] + 1
    raw_data['IsAlone'] = (raw_data['FamilySize'] == 1).astype(int)
    del raw_data['SibSp']

    # Convert categorical to dummies. Every category present is encoded and
    # the reference categories are dropped by the reindex below. With
    # drop_first=True pandas drops whichever category sorts first *in this
    # input*, so a single passenger (one category per column) would lose all
    # of its indicators and be encoded as all zeros.
    df_dummies = pd.get_dummies(
        raw_data, columns=['IsAlone', 'Pclass', 'Sex', 'Deck', 'Embarked'])

    # Keep exactly the model's columns, in order; absent indicators become 0.
    df_processed = df_dummies.reindex(columns=MASTER_COLUMNS, fill_value=0)

    # Return as a PyTorch Tensor
    return torch.tensor(df_processed.values.astype('float32'))

# Training Loop
We will use SGD with momentum (`0.9`) as our optimizer, MSE as our loss function, and Xavier Uniform to initialize the weights of the linear layers. We train for `40` epochs with a learning rate of `0.001` and a batch size of `32`. The training loop is below.

In [None]:
# set hyperparameters
epochs = 40
lr = 0.001
batch_size = 32
split_idx = 600  # rows before this position train the model, the rest validate it

# fix the seed so weight init and batch shuffling are reproducible on re-run
torch.manual_seed(0)

# load data
df = pd.read_csv('../data/titanic_processed.csv', index_col=0)

# initialize training_set and validation_set
# NOTE(review): each set computes its normalization statistics from its own
# rows, so the validation split is not scaled with the training statistics.
training_set = TitanicDataSet(train=True, idx=split_idx, dataframe=df,
                              normalize_fn=normalize_data, retrieve_stats=retrieve_stats)
validation_set = TitanicDataSet(train=False, idx=split_idx, dataframe=df,
                                normalize_fn=normalize_data, retrieve_stats=retrieve_stats)

# initialize dataloaders
training_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# create model
model = MLP()

# dry run the model with a batch shaped like the real data, so the feature
# count always follows the dataset instead of a hard-coded number
# (presumably needed so lazily-shaped layers get their weights; confirm in MLP)
n_features = training_set.features.shape[1]
model(torch.zeros((batch_size, n_features)))

# go through MLP's net and init every linear layer with xavier (in place)
for layer in model.net:
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)

# create optimizer and loss function
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss_fn = nn.MSELoss()

# go through epochs
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in training_loader:
        # the dataset may hand out float64 features; the model runs in float32
        inputs = inputs.float()

        optim.zero_grad()

        outputs = model(inputs)

        # give the labels the outputs' shape so MSELoss compares element-wise
        # instead of broadcasting (batch,) against e.g. (batch, 1)
        # (assumes one output value per sample; TODO confirm against MLP)
        loss = loss_fn(outputs, labels.reshape(outputs.shape))
        loss.backward()

        optim.step()

        running_loss += loss.item()

    train_loss = running_loss / len(training_loader)  # mean loss per batch

    # evaluate on the held-out rows without tracking gradients
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for inputs, labels in validation_loader:
            outputs = model(inputs.float())
            val_running_loss += loss_fn(outputs, labels.reshape(outputs.shape)).item()
    val_loss = val_running_loss / len(validation_loader)

    print('epoch {} train loss: {:.4f} val loss: {:.4f}'.format(epoch + 1, train_loss, val_loss))
