In [1]:
################################

#  Fit logistic-regression coefficients, simulate a new dataset from them, and use it to train the NN predictors

################################

# 1. import data and packages
import numpy as np
import pandas as pd

# Read CSV train data file into DataFrame
train_df = pd.read_csv("../data/kaggle_titanic/train.csv")

# Read CSV test data file into DataFrame
test_df = pd.read_csv("../data/kaggle_titanic/test.csv")

# 2. data quality check
# check missing values in train data
train_df.isnull().sum()

# data adjustment -- work on a copy so the raw frame stays available
train_data = train_df.copy()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form mutates a temporary object under
# pandas Copy-on-Write (and warns on pandas 2.x), leaving the NaNs in place.
train_data["Age"] = train_data["Age"].fillna(train_df["Age"].median(skipna=True))
# missing Embarked entries get the most frequent Embarked value
train_data["Embarked"] = train_data["Embarked"].fillna(
    train_df["Embarked"].value_counts().idxmax()
)
train_data = train_data.drop(columns="Cabin")

# double check missing values in adjusted train data
train_data.isnull().sum()

## Create categorical variable for traveling alone
# TravelAlone is 1 when SibSp + Parch is 0, otherwise 0
train_data["TravelAlone"] = np.where((train_data["SibSp"] + train_data["Parch"]) > 0, 0, 1)
train_data = train_data.drop(columns=["SibSp", "Parch"])

# create categorical variables and drop some variables
training = pd.get_dummies(train_data, columns=["Pclass", "Embarked", "Sex"])
# Sex_female is dropped so that Sex_male alone encodes the Sex column
training = training.drop(columns=["Sex_female", "PassengerId", "Name", "Ticket"])

# NOTE: final_train is an alias of `training`, not a copy
final_train = training

# apply change to test data
test_df.isnull().sum()

test_data = test_df.copy()

# Fill gaps with medians taken from the TRAIN frame. The result is assigned
# back to the column because fillna(inplace=True) on a column selection does
# not update the parent frame under pandas Copy-on-Write (and warns on 2.x).
test_data["Age"] = test_data["Age"].fillna(train_df["Age"].median(skipna=True))
test_data["Fare"] = test_data["Fare"].fillna(train_df["Fare"].median(skipna=True))
test_data = test_data.drop(columns="Cabin")

# TravelAlone is 1 when SibSp + Parch is 0, otherwise 0
test_data["TravelAlone"] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)
test_data = test_data.drop(columns=["SibSp", "Parch"])

# same dummy encoding and column drops as applied to the train frame
testing = pd.get_dummies(test_data, columns=["Pclass", "Embarked", "Sex"])
testing = testing.drop(columns=["Sex_female", "PassengerId", "Name", "Ticket"])

# NOTE: final_test is an alias of `testing`, not a copy
final_test = testing

final_test.head()

# 3. data analysis

# IsMinor flag: 1 when Age is 16 or below, 0 otherwise (same rule for both frames)
for frame in (final_train, final_test):
    frame['IsMinor'] = np.where(frame['Age'] <= 16, 1, 0)

# 4. logistic regression
from sklearn.linear_model import LogisticRegression


# model evaluation procedures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss

# feature columns fed to every model below, and the response column
Selected_features = [
    'Age', 'TravelAlone', 'Pclass_1', 'Pclass_2',
    'Embarked_C', 'Embarked_S', 'Sex_male', 'IsMinor',
]
X, y = final_train[Selected_features], final_train['Survived']

# Hold out 20% of the rows for testing.
# Changing random_state changes the accuracy scores noticeably, which is why
# a single train/test split is a high-variance estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.2
)

# baseline logistic regression fitted on the real training rows
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# hard predictions, column-1 probabilities and ROC points on the held-out rows
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)

# uncomment to report the baseline metrics
# print('Train/Test split results:')
# print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))
# print(logreg.__class__.__name__+" log_loss is %2.3f" % log_loss(y_test, y_pred_proba))
# print(logreg.__class__.__name__+" auc is %2.3f" % auc(fpr, tpr))

# get logistic regression coefficients and intercept
coef = logreg.coef_
inter = logreg.intercept_

# Seeded generator: without it the simulated dataset -- and every score
# reported from it further down -- changes on each kernel run.
SIMULATION_SEED = 2
rng = np.random.default_rng(SIMULATION_SEED)

# calculate mean and std from the input data and simulate x:
# each feature column is drawn independently from Normal(column mean, column std),
# with as many rows as X_train
train_mean = X_train.mean()
train_std = X_train.std()
X_simulated = rng.normal(train_mean, train_std, size=(len(X_train), len(train_mean)))

# calculate the output using simulated inputs and fitted coefficients
# (logistic function of the fitted linear predictor)
y_simulated = 1 / (1 + np.exp(-(X_simulated @ coef.T + inter)))

# simulate y by drawing one Bernoulli sample per row from the fitted outputs
y_round = rng.binomial(1, y_simulated)
# flatten the single-column draw into a 1-D label vector
y_trainnew = y_round.ravel()

# benchmark: refit logistic regression on the simulated rows and score it on
# the real held-out rows
lognew = LogisticRegression()
lognew.fit(X_simulated, y_trainnew)

y_pred = lognew.predict(X_test)
y_pred_proba = lognew.predict_proba(X_test)[:, 1]
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)

model_name = lognew.__class__.__name__
print('Train/Test split results of simulated data')
print(model_name+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))
print(model_name+" log_loss is %2.3f" % log_loss(y_test, y_pred_proba))
print(model_name+" auc is %2.3f" % auc(fpr, tpr))

Train/Test split results of simulated data
LogisticRegression accuracy is 0.782
LogisticRegression log_loss is 0.518
LogisticRegression auc is 0.829


In [5]:
#####################################

# basic neural network predictor

#####################################

import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset

# change train test data for neural network
train_loader_x = torch.tensor(X_simulated).float()
train_loader_y = torch.tensor(y_trainnew).float()
# Cast to float32 while still in pandas/NumPy: when the dummy columns are bool
# (pandas >= 2.0 get_dummies default) a plain `.values` on the mixed frame is
# an object array, which torch.tensor() refuses to convert.
test_loader_x = torch.tensor(X_test.to_numpy(dtype="float32"))
test_loader_y = torch.tensor(y_test.to_numpy(dtype="float32"))

# batch_size=1 is the DataLoader default, spelled out here because the
# accuracy loops in the later cells reshape each label batch with view(1)
train_loader = DataLoader(TensorDataset(train_loader_x, train_loader_y), batch_size=1)
test_loader = DataLoader(TensorDataset(test_loader_x, test_loader_y), batch_size=1)

class BasicNN(pl.LightningModule):
    """Two-layer network (8 -> 20 -> 1) with a sigmoid on the final output.

    Trained with binary cross-entropy and plain SGD (lr=0.001).
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(8, 20)
        self.linear2 = nn.Linear(20, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(linear2(linear1(x))); output has one column per row of x."""
        # NOTE(review): there is no activation between linear1 and linear2, so
        # the two layers compose into a single affine map -- confirm this is
        # intended before comparing against the deeper models.
        x = self.linear1(x)
        x = self.sigmoid(self.linear2(x))
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the BCE loss for one (inputs, labels) batch.

        training_step defines the train loop and is independent of forward.
        """
        inputs, labels = batch

        outputs = self.forward(inputs)

        # Give the labels the same (batch, 1) shape as the outputs. The former
        # view(1, -1) produced (1, batch), which only lines up when batch == 1;
        # BCELoss rejects mismatched input/target sizes for larger batches.
        labels = labels.view_as(outputs)

        criterion = nn.BCELoss()

        loss = criterion(outputs, labels)
        # Logging to TensorBoard by default
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        """Return the SGD optimizer over all parameters (lr=0.001)."""
        optimizer = torch.optim.SGD(self.parameters(), lr=0.001)
        return optimizer

# train the basic network on the simulated rows
NN = BasicNN()
trainer = pl.Trainer(max_epochs=100)
trainer.fit(NN, train_loader)

# test performance: count rounded predictions that equal the held-out labels
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        # each batch carries exactly one label; keep it as a 1-element vector
        labels = labels.view(1)

        predicted = torch.round(NN(inputs))

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_pct = 100 * correct / total
print('Accuracy of basic neural network using the simulated input: %d %%' % (
    accuracy_pct))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name    | Type    | Params
------------------------------------
0 | linear1 | Linear  | 180   
1 | linear2 | Linear  | 21    
2 | sigmoid | Sigmoid | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Accuracy of basic neural network using the simulated input: 74 %


In [8]:
#####################################

# deep neural network predictor

#####################################


class DNN(pl.LightningModule):
    """Five-layer network (8 -> 100 -> 20 -> 10 -> 5 -> 1) with sigmoid activations.

    Trained with binary cross-entropy and plain SGD (lr=0.001).
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(8, 100)
        self.sigmoid1 = nn.Sigmoid()
        self.linear2 = nn.Linear(100, 20)
        self.sigmoid2 = nn.Sigmoid()
        self.linear3 = nn.Linear(20, 10)
        self.sigmoid3 = nn.Sigmoid()
        self.linear4 = nn.Linear(10, 5)
        self.sigmoid4 = nn.Sigmoid()
        self.linear5 = nn.Linear(5, 1)

    def forward(self, x):
        """Run x through the five linear layers; the final output is sigmoid-squashed."""
        # NOTE(review): the activations are offset by one layer -- sigmoid_k is
        # applied to the output of linear_{k+1}. linear1 therefore feeds linear2
        # with no non-linearity in between (the pair acts as one affine map).
        # Confirm whether an activation after linear1 was intended.
        x = self.linear1(x)
        x = self.sigmoid1(self.linear2(x))
        x = self.sigmoid2(self.linear3(x))
        x = self.sigmoid3(self.linear4(x))
        x = self.sigmoid4(self.linear5(x))
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the BCE loss for one (inputs, labels) batch.

        training_step defines the train loop and is independent of forward.
        """
        inputs, labels = batch

        outputs = self.forward(inputs)

        # Give the labels the same (batch, 1) shape as the outputs. The former
        # view(1, -1) produced (1, batch), which only lines up when batch == 1;
        # BCELoss rejects mismatched input/target sizes for larger batches.
        labels = labels.view_as(outputs)

        criterion = nn.BCELoss()

        loss = criterion(outputs, labels)
        # Logging to TensorBoard by default
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        """Return the SGD optimizer over all parameters (lr=0.001)."""
        optimizer = torch.optim.SGD(self.parameters(), lr=0.001)
        return optimizer

# train the deep network on the simulated rows
DeepNN = DNN()
trainer = pl.Trainer(max_epochs=10)
trainer.fit(DeepNN, train_loader)

# test performance: count rounded predictions that equal the held-out labels
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        # each batch carries exactly one label; keep it as a 1-element vector
        labels = labels.view(1)

        predicted = torch.round(DeepNN(inputs))

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_pct = 100 * correct / total
print('Accuracy of deep neural network using the simulated input: %d %%' % (
    accuracy_pct))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name     | Type    | Params
-------------------------------------
0 | linear1  | Linear  | 900   
1 | sigmoid1 | Sigmoid | 0     
2 | linear2  | Linear  | 2 K   
3 | sigmoid2 | Sigmoid | 0     
4 | linear3  | Linear  | 210   
5 | sigmoid3 | Sigmoid | 0     
6 | linear4  | Linear  | 55    
7 | sigmoid4 | Sigmoid | 0     
8 | linear5  | Linear  | 6     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Accuracy of deep neural network using the simulated input: 55 %


In [12]:
from blitz.modules import BayesianLinear

class LitBayesian(pl.LightningModule):
    """Five BayesianLinear layers (input_dim -> 100 -> 20 -> 10 -> 5 -> output_dim).

    Trained with binary cross-entropy and plain SGD (lr=0.001).

    Args:
        input_dim: number of input features.
        output_dim: width of the final layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.blinear1 = BayesianLinear(input_dim, 100)
        self.sigmoid1 = nn.Sigmoid()
        self.blinear2 = BayesianLinear(100, 20)
        self.sigmoid2 = nn.Sigmoid()
        self.blinear3 = BayesianLinear(20, 10)
        self.sigmoid3 = nn.Sigmoid()
        self.blinear4 = BayesianLinear(10,5)
        self.sigmoid4 = nn.Sigmoid()
        self.blinear5 = BayesianLinear(5, output_dim)

    def forward(self, x):
        """Run x through the five Bayesian layers; the final output is sigmoid-squashed."""
        # NOTE(review): the activations are offset by one layer -- sigmoid_k is
        # applied to the output of blinear_{k+1}, so blinear1 feeds blinear2
        # with no non-linearity in between. Confirm this is intended.
        x1 = self.blinear1(x)
        x2 = self.sigmoid1(self.blinear2(x1))
        x3 = self.sigmoid2(self.blinear3(x2))
        x4 = self.sigmoid3(self.blinear4(x3))
        x5 = self.sigmoid4(self.blinear5(x4))
        return x5

    def configure_optimizers(self):
        """Return the SGD optimizer over all parameters (lr=0.001)."""
        optimizer = torch.optim.SGD(self.parameters(), lr=0.001)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Compute and log the BCE loss for one (inputs, labels) batch.

        training_step defines the train loop and is independent of forward.
        """
        inputs, labels = batch

        outputs = self.forward(inputs)

        # Give the labels the same shape as the outputs. The former view(1, -1)
        # produced (1, batch), which only lines up when batch == 1; BCELoss
        # rejects mismatched input/target sizes for larger batches.
        labels = labels.view_as(outputs)

        # NOTE(review): the loss is the data term only; no KL/complexity term
        # for the Bayesian layers is added here -- confirm that is intended.
        criterion = nn.BCELoss()

        loss = criterion(outputs, labels)
        # Logging to TensorBoard by default
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

# train the Bayesian network (8 inputs, 1 output) on the simulated rows
BayesianNN = LitBayesian(8,1)
trainer = pl.Trainer(max_epochs=10)
trainer.fit(BayesianNN, train_loader)

# test performance: count rounded predictions that equal the held-out labels
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        # each batch carries exactly one label; keep it as a 1-element vector
        labels = labels.view(1)

        predicted = torch.round(BayesianNN(inputs))

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_pct = 100 * correct / total
print('Accuracy of bayesian neural network using the simulated input: %d %%' % (
    accuracy_pct))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name     | Type           | Params
--------------------------------------------
0 | blinear1 | BayesianLinear | 3 K   
1 | sigmoid1 | Sigmoid        | 0     
2 | blinear2 | BayesianLinear | 8 K   
3 | sigmoid2 | Sigmoid        | 0     
4 | blinear3 | BayesianLinear | 840   
5 | sigmoid3 | Sigmoid        | 0     
6 | blinear4 | BayesianLinear | 220   
7 | sigmoid4 | Sigmoid        | 0     
8 | blinear5 | BayesianLinear | 24    






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…


Accuracy of bayesian neural network using the simulated input: 55 %
