In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

# Load the training data and the evaluation data; both CSV files are expected
# in the current working directory.
trainRaw, evalRaw = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

 # trainRaw.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [2]:
from sklearn.model_selection import train_test_split

def normalizer(data):
    """Rescale every column of ``data`` to the [0, 1] range with min-max scaling.

    Scaling values such as the cost of a ticket puts all features on a
    comparable scale, which gives more stable predictions.

    Args:
        data: DataFrame whose columns can all be scaled numerically.

    Returns:
        A new DataFrame of scaled values with the same columns and the same
        index as ``data``.

    Note:
        A fresh ``MinMaxScaler`` is fitted on every call, so two frames scaled
        by separate calls each use their own per-column min/max.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(data.values)

    # Carry the original index over so the result still lines up by label with
    # anything else derived from ``data`` (e.g. an unscaled target column).
    return pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

def clean(data):
    """Turn a raw passenger table into normalized model inputs.

    Drops the identifier / free-text columns, one-hot encodes ``Sex`` and
    ``Embarked``, replaces missing values with the column mean and min-max
    scales the features through ``normalizer``.

    Returns:
        ``(features, target)`` when a ``Survived`` column is present (the
        target itself is not scaled), otherwise only the scaled features.
    """
    unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
    categorical_columns = ["Sex", "Embarked"]

    encoded = pd.get_dummies(data.drop(columns=unused_columns), columns=categorical_columns)
    filled = encoded.fillna(encoded.mean())

    # Without a target column there is nothing to split off.
    if "Survived" not in filled.columns:
        return normalizer(filled)

    features = normalizer(filled.drop(columns=["Survived"]))
    target = filled["Survived"]
    return features, target


 # cols = ["Sex", "Embarked"]
    
 # for col in cols:
 #   trainData[col] = pre.fit_transform(trainData[col])
 #   testData[col] = pre.transform(testData[col])
 #   print(pre.classes_)


# Clean both datasets; clean() returns (features, target) for the training
# data and features only for the evaluation data.
t_x, t_y = clean(trainRaw)
eval_x = clean(evalRaw)

# Split the labelled rows into train (80%) and test (20%). A fixed seed keeps
# the split identical across kernel restarts.
SPLIT_SEED = 42
trn_X, tst_X, trn_y, tst_y = train_test_split(
    t_x, t_y, test_size=0.2, random_state=SPLIT_SEED
)

# Convert everything to float32 tensors (targets included, since the loss is
# computed against float probabilities).
train_X = torch.tensor(trn_X.to_numpy(), dtype=torch.float32)
train_y = torch.tensor(trn_y.to_numpy(), dtype=torch.float32)
test_X = torch.tensor(tst_X.to_numpy(), dtype=torch.float32)
test_y = torch.tensor(tst_y.to_numpy(), dtype=torch.float32)

eval_X = torch.tensor(eval_x.to_numpy(), dtype=torch.float32)

 # print(t_x.head(1))
 # print(train_X[1])

   Pclass       Age  SibSp  Parch      Fare  Sex_female  Sex_male  Embarked_C  \
0     1.0  0.271174  0.125    0.0  0.014151         0.0       1.0         0.0   

   Embarked_Q  Embarked_S  
0         0.0         1.0  
tensor([1.0000, 0.3591, 0.0000, 0.0000, 0.0154, 0.0000, 1.0000, 0.0000, 0.0000,
        1.0000])


In [3]:
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    """Single-hidden-layer binary classifier.

    Maps 10 input features through a 100-unit ReLU layer with dropout to one
    sigmoid output, i.e. a probability in [0, 1] per input row.
    """

    def __init__(self):
        super().__init__()

        self.lay1 = nn.Linear(10, 100)
        self.lay2 = nn.Linear(100, 1)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: float tensor whose last dimension has 10 features.

        Returns:
            Tensor with a trailing dimension of size 1 holding sigmoid outputs.
        """
        x = F.relu(self.lay1(x))
        # The functional dropout defaults to training=True and ignores
        # model.eval(); tie it to the module's mode so it is only active
        # while training.
        x = F.dropout(x, p=0.1, training=self.training)
        return torch.sigmoid(self.lay2(x))


model = Network()

 # print(model)
 # len(model(train_X))

Network(
  (lay1): Linear(in_features=10, out_features=100, bias=True)
  (lay2): Linear(in_features=100, out_features=1, bias=True)
)


In [4]:
from sklearn.utils import shuffle
from torch.autograd import Variable

    
# SGD over all of the model's parameters with a fixed learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr = 0.05)
# Binary cross-entropy on probabilities: the network's forward pass already
# applies the sigmoid, so the loss receives values in [0, 1].
loss_fn = nn.BCELoss()
 # loss_fn = nn.BCEWithLogitsLoss()

def accuracy_fn(y_true, y_prediction):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Both arguments are compared element-wise with ``torch.eq``; the
    denominator is the length of ``y_prediction``.
    """
    n_matching = torch.eq(y_true, y_prediction).sum().item()
    n_total = len(y_prediction)
    return (n_matching / n_total) * 100
    
    
# Number of passes of the training loop below.
epochs = 115

# Reorder the training rows; features and labels are shuffled together so
# they stay aligned.
train_X, train_y = shuffle(train_X, train_y)
 # train_X = Variable(torch.FloatTensor(train_X))

 # train_losses, test_losses = [0] * epochs, [0] * epochs
 # accuracy = [0] * epochs


 # print(torch.Tensor.size(model(train_X)))

 # x = model(train_X).squeeze()
 # print(torch.Tensor.size(x))


for e in range(epochs):
    # --- Training step on the full training set ---
    # Enter train mode explicitly so the step does not depend on whichever
    # mode the model was left in before this cell ran.
    model.train()
    optimizer.zero_grad()
    y_pred_raw = model(train_X).squeeze()
    y_pred = torch.round(y_pred_raw)  # probabilities -> hard 0/1 labels
    loss = loss_fn(y_pred_raw, train_y)
    loss.backward()
    optimizer.step()
    acc = accuracy_fn(train_y, y_pred)

    # --- Evaluation on the held-out split (no gradient tracking) ---
    # Because this is the last mode switch of the iteration, the model is in
    # eval mode when the loop finishes.
    model.eval()
    with torch.no_grad():
        y_test_raw = model(test_X).squeeze()
        y_test = torch.round(y_test_raw)
        test_loss = loss_fn(y_test_raw, test_y)
        test_acc = accuracy_fn(test_y, y_test)

    # Report progress every 10th epoch.
    if e % 10 == 0:
        print(f"Epoch: {e} | Loss: {loss.item():.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss.item():.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69776, Accuracy: 50.42% | Test loss: 0.70157, Test acc: 54.19%
Epoch: 10 | Loss: 0.64558, Accuracy: 62.78% | Test loss: 0.65823, Test acc: 56.98%
Epoch: 20 | Loss: 0.61993, Accuracy: 62.78% | Test loss: 0.63286, Test acc: 56.98%
Epoch: 30 | Loss: 0.60128, Accuracy: 63.06% | Test loss: 0.61132, Test acc: 56.98%
Epoch: 40 | Loss: 0.58794, Accuracy: 63.90% | Test loss: 0.59676, Test acc: 58.66%
Epoch: 50 | Loss: 0.57477, Accuracy: 68.68% | Test loss: 0.56895, Test acc: 66.48%
Epoch: 60 | Loss: 0.56221, Accuracy: 73.31% | Test loss: 0.56166, Test acc: 70.39%
Epoch: 70 | Loss: 0.54833, Accuracy: 76.54% | Test loss: 0.54376, Test acc: 79.33%
Epoch: 80 | Loss: 0.54400, Accuracy: 78.51% | Test loss: 0.52488, Test acc: 84.36%
Epoch: 90 | Loss: 0.53636, Accuracy: 79.63% | Test loss: 0.52139, Test acc: 82.12%
Epoch: 100 | Loss: 0.52702, Accuracy: 78.79% | Test loss: 0.50488, Test acc: 82.12%
Epoch: 110 | Loss: 0.52051, Accuracy: 79.07% | Test loss: 0.49736, Test acc: 83.24%


In [5]:
#print(test_y[100])
#print(test_X[100])

#print(torch.round(model(test_X[10].squeeze())))

tensor(1.)
tensor([0.5000, 0.2209, 0.0000, 0.1667, 0.0449, 1.0000, 0.0000, 0.0000, 0.0000,
        1.0000])
tensor([1.], grad_fn=<RoundBackward0>)


In [7]:
# Make sure the model is in eval mode before predicting, regardless of the
# mode earlier cells left it in, so train-only behaviour is switched off.
model.eval()
with torch.no_grad():
    results_raw = model(eval_X).squeeze()
    # Round the predicted probabilities to hard 0/1 labels.
    results = torch.round(results_raw)
    
 # print(results.size())

torch.Size([418])


In [16]:
# Build the submission table. The id column keeps the dataset's own
# "PassengerId" spelling, and the predictions are stored as integer 0/1
# labels rather than floats.
submission = pd.DataFrame({"PassengerId" : evalRaw["PassengerId"],
                           "Survived" : results.numpy().astype(int)})

submission.head()
submission.describe()

Unnamed: 0,PassengerID,Survived
count,418.0,418.0
mean,1100.5,0.339713
std,120.810458,0.474179
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [18]:
# Write the predictions to disk; index=False keeps the DataFrame's row index
# out of the CSV so only the submission columns are written.
submission.to_csv('titanicML_submission.csv', index=False)