## Binary Classification - Titanic: Machine Learning from Disaster

https://www.kaggle.com/c/titanic

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.functional as F
import torch.optim as optim
import numpy as np
import random

random.seed(777)
torch.manual_seed(777)

<torch._C.Generator at 0x2a1c1bf9a50>

In [2]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import normalize

## Load Data & Preprocessing

In [3]:
import pandas as pd

train_data = pd.read_csv("./Titanic/train.csv")
test_data = pd.read_csv("./Titanic/test.csv")
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [4]:
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S
5,6,0,3,"Moran, Mr. James",male,0.0,0,0,330877,8.4583,0,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,0,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,0,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,0,C


In [5]:
# Use Pclass, Sex, Age, SibSp, Parch, Fare Columns

train_data["Sex"] = train_data["Sex"].map({"male": 1, "female": 0})
test_data["Sex"] = test_data["Sex"].map({"male": 1, "female": 0})

train_X = train_data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
train_y = train_data["Survived"]

test_X = test_data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

In [6]:
train_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


In [7]:
train_y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [8]:
len(train_X)

891

In [9]:
class trainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    
class testData(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [10]:
train_data = trainData(np.array(train_X), np.array(train_y))
test_data = testData(np.array(test_X))

# train_data = trainData(np.array(normalize(train_X)), np.array(train_y))
# test_data = testData(np.array(normalize(test_X)))

In [11]:
train_data.__getitem__(0)

(array([ 3.  ,  1.  , 22.  ,  1.  ,  0.  ,  7.25]), 0)

## Modeling

In [12]:
class Binary_Classification(nn.Module):
    def __init__(self, num_feature):
        super(Binary_Classification, self).__init__()
        
        self.Layer_1 = nn.Sequential(
                                nn.Linear(num_feature, 30),
                                nn.Sigmoid()
                            )
        self.Layer_2 = nn.Sequential(
                                nn.Linear(30, 12),
                                nn.Sigmoid()
                            )

        self.FC1 = nn.Sequential(
                                nn.Linear(12, 12),
                                nn.Sigmoid()
                            )

        self.FC2 = nn.Sequential(
                                nn.Linear(12, 6),
                                nn.Sigmoid()
                            )

        
        self.dropout = nn.Dropout(0.5)
        
        self.out = nn.Sequential(
                                nn.Linear(6, 1),
                                nn.Sigmoid()
                            )
        
    def forward(self, inputs):
        x = self.Layer_1(inputs)
        x = self.dropout(x)
        x = self.Layer_2(x)
        x = self.dropout(x)
        x = self.FC1(x)
        x = self.FC2(x)
        x = self.out(x)
        return x
    
    def predict(self, test_inputs): #test할 때는 dropout하면 안된다.
        x = self.Layer_1(test_inputs)
        x = self.Layer_2(x)
        x = self.FC1(x)
        x = self.FC2(x)
        x = self.out(x)
        return torch.round(x)

## Training

In [13]:
EPOCHS = 10000
BATCH_SIZE = 891
FEATURE_SIZE = 6

model = Binary_Classification(FEATURE_SIZE)
criterion = nn.BCELoss()
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    for X_batch, y_batch in train_loader:
        inputs = Variable(torch.Tensor(X_batch.float()))
        targets = Variable(torch.Tensor(y_batch.float()))
        model.zero_grad()
        y_pred = model(inputs)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()
    
    if epoch % 1000 == 0:
        print(loss)

  "Please ensure they have the same size.".format(target.size(), input.size()))


tensor(0.6716)
tensor(0.4838)
tensor(0.4521)
tensor(0.4757)
tensor(0.4551)
tensor(0.4305)
tensor(0.4495)
tensor(0.4211)
tensor(0.4270)
tensor(0.4282)


## Test

In [14]:
test_loader = DataLoader(dataset=test_data, batch_size=1)
test_y_pred = []
for X_batch in test_loader:
    inputs = Variable(torch.Tensor(X_batch.float()))
    y_pred = model.predict(inputs)
    test_y_pred.append(int(y_pred.item()))

In [15]:
submit_data = pd.read_csv("./Titanic/gender_submission.csv")
submit_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [16]:
len(test_y_pred)

418

In [17]:
submit_data["Survived"] = test_y_pred

In [18]:
print(sum(submit_data["Survived"]))

144


In [19]:
submit_data.to_csv("./Titanic/submit_best.csv", index=False)