## Binary Classification - Titanic: Machine Learning from Disaster

https://www.kaggle.com/c/titanic

- 유명한 자료인 타이타닉 데이터 셋으로 생존과 사망에 대한 분류 문제를 풀어보도록 하겠습니다.

In [55]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.functional as F
import torch.optim as optim
import numpy as np
import random
import pandas as pd

# Seed Python's and PyTorch's random number generators with a fixed value.
# NOTE: NumPy's generator is not seeded in this cell.
random.seed(777)
# Last expression of the cell: the seeded torch Generator is echoed as output.
torch.manual_seed(777)

<torch._C.Generator at 0x17866942cd0>

In [3]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import normalize

## Load Data & Preprocessing

In [81]:
import pandas as pd

# Read the Titanic train and test CSV files from the local data directory.
train_data = pd.read_csv("./data/titanic/train.csv")
test_data = pd.read_csv("./data/titanic/test.csv")

In [82]:
# Display the first 10 rows of the training frame.
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [83]:
# Use six features in total: Pclass, Sex, Age, SibSp, Parch and Fare.
# No additional feature engineering is performed.
# This is Kaggle data, so test_data has no "Survived" column;
# the predictions are submitted as a CSV file instead.

# Encode sex as an integer (male -> 1, female -> 0).
# NOTE: re-running this cell on already-encoded frames maps every value to NaN,
# so reload the CSV files before executing it a second time.
train_data["Sex"] = train_data["Sex"].map({"male": 1, "female": 0})
test_data["Sex"] = test_data["Sex"].map({"male": 1, "female": 0})

# Single source of truth for the feature columns, so the train and test
# selections cannot drift apart.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]

# Take explicit copies: these frames are modified later (missing values are
# imputed), and a copy makes it unambiguous that the original frames are not
# affected and avoids pandas' SettingWithCopyWarning.
train_X = train_data[FEATURE_COLUMNS].copy()
train_y = train_data["Survived"]

test_X = test_data[FEATURE_COLUMNS].copy()

In [84]:
# Report the missing-value counts of the features and labels, followed by the
# mean training age and the mean test fare (NaN entries are skipped).
print('Number of null:', train_X.isna().sum())
for null_report in (train_y.isna().sum(), test_X.isna().sum()):
    print("\n", 'Number of null:', null_report)
for column_mean in (train_X["Age"].mean(), test_X["Fare"].mean()):
    print("\n", column_mean)

Number of null: Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

 Number of null: 0

 Number of null: Pclass     0
Sex        0
Age       86
SibSp      0
Parch      0
Fare       1
dtype: int64

 29.69911764705882

 35.6271884892086


In [85]:
# For simplicity, missing Age and Fare values are replaced with (rounded) means.

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# fillna(30) applies to every column; only Age has missing values in train_X.
train_X = train_X.fillna(value=30)
# In the test features only Age and Fare are filled, each with its own constant.
test_X = test_X.fillna(value={"Age": 30, "Fare": 35})

In [86]:
# Confirm that no missing values remain after imputation.
train_null_counts = train_X.isna().sum()
test_null_counts = test_X.isna().sum()
print('Number of null:', train_null_counts)
print("\n", 'Number of null:', test_null_counts)

Number of null: Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64

 Number of null: Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


In [87]:
# Peek at the first few labels (0 or 1 in the Survived column).
train_y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [88]:
# The training set contains 891 samples.

len(train_X)

891

In [89]:
# Dataset classes that serve the train and test samples.

class trainData(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        X_data: Indexable, sized collection of feature rows.
        y_data: Indexable, sized collection of labels, aligned with X_data.

    Raises:
        ValueError: If X_data and y_data do not have the same length.
    """

    def __init__(self, X_data, y_data):
        # Fail fast: a length mismatch would otherwise surface only while
        # iterating (IndexError) or silently ignore the surplus labels.
        if len(X_data) != len(y_data):
            raise ValueError(
                "X_data and y_data must have the same length, got "
                f"{len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X_data)

# The test split has no labels, so it gets its own dataset class.

class testData(Dataset):
    """Map-style dataset that serves unlabeled feature rows."""

    def __init__(self, X_data):
        # Keep a reference to the feature rows; nothing is copied.
        self.X_data = X_data

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row

    def __len__(self):
        """Return the number of feature rows."""
        n_rows = len(self.X_data)
        return n_rows

In [90]:
# Preprocessing stops at normalization.

# normalize() is called with its default arguments; the sample printed in the
# next cell shows each feature row rescaled to unit length.
# NOTE: train_data / test_data are rebound here from DataFrames to Dataset objects.
train_features = np.array(normalize(train_X))
train_labels = np.array(train_y)
train_data = trainData(train_features, train_labels)

test_features = np.array(normalize(test_X))
test_data = testData(test_features)

In [91]:
# Show the first (features, label) pair; indexing dispatches to __getitem__.
train_data[0]

(array([0.12820484, 0.04273495, 0.94016879, 0.04273495, 0.        ,
        0.30982835]), 0)

## Modeling

In [115]:
# Written out somewhat verbosely for readers who are new to nn.Module classes.

class Binary_Classification(nn.Module):
    """Fully connected binary classifier with sigmoid activations.

    Architecture: 6 inputs -> 30 -> 12 -> 12 -> 6 -> 1 output. Every hidden
    layer is followed by a sigmoid and dropout (p=0.5); the output layer is
    followed by a sigmoid, so ``forward`` returns values in (0, 1).
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as before so that parameter
        # initialization consumes the random number generator identically.
        self.Layer_1 = nn.Linear(6, 30)
        self.Layer_2 = nn.Linear(30, 12)
        self.Layer_3 = nn.Linear(12, 12)
        self.Layer_4 = nn.Linear(12, 6)

        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()
        self.out = nn.Linear(6, 1)

    def forward(self, inputs):
        """Return the sigmoid output of the network for ``inputs``.

        Args:
            inputs: Float tensor whose last dimension has size 6.

        Returns:
            Tensor with last dimension 1 holding values in (0, 1). Dropout is
            applied only while the module is in training mode.
        """
        x = inputs
        for hidden in (self.Layer_1, self.Layer_2, self.Layer_3, self.Layer_4):
            x = self.dropout(self.sigmoid(hidden(x)))
        return self.sigmoid(self.out(x))

    def predict(self, test_inputs):
        """Return hard 0/1 predictions for ``test_inputs`` without dropout.

        Dropout must not be applied at test time. Instead of duplicating the
        forward pass without the dropout calls, the module is switched to
        evaluation mode (which turns nn.Dropout into the identity) and
        ``forward`` is reused, so the two code paths cannot drift apart.

        Args:
            test_inputs: Float tensor whose last dimension has size 6.

        Returns:
            Tensor of 0.0 / 1.0 values (the rounded sigmoid output). It is
            computed under ``torch.no_grad()`` and carries no autograd graph.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                probabilities = self.forward(test_inputs)
        finally:
            # Restore whichever mode the caller had the module in.
            self.train(was_training)
        return torch.round(probabilities)

## Training

In [116]:
# Train on the full training set for roughly 10,000 epochs.

EPOCHS = 10000
# Equal to the size of the training set, so every epoch is one full-batch step.
BATCH_SIZE = 891

model = Binary_Classification()

# The loss is binary cross entropy, computed on the model's sigmoid output.
criterion = nn.BCELoss()

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Make the mode explicit so dropout is active even if this cell is re-run
# after the model was switched to evaluation mode.
model.train()

for epoch in range(EPOCHS):
    for X_batch, y_batch in train_loader:

        # The loader already yields tensors; torch.autograd.Variable is
        # deprecated and wrapping is unnecessary. Only cast to float32 and give
        # the targets a trailing dimension to match the model output.
        inputs = X_batch.float()
        targets = y_batch.float().unsqueeze(1)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        y_pred = model(inputs)

        loss = criterion(y_pred, targets)
        loss.backward()

        optimizer.step()

    # Log the loss of the last batch every 1000 epochs.
    if epoch % 1000 == 0:
        print(loss)

tensor(0.7469, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6688, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6409, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6201, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6221, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6248, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6192, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6064, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6014, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.5979, grad_fn=<BinaryCrossEntropyBackward>)


## Test

In [117]:
# Store the predictions for the test data in test_y_pred.

test_loader = DataLoader(dataset=test_data, batch_size=1)
test_y_pred = []

# Inference only: no_grad() avoids building an autograd graph per sample.
with torch.no_grad():
    for X_batch in test_loader:

        # The loader already yields a tensor; the deprecated Variable wrapper
        # is not needed, only the cast to float32.
        inputs = X_batch.float()
        y_pred = model.predict(inputs)
        # batch_size=1, so each prediction is a single element.
        test_y_pred.append(int(y_pred.item()))

In [118]:
# Load gender_submission.csv; its Survived column is overwritten with the
# model's predictions in a later cell.
submit_data = pd.read_csv("./data/titanic/gender_submission.csv")
submit_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [119]:
# Number of predictions collected for the test set.
len(test_y_pred)

418

In [120]:
# Replace the Survived column with the model's predictions.
submit_data["Survived"] = test_y_pred

In [121]:
# This model classified 195 of the 418 passengers as survivors.

print(submit_data["Survived"].sum())

195


In [122]:
# Export the submission as a CSV file (without the index column).
submit_data.to_csv("./data/titanic/submit_best.csv", index=False)