# Overview
- pytorch を使って titanic の問題を解く
- data は、`15_pytorch_NN` 内にある

# Import everything I need :)

In [27]:
import random
import os
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F

# Preparation

## load data

In [2]:
# train
path = '15_pytorch_NN/train.csv'
train = pd.read_csv(path)

# test
path = '15_pytorch_NN/test.csv'
test = pd.read_csv(path)

In [3]:
target = train['Survived'].values
train = train.drop(['Survived'], axis=1)

## set

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed = 1337
seed_everything(seed=seed)

# EDA

data shape

In [5]:
print(f'data shape')
print(f'train: {train.shape}')
print(f'test:  {test.shape}')

data shape
train: (891, 11)
test:  (418, 11)


<br>
<br>
features

In [6]:
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<br>
<br>
check null

In [7]:
# train
train.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# test
test.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# FeatureEngineering

NameとTicket,Cabin はいらないだろう

<br/>
<br/>
カテゴリカル特徴量 と 数値特徴量

In [12]:
cat_cols = ['Cabin','Embarked','Name','Sex','Ticket',]
num_cols = list(set(train.columns) - set(cat_cols) - set(["Survived"]))
print(f'カテゴリカル: {cat_cols}')
print(f'数値:        {num_cols}')

カテゴリカル: ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']
数値:        ['Fare', 'PassengerId', 'Pclass', 'SibSp', 'Age', 'Parch']


<br/>
<br/>
ラベルエンコーディング

In [13]:
def encode(encoder, x):
    len_encoder = len(encoder)
    try:
        id = encoder[x]
    except KeyError:
        id = len_encoder
    return id

encoders = [{} for cat in cat_cols]

In [15]:
for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end=' ')
    encoders[i] = {l: id for id, l in enumerate(train.loc[:, cat].astype(str).unique())}
    train[cat] = train[cat].astype(str).apply(lambda x: encode(encoders[i], x))
    print('Done')

embed_sizes = [len(encoder) for encoder in encoders]

encoding Cabin ... Done
encoding Embarked ... Done
encoding Name ... Done
encoding Sex ... Done
encoding Ticket ... Done


<br>
<br>
数値特徴量を標準化

In [17]:
train[num_cols] = train[num_cols].fillna(0)
print('scaling numerical columns')

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])

scaling numerical columns


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# PyTorch

In [26]:
class CustomLinear(nn.Module):
    def __init__(self, in_features, out_features, bias=True, p=0.5):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p)
        
    def forward(self, x):
        x = self.linear(x)
        x = self.relu(x)
        x = self.drop(x)
        return x

net = nn.Sequential(CustomLinear(12, 32), nn.Linear(32, 1))

In [30]:
class Model(nn.Module):
    def __init__(self, in_features, out_features, bias=True, p=0.5):
        super(Model, self).__init__()
        self.liner = nn.Linear(in_features, out_features, bias)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p)
        
    def forward(self, x):
        x = self.linear(x)
        x = self.relu(x)
        x = self.drop(x)
        return x