# Overview
- pytorch を使って titanic の問題を解く
- data は、`15_pytorch_NN` 内にある
- ref: 
    > https://kaeru-nantoka.hatenablog.com/entry/2019/03/30/003112

# Import everything I need :)

In [2]:
import random
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
from fastprogress import progress_bar

# Preparation

cuda?

In [3]:
is_cuda = torch.cuda.is_available()
is_cuda

False

## load data

In [4]:
# train
path = '15_pytorch_NN/train.csv'
train = pd.read_csv(path)

# test
path = '15_pytorch_NN/test.csv'
test = pd.read_csv(path)

In [5]:
target = train['Survived'].values
train = train.drop(['Survived'], axis=1)

## set

In [6]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed = 1337
seed_everything(seed=seed)

# EDA

data shape

In [7]:
print(f'data shape')
print(f'train: {train.shape}')
print(f'test:  {test.shape}')

data shape
train: (891, 11)
test:  (418, 11)


<br>
<br>
features

In [8]:
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<br>
<br>
check null

In [9]:
# train
train.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# test
test.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# FeatureEngineering

NameとTicket,Cabin はいらないだろう

<br/>
<br/>
カテゴリカル特徴量 と 数値特徴量

In [11]:
cat_cols = ['Cabin','Embarked','Name','Sex','Ticket',]
num_cols = list(set(train.columns) - set(cat_cols) - set(["Survived"]))
print(f'カテゴリカル: {cat_cols}')
print(f'数値:        {num_cols}')

カテゴリカル: ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']
数値:        ['Pclass', 'Age', 'Fare', 'SibSp', 'PassengerId', 'Parch']


<br/>
<br/>
ラベルエンコーディング

In [12]:
def encode(encoder, x):
    len_encoder = len(encoder)
    try:
        id = encoder[x]
    except KeyError:
        id = len_encoder
    return id

encoders = [{} for cat in cat_cols]

In [13]:
for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end=' ')
    encoders[i] = {l: id for id, l in enumerate(train.loc[:, cat].astype(str).unique())}
    train[cat] = train[cat].astype(str).apply(lambda x: encode(encoders[i], x))
    test[cat]  =  test[cat].astype(str).apply(lambda x: encode(encoders[i], x))
    print('Done')

embed_sizes = [len(encoder) for encoder in encoders]

encoding Cabin ... Done
encoding Embarked ... Done
encoding Name ... Done
encoding Sex ... Done
encoding Ticket ... Done


<br>
<br>
数値特徴量を標準化

In [14]:
train[num_cols] = train[num_cols].fillna(0)
test[num_cols]  =  test[num_cols].fillna(0)
print('scaling numerical columns')

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

scaling numerical columns


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


<br>
<br>
学習データ・セット

In [15]:
X = train
y = target
X_test = test

# PyTorch

In [36]:
# ----- set params -----
n_splits = 3
batch_size = 20
train_epochs = 15
VERBOSE = 4
# -----------------------

In [37]:
# sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# analysis
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in progress_bar([i * 0.01 for i in range(100)]):
        score = accuracy_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'accuracy_score': best_score}
    return search_result

# Model
class Model(nn.Module):
    def __init__(self, in_features, out_features, bias=True, p=0.5):
        super(Model, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p)
        
    def forward(self, x):
        x = self.linear(x)
        x = self.relu(x)
        x = self.drop(x)
        return x
# net = Model(11, 1)

In [38]:
n_splits = 3
folds = KFold(n_splits=n_splits, shuffle=True)

In [41]:
# Kfold のループ部分
train_preds = np.zeros((len(X)))
test_preds = np.zeros((len(X_test)))
for i, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    # X, y, X_val, y_val をテンソル化(PyTorch で扱える形に変換)し、 .cuda() (GPUで計算するために特徴量を GPU に渡す処理)をする。
    X_train_fold = torch.tensor(X.iloc[train_idx, :].values, dtype=torch.float32)
    X_val_fold   = torch.tensor(X.iloc[valid_idx, :].values, dtype=torch.float32)
    X_test_      = torch.tensor(X_test.iloc[:,:].values, dtype=torch.float32) 
    y_train_fold = torch.tensor(y[train_idx, np.newaxis], dtype=torch.float32)
    y_val_fold   = torch.tensor(y[valid_idx, np.newaxis], dtype=torch.float32)
    
    # model を呼び出して、
    model = Model(11, 1)
    
    # gpu 使えるならcudaに渡す
    if is_cuda:
        X_train_fold.cuda()
        y_train_fold.cuda()
        X_val_fold.cuda()
        y_val_fold.cuda()
    
    # loss 関数を呼び出す。BCELoss() よりも好まれるらしい。。
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")
    optimizer = torch.optim.Adam(model.parameters())
    
    # dataloader で扱える形( = Dataset )にする
    train_ = torch.utils.data.TensorDataset(X_train_fold, y_train_fold)
    valid_ = torch.utils.data.TensorDataset(X_val_fold, y_val_fold)
    test_  = torch.utils.data.TensorDataset(X_test_)
    
    # X_train_fold batch_size個, y_train_fold batch_size個ずつを各ループで返す iterater の定義
    train_loader = torch.utils.data.DataLoader(train_, batch_size=batch_size, shuffle=True)
    # X_valid_fold batch_size個, y_valid_fold batch_size個ずつを各ループで返す iterater の定義
    valid_loader = torch.utils.data.DataLoader(valid_, batch_size=batch_size, shuffle=False)
    # X_valid_fold batch_size個, y_valid_fold batch_size個ずつを各ループで返す iterater の定義
    test_loader = torch.utils.data.DataLoader(test_, batch_size=batch_size, shuffle=False)
    
    print('-'*50)
    print(f'- Fold {i + 1}/{n_splits}')
    
    # epoch 分のループを回す
    for epoch in range(train_epochs):
        start_time = time.time()
        
        # model を train mode にする
        model.train()
        avg_loss = 0.

        # X_train_fold と y_train_fold を batch_size 個ずつ渡すループ
        for X_batch, y_batch in train_loader:
            # predict
            y_pred = model.forward(X_batch)
            # loss の計算
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        
        # evaluate
        if ((epoch+1)%VERBOSE==0) or (epoch+1==train_epochs):
            model.eval()
            valid_preds_fold = np.zeros((X_val_fold.size(0)))
            test_preds_fold = np.zeros(len(X_test_))
            avg_val_loss = 0.
            for i, (X_batch, y_batch) in enumerate(valid_loader):
                y_pred = model(X_batch).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

            elapsed_time = time.time() - start_time 
            print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, train_epochs, avg_loss, avg_val_loss, elapsed_time))

    # X_test_fold を batch_size ずつ渡すループ    
    for i, (X_batch,) in enumerate(test_loader):
        y_pred = model(X_batch).detach()

        # batch_size のリストのリストになっているのを単一階層のリストに変換して、cpuに値を渡し、テンソルから numpy.array()に変換したものを sigmoid 関数に渡す
        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold

    # 予測値の kfold数で割った値を加える
    test_preds += test_preds_fold / n_splits

--------------------------------------------------
- Fold 1/3
Epoch 4/15 	 loss=13.7243 	 val_loss=13.7243 	 time=0.02s
Epoch 8/15 	 loss=13.7243 	 val_loss=13.7243 	 time=0.01s
Epoch 12/15 	 loss=13.7243 	 val_loss=13.7243 	 time=0.01s
Epoch 15/15 	 loss=13.7243 	 val_loss=13.7243 	 time=0.02s
--------------------------------------------------
- Fold 2/3
Epoch 4/15 	 loss=13.9304 	 val_loss=13.8568 	 time=0.02s
Epoch 8/15 	 loss=13.8028 	 val_loss=13.8413 	 time=0.01s
Epoch 12/15 	 loss=13.8453 	 val_loss=13.8233 	 time=0.01s
Epoch 15/15 	 loss=13.8032 	 val_loss=13.8095 	 time=0.01s
--------------------------------------------------
- Fold 3/3
Epoch 4/15 	 loss=13.7069 	 val_loss=13.7194 	 time=0.02s
Epoch 8/15 	 loss=13.7088 	 val_loss=13.7291 	 time=0.02s
Epoch 12/15 	 loss=13.7117 	 val_loss=13.7244 	 time=0.01s
Epoch 15/15 	 loss=13.6937 	 val_loss=13.7310 	 time=0.01s


In [42]:
search_result = threshold_search(y, train_preds)
search_result

{'threshold': 0.79, 'accuracy_score': 0.6161616161616161}