# Preparation

## Load Module

In [181]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import statsmodels.stats.anova as anova
from scipy import stats as st
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

## Read Data

In [182]:
# Kaggle input directory for the Titanic competition; both CSV paths are
# derived from it so the location only has to be changed in one place.
DIR = "../input/titanic"
TRAIN = f"{DIR}/train.csv"
TEST = f"{DIR}/test.csv"

In [183]:
# Read both splits and tag every row with its origin so the frames can be
# preprocessed together and separated again afterwards.
train = pd.read_csv(TRAIN).assign(label="train")
test = pd.read_csv(TEST).assign(label="test")

# Stack train on top of test with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

In [184]:
# Summarise the combined frame: per-column dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  label        1309 non-null   object 
dtypes: float64(3), int64(4), object(6)
memory usage: 133.1+ KB


# Preprocessing

In [185]:
# Encode and impute the combined train+test frame.
#
# Every step assigns its result back to the column rather than calling an
# ``inplace=True`` method on ``data[col]``: that chained form operates on a
# temporary object under pandas Copy-on-Write and would leave ``data``
# untouched (the corresponding warning is silenced by the notebook's
# ``warnings.filterwarnings('ignore')``).
#
# NOTE: this cell is not idempotent. Re-running it after the string columns
# have been encoded maps the numeric codes to NaN and ``astype(int)`` raises;
# re-run from the data-loading cell instead.

# male -> 0, female -> 1. Any other label becomes NaN and makes astype(int)
# raise instead of passing through silently.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Missing ports are filled with 'S' before the integer encoding.
data['Embarked'] = (
    data['Embarked']
    .fillna('S')
    .map({'S': 0, 'C': 1, 'Q': 2})
    .astype(int)
)

# Fare is imputed with the mean and Age with the median, both computed over
# the combined train+test frame.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())

# FamilySize = Parch + SibSp + 1; IsAlone flags a family size of exactly 1.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

In [186]:
# Name of the column the models predict.
target = "Survived"

# Undo the concat: recover each split through the origin tag stored in "label".
train = data.loc[data["label"].eq("train")]
test = data.loc[data["label"].eq("test")]

# Modeling

基本的な前処理を行った状態で、以下の9個の変数が存在する。

Pclass    : 乗客のクラス<br>
Sex       : 性別<br>
Age       : 年齢<br>
SibSp     : 同乗している兄弟姉妹・配偶者の数<br>
Parch     : 同乗している親・子供の数<br>
FamilySize: 本人を含む家族の人数(Parch + SibSp + 1)<br>
IsAlone   : 一人かどうか<br>
Fare      : 運賃<br>
Embarked  : 出港場所<br>

このとき、<br>
パターンA: 変数Embarkedを除く<br>
パターンB: 何も除かない<br>

でモデルの比較を行う。

## Helper

In [187]:
def lgb_train(X, y, cat_feat):
    """Fit a binary LightGBM model on one half of (X, y) and score it on the other.

    The data is split 50/50 with ``random_state=0``. Training runs for at most
    1000 boosting rounds and stops early once the validation metric has not
    improved for 10 rounds; evaluation results are logged every 10 rounds.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``lgb.Dataset``.
    y : target values aligned with ``X``.
    cat_feat : list of column names passed to LightGBM as ``categorical_feature``.

    Returns
    -------
    (estimator, y_pred)
        The trained booster and its predictions for the validation half.
        The validation log loss is printed as a side effect.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, random_state=0, train_size=0.5
    )
    # Local names deliberately differ from the function name to avoid shadowing it.
    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=cat_feat)
    dvalid = lgb.Dataset(X_valid, y_valid, categorical_feature=cat_feat)
    params = {
        'objective': 'binary'
    }

    # Early stopping and periodic logging are configured through callbacks:
    # the ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments are no
    # longer accepted by ``lgb.train`` in LightGBM 4.x. ``log_evaluation`` is
    # the current name of the logging callback; older releases expose it as
    # ``print_evaluation``, so fall back to that when needed.
    make_log_callback = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(stopping_rounds=10),
        make_log_callback(period=10),
    ]

    estimator = lgb.train(
        params, dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=1000,
        callbacks=callbacks,
    )
    y_pred = estimator.predict(X_valid)
    logloss = log_loss(y_valid, y_pred)

    print(f"logloss: {logloss}")
    return estimator, y_pred

In [188]:
def get_valid(X, y):
    """Return the validation-half targets of the fixed 50/50 split.

    Repeats ``train_test_split`` with ``random_state=0`` and ``train_size=0.5``
    and keeps only the last of the four returned pieces (the held-out ``y``).
    """
    pieces = train_test_split(X, y, random_state=0, train_size=0.5)
    # Order of the pieces: X_train, X_valid, y_train, y_valid.
    return pieces[-1]

## Training A

In [191]:
# Pattern A: every engineered feature except Embarked.
features = [
    "Pclass", "Sex", "Age", "SibSp", "Parch",
    "Fare", "FamilySize", "IsAlone",
]

X_A = train.loc[:, features]
X_A_test = test.loc[:, features]
y_A = train[target].astype(int)

In [192]:
# Pattern A has no Embarked column, so only Pclass and Sex are categorical.
cat_feat = ["Pclass", "Sex"]
# Fit on the pattern-A frame and keep the results under the A names;
# pred_A is needed later when the two models' predictions are compared.
model_A, pred_A = lgb_train(X_A, y_A, cat_feat)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.455479	valid_1's binary_logloss: 0.471895
[20]	training's binary_logloss: 0.37142	valid_1's binary_logloss: 0.420431
[30]	training's binary_logloss: 0.327665	valid_1's binary_logloss: 0.405082
[40]	training's binary_logloss: 0.295529	valid_1's binary_logloss: 0.414988
Early stopping, best iteration is:
[30]	training's binary_logloss: 0.327665	valid_1's binary_logloss: 0.405082
logloss: 0.4050821501383518


## Training B

In [193]:
# Pattern B: the full feature set, including Embarked.
features = [
    "Pclass", "Sex", "Age", "SibSp", "Parch",
    "Fare", "Embarked", "FamilySize", "IsAlone",
]

X_B = train.loc[:, features]
X_B_test = test.loc[:, features]
y_B = train[target].astype(int)

In [194]:
# Pattern B additionally treats Embarked as a categorical feature.
cat_feat = ["Sex", "Pclass", "Embarked"]
model_B, pred_B = lgb_train(X=X_B, y=y_B, cat_feat=cat_feat)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.453581	valid_1's binary_logloss: 0.470448
[20]	training's binary_logloss: 0.370346	valid_1's binary_logloss: 0.41895
[30]	training's binary_logloss: 0.325687	valid_1's binary_logloss: 0.407029
[40]	training's binary_logloss: 0.29136	valid_1's binary_logloss: 0.41013
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.314879	valid_1's binary_logloss: 0.404389
logloss: 0.40438943920917825


# Check Significance

変数が増えれば、理論上は(少なくとも学習データに対する)当てはまりはよくなるため、logloss(0に近いほど良い)は

A > B

である。

AとBの予測値を比較し、その差が誤差の範囲(つまり偶然)と判定された場合は変数の少ないAを採択し、有意な差があると判定された場合はBを採択する。

In [195]:
# Validation-half labels; both models were scored on this same split
# (same rows, same random_state).
true = get_valid(X_A, y_A)

# Put labels and both models' predictions side by side, aligned by position:
# the labels' original index is dropped so all three columns share 0..n-1.
true_pred = pd.concat(
    {
        "TRUE": true.reset_index(drop=True),
        "PRED_A": pd.Series(pred_A),
        "PRED_B": pd.Series(pred_B),
    },
    axis=1,
)

## 正規性の確認

In [196]:
# Shapiro-Wilk normality check on each model's predictions.
# Index [1] of the result is the p-value (index [0] is the W statistic).
print(f"W_A: {st.shapiro(true_pred['PRED_A'])[1]}")
print(f"W_B: {st.shapiro(true_pred['PRED_B'])[1]}")

W_A: 4.2599944483290774e-16
W_B: 1.4202434691545594e-18


いずれの予測値でもShapiro-Wilk検定のp値(上の出力の W_A, W_B)が0.05を下回っているため、正規性は棄却される。そのため、以下ではノンパラメトリックなWilcoxonの符号順位検定で比較する。

In [197]:
# Paired Wilcoxon signed-rank test between the two models' validation predictions.
st.wilcoxon(true_pred["PRED_A"],true_pred["PRED_B"])

WilcoxonResult(statistic=48391.0, pvalue=0.5945845228685844)

「モデルAとモデルBの予測値に差がない」という帰無仮説が棄却できなかった(p ≈ 0.59 > 0.05)ので、変数の少ないモデルAを採用する。