In [62]:
!pip install LightGBM

Collecting LightGBM
  Downloading lightgbm-4.4.0-py3-none-macosx_14_0_arm64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: LightGBM
Successfully installed LightGBM-4.4.0


In [173]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# シードの固定
random_seed = 42

# データ前処理

In [193]:
# 訓練/テストデータの読込
train = pd.read_csv('../data/train.csv')
test  = pd.read_csv("../data/test.csv")

In [194]:
display(train.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 特徴エンジニアリング

In [195]:
data = pd.concat([train, test], sort=False)

data["Sex"].replace(["male", "female"], [0, 1], inplace=True)
data["Embarked"].fillna(("S"), inplace=True)
data["Embarked"] = data["Embarked"].map({"S":0, "C":1, "Q":2}).astype(int)
data["Fare"].fillna(np.mean(data["Fare"]), inplace=True)
data["Age"].fillna(data["Age"].median(), inplace=True)
data["FamilySize"] = data["Parch"] + data["SibSp"] + 1
data["IsAlone"] = 0
data.loc[data["FamilySize"] == 1, "IsAlone"] == 1

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,2,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,0
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1,0


### データ分割

In [196]:
del_cols = ["Name", "Ticket", "Cabin"]
data.drop(del_cols, axis=1, inplace=True)

train = data[:len(train)]
test  = data[len(train):]

# 訓練データとテストデータに分割
y = train.pop("Survived")
test.pop("Survived")
X = train
X_test  = test

In [197]:
# 訓練データを訓練データと検証データに分割
# ホールドアウト法
X_train, X_valid, y_train, y_valid = train_test_split(
    X.iloc[:,1:], # PassengerId 排除
    y,
    test_size=0.3,
    random_state=random_seed,
    stratify=y
)

# モデルの構築・学習
- ランダムフォレスト

## 関数

In [198]:
from sklearn.model_selection import cross_val_score

def print_acc(
    y_true: list,
    y_pred: list,
):
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    print("ホールドアウト法")
    print(f"精度: {acc}")
    print("\n交差検証（分割数:4）")
    scores = cross_val_score(model, X, y, cv=4)
    print(f"精度: {scores.mean()}")

In [199]:
"""ランダムフォレスト"""
from sklearn.ensemble import RandomForestClassifier

# インスタンス作成
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=random_seed
)

# モデルの学習
model.fit(X_train, y_train)

# 予測
y_pred = model.predict(X_valid)
y_pred = (y_pred > 0).astype(int)

print_acc(y_true=y_valid, y_pred=y_pred)


# 精度: 0.7873134328358209

ホールドアウト法
精度: 0.7873134328358209

交差検証（分割数:4）
精度: 0.8114622470003636


In [200]:
print(train.shape)
print(y_train.shape)

(891, 10)
(623,)


### LightGBM用データ分割

In [165]:
"""LightGBM
LightGBMはカテゴリカル変数の前処理が不要だからもう一度読み込む
"""
import lightgbm as lgb

# カテゴリカル変数
categorical_cols = ["Embarked", "Pclass", "Sex"]

# モデルのハイパーパラメータ
params = {
    'objective': 'binary',
    'learning_rate': 0.01,
}

lgb_train = lgb.Dataset(
    X_train,
    y_train,
    categorical_feature=categorical_cols
    )
lgb_eval = lgb.Dataset(
    X_valid,
    y_valid,
    categorical_feature=categorical_cols,
    reference=lgb_train
    )

model = lgb.train(params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
)

y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred = (y_pred > 0).astype(int)

acc = accuracy_score(y_true=y_valid, y_pred=y_pred)

print(f"精度: {acc}")

[LightGBM] [Info] Number of positive: 239, number of negative: 384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
精度: 0.3843283582089552


# Submission

In [207]:
(predictions).astype(int)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [208]:
predictions = model.predict(X_test.iloc[:,1:]).astype(int) # PassengerId 排除

output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('../data/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
