## 作成した特徴量を使ってモデルを学習するぞい

In [1]:
import itertools
import dataclasses
from typing import List

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import pandas as pd

In [2]:
# Load the preprocessed train/test tables and index both by PassengerId.
TRAIN = (
    pd.read_csv("data/preprocessed_train.csv", index_col=None, header=0)
    .set_index("PassengerId")
)
TEST = (
    pd.read_csv("data/preprocessed_test.csv", index_col=None, header=0)
    .set_index("PassengerId")
)
# Show both frames (train first, then test).
display(TRAIN, TEST)

Unnamed: 0_level_0,IntSex,Pclass,Under10YearsOld,IsPair,Parch,EmbarkedAtCherboug,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,0,1,0,0,0
2,1,1,0,1,0,1,1
3,1,3,0,0,0,0,1
4,1,1,0,1,0,0,1
5,0,3,0,0,0,0,0
...,...,...,...,...,...,...,...
887,0,2,0,0,0,0,0
888,1,1,0,0,0,0,1
889,1,3,0,1,2,0,0
890,0,1,0,0,0,1,1


Unnamed: 0_level_0,IntSex,Pclass,Under10YearsOld,IsPair,Parch,EmbarkedAtCherboug
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,0,3,0,0,0,0
893,1,3,0,1,0,0
894,0,2,0,0,0,0
895,0,3,0,0,0,0
896,1,3,0,1,1,0
...,...,...,...,...,...,...
1305,0,3,0,0,0,0
1306,1,1,0,0,0,1
1307,0,3,0,0,0,0
1308,0,3,0,0,0,0


In [4]:
@dataclasses.dataclass(frozen=True)
class Result:
    """Immutable bundle of a classifier together with the frames used around it.

    Instances are built by ``train_random_forest_classifier``; the field notes
    below describe what that function stores in each field.
    """

    # Classifier instance (the fitted random forest when built by
    # train_random_forest_classifier).
    model: sklearn.base.ClassifierMixin
    # Training frame handed to the training function.
    train: pd.DataFrame
    # Test frame handed to the training function.
    test: pd.DataFrame
    # Submission frame with "PassengerId" and predicted "Survived" columns.
    submit: pd.DataFrame


@dataclasses.dataclass(frozen=True)
class CVResult:
    """Immutable container for the timing and score sequences of one cross-validation run."""

    fit_time: List[float]
    score_time: List[float]
    train_score: List[float]
    test_score: List[float]

    def to_dataframe(self) -> pd.DataFrame:
        """Return the fields as a DataFrame: one column per field, one row per element."""
        columns = dataclasses.asdict(self)
        return pd.DataFrame(data=columns)


## とりあえず学習
- train: 0.84(学習データに対するスコア)
- test: 0.76(submit した結果のスコア)

In [9]:
def train_random_forest_classifier(train: pd.DataFrame, test: pd.DataFrame) -> Result:
    """Fit a random forest on ``train`` and predict ``Survived`` for ``test``.

    Displays the feature importances, the score on the training data, and the
    submission frame as side effects.

    Args:
        train: Feature columns plus a ``Survived`` target column.
        test: Rows to predict. Its columns are passed to ``predict`` unchanged
            and its index becomes the ``PassengerId`` column of the submission.

    Returns:
        A ``Result`` holding the fitted model, the two input frames, and the
        submission frame.
    """
    # Work on the arguments, not the notebook-level TRAIN/TEST globals, so the
    # fitted model and the returned Result always describe the same data.
    features = train.drop("Survived", axis=1)
    target = train.Survived

    randomforest = RandomForestClassifier()
    randomforest.fit(features, target)

    display(randomforest.feature_importances_)
    # Score measured on the same rows the model was fitted on.
    display(randomforest.score(features, target))

    submit = pd.DataFrame({
        "PassengerId": test.index,
        "Survived": randomforest.predict(test)
    })
    display(submit)

    return Result(
        model=randomforest,
        train=train,
        test=test,
        submit=submit
    )

train_random_forest_classifier(TRAIN, TEST)

array([0.51772932, 0.21972858, 0.06583019, 0.05168945, 0.0916934 ,
       0.05332906])

0.8406285072951739

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


Result(model=RandomForestClassifier(), train=             IntSex  Pclass  Under10YearsOld  IsPair  Parch  \
PassengerId                                                   
1                 0       3                0       1      0   
2                 1       1                0       1      0   
3                 1       3                0       0      0   
4                 1       1                0       1      0   
5                 0       3                0       0      0   
...             ...     ...              ...     ...    ...   
887               0       2                0       0      0   
888               1       1                0       0      0   
889               1       3                0       1      2   
890               0       1                0       0      0   
891               0       3                0       0      0   

             EmbarkedAtCherboug  Survived  
PassengerId                                
1                             0         0  
2  

## k-fold cross-validation しよう
過学習の度合いを確認するなら、交差検証したほうが良さそう

In [11]:
def random_forest_classifier_cross_validation(
    train: pd.DataFrame,
    model: sklearn.base.ClassifierMixin,
    cv: int = 10,
) -> CVResult:
    """Cross-validate ``model`` on ``train`` and collect the resulting metrics.

    Args:
        train: Feature columns plus a ``Survived`` target column.
        model: Estimator handed to ``cross_validate``.
        cv: Forwarded to ``cross_validate`` as its ``cv`` argument.

    Returns:
        A ``CVResult`` filled with the ``fit_time``, ``score_time``,
        ``test_score`` and ``train_score`` entries of the ``cross_validate``
        result.
    """
    features = train.drop("Survived", axis=1)
    target = train.Survived
    scores = cross_validate(
        model, features, target, cv=cv, n_jobs=-1, return_train_score=True
    )
    metric_names = ("fit_time", "score_time", "test_score", "train_score")
    return CVResult(**{name: scores[name] for name in metric_names})


# Mean of every metric for a default random forest on the full feature set.
random_forest_classifier_cross_validation(
    TRAIN, RandomForestClassifier()
).to_dataframe().mean()

fit_time       0.326775
score_time     0.025678
train_score    0.841003
test_score     0.815980
dtype: float64

### 特徴量を一つ削ってみる

In [13]:
# Cross-validate once per feature subset that leaves exactly one feature out.
# Keys are the string form of the tuple of retained column names.
RESULTS_1_LESS_FEATURES = dict(
    (
        str(kept),
        random_forest_classifier_cross_validation(
            TRAIN.loc[:, [*kept, "Survived"]], RandomForestClassifier()
        ),
    )
    for kept in itertools.combinations(TEST.columns, TEST.shape[1] - 1)
)


In [14]:
# One row per retained-feature combination with its mean test/train score,
# ordered from the highest mean test score down.
(
    pd.DataFrame(
        {
            kept: cv_result.to_dataframe().describe().loc["mean"][["test_score", "train_score"]]
            for kept, cv_result in RESULTS_1_LESS_FEATURES.items()
        }
    )
    .T
    .sort_values(by="test_score", ascending=False)
)


Unnamed: 0,test_score,train_score
"('IntSex', 'Pclass', 'Under10YearsOld', 'Parch', 'EmbarkedAtCherboug')",0.812597,0.82741
"('IntSex', 'Pclass', 'Under10YearsOld', 'IsPair', 'Parch')",0.803645,0.829281
"('IntSex', 'Pclass', 'Under10YearsOld', 'IsPair', 'EmbarkedAtCherboug')",0.798015,0.822048
"('IntSex', 'Under10YearsOld', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.794632,0.811323
"('IntSex', 'Pclass', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.792409,0.822921
"('Pclass', 'Under10YearsOld', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.70829,0.734382


どの特徴量を一つ削っても、test_score は全特徴量を使った場合(約0.816)を上回らなかったので、削る必要はなさそう。

特徴量はそのまま使うことにする。

### randomforestのパラメータをチューニングしてみよう