## 作成した特徴量を使ってモデルを学習するぞい

In [1]:
from __future__ import annotations
import itertools
import dataclasses
from typing import List

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import pandas as pd

In [2]:
# Load the preprocessed train/test feature tables and index them by PassengerId.
# index_col=None and header=0 match read_csv's default behaviour, so they are omitted.
TRAIN = pd.read_csv("data/preprocessed_train.csv").set_index("PassengerId")
TEST = pd.read_csv("data/preprocessed_test.csv").set_index("PassengerId")
display(TRAIN, TEST)

Unnamed: 0_level_0,IntSex,Pclass,Under10YearsOld,IsPair,Parch,EmbarkedAtCherboug,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,0,1,0,0,0
2,1,1,0,1,0,1,1
3,1,3,0,0,0,0,1
4,1,1,0,1,0,0,1
5,0,3,0,0,0,0,0
...,...,...,...,...,...,...,...
887,0,2,0,0,0,0,0
888,1,1,0,0,0,0,1
889,1,3,0,1,2,0,0
890,0,1,0,0,0,1,1


Unnamed: 0_level_0,IntSex,Pclass,Under10YearsOld,IsPair,Parch,EmbarkedAtCherboug
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,0,3,0,0,0,0
893,1,3,0,1,0,0
894,0,2,0,0,0,0
895,0,3,0,0,0,0
896,1,3,0,1,1,0
...,...,...,...,...,...,...
1305,0,3,0,0,0,0
1306,1,1,0,0,0,1
1307,0,3,0,0,0,0
1308,0,3,0,0,0,0


In [3]:
@dataclasses.dataclass(frozen=True)
class Result:
    """Immutable bundle of a classifier, the data it was used with, and its predictions.

    Instances are built by ``train_random_forest_classifier``; ``frozen=True``
    prevents the fields from being reassigned afterwards.
    """
    # Classifier instance (fitted by the function that builds this object).
    model: sklearn.base.ClassifierMixin
    # Training table as passed by the caller.
    train: pd.DataFrame
    # Test table as passed by the caller.
    test: pd.DataFrame
    # Submission frame with "PassengerId" and "Survived" columns.
    submit: pd.DataFrame


@dataclasses.dataclass(frozen=True)
class Score:
    """Train/test values of a single named metric, one entry per evaluation run.

    Attributes are validated on construction: ``train_scores`` and
    ``test_scores`` must have the same number of entries.

    Raises:
        ValueError: if the two score sequences differ in length.
    """

    name: str
    train_scores: List[float]
    test_scores: List[float]

    def __post_init__(self):
        # The two sequences end up side by side as DataFrame columns, so they must pair up.
        n_train, n_test = len(self.train_scores), len(self.test_scores)
        if n_train != n_test:
            raise ValueError("length is not same")

    @property
    def length(self):
        """Number of entries in ``train_scores`` (same as ``test_scores`` by construction)."""
        return len(self.train_scores)

    def to_dataframe(self) -> pd.DataFrame:
        """Return a two-column frame: ``train_<name>`` followed by ``test_<name>``."""
        columns = {
            f"{split}_{self.name}": values
            for split, values in (
                ("train", self.train_scores),
                ("test", self.test_scores),
            )
        }
        return pd.DataFrame(columns)

@dataclasses.dataclass(frozen=True)
class CVResult:
    """Immutable container for the output of one cross-validation run.

    Holds the fit/score timings plus one ``Score`` per evaluated metric.
    """

    fit_time: List[float]
    score_time: List[float]
    scores: List[Score]

    def to_dataframe(self) -> pd.DataFrame:
        """Return all values as one wide frame.

        Columns are ``fit_time``, ``score_time`` and then, for each entry of
        ``scores`` in order, the columns of that entry's ``to_dataframe()``.
        With no scores, only the two timing columns are returned.
        """
        timings = pd.DataFrame({
            "fit_time": self.fit_time,
            "score_time": self.score_time,
        })
        # Concatenate everything in a single call; growing the frame with
        # pd.concat inside a loop re-copies the accumulated data every iteration.
        return pd.concat(
            [timings, *(score.to_dataframe() for score in self.scores)],
            axis=1,
        )


## とりあえず学習
- train: 0.84
- test: 0.76(submitした)

In [4]:
def train_random_forest_classifier(
    train: pd.DataFrame,
    test: pd.DataFrame,
    random_state: int | None = None,
) -> Result:
    """Fit a RandomForestClassifier on ``train`` and predict ``Survived`` for ``test``.

    Displays the fitted feature importances, the accuracy on the training data
    itself, and the resulting submission frame.

    Args:
        train: Feature table that also contains the ``Survived`` target column.
        test: Feature table to predict on; its index becomes the
            ``PassengerId`` column of the submission.
        random_state: Forwarded to ``RandomForestClassifier``. The default
            ``None`` keeps the previous (non-seeded) behaviour; pass an int
            for reproducible results.

    Returns:
        Result holding the fitted model, the given ``train``/``test`` frames
        and the submission frame (``PassengerId``, ``Survived``).
    """
    # Use the function arguments, not the module-level TRAIN/TEST globals, so
    # the model is really trained on (and predicts for) what the caller passed.
    features = train.drop("Survived", axis=1)
    target = train.Survived

    randomforest = RandomForestClassifier(random_state=random_state)
    randomforest.fit(features, target)

    display(randomforest.feature_importances_)
    # Score on the training data itself (not a held-out estimate).
    display(randomforest.score(features, target))

    submit = pd.DataFrame({
        "PassengerId": test.index,
        "Survived": randomforest.predict(test)
    })
    display(submit)

    return Result(
        model=randomforest,
        train=train,
        test=test,
        submit=submit
    )

# Fit on the full training table and build the submission frame for the test table.
# The returned Result is the cell's last expression, so its repr is echoed below.
train_random_forest_classifier(TRAIN, TEST)

array([0.52750465, 0.22213606, 0.0645583 , 0.05305206, 0.08375419,
       0.04899474])

0.8406285072951739

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


Result(model=RandomForestClassifier(), train=             IntSex  Pclass  Under10YearsOld  IsPair  Parch  \
PassengerId                                                   
1                 0       3                0       1      0   
2                 1       1                0       1      0   
3                 1       3                0       0      0   
4                 1       1                0       1      0   
5                 0       3                0       0      0   
...             ...     ...              ...     ...    ...   
887               0       2                0       0      0   
888               1       1                0       0      0   
889               1       3                0       1      2   
890               0       1                0       0      0   
891               0       3                0       0      0   

             EmbarkedAtCherboug  Survived  
PassengerId                                
1                             0         0  
2  

## k-fold cross-validation（k分割交差検証）しよう
過学習の度合いを確認するなら、交差検証したほうが良さそう

baseline: `test_score = 0.815980`

In [5]:
def random_forest_classifier_cross_validation(
    train: pd.DataFrame,
    model: sklearn.base.ClassifierMixin,
    cv: int = 10,
    scoring: tuple[str, ...] = ("accuracy", "roc_auc"),
) -> CVResult:
    """Cross-validate ``model`` on ``train`` and collect per-fold timings and scores.

    Args:
        train: Feature table that also contains the ``Survived`` target column.
        model: Estimator handed to ``cross_validate``.
        cv: Passed through as ``cross_validate``'s ``cv`` argument.
        scoring: Metric names to evaluate. The same names are used both for
            the ``cross_validate`` call and for reading its ``train_<name>`` /
            ``test_<name>`` result keys, so the two cannot drift apart.

    Returns:
        CVResult with ``fit_time``, ``score_time`` and one ``Score`` per metric,
        in the order given by ``scoring``.
    """
    metric_names = list(scoring)

    cv_results = cross_validate(
        model,
        train.drop("Survived", axis=1),
        train.Survived,
        cv=cv,
        n_jobs=-1,
        return_train_score=True,  # needed for the train_<name> keys read below
        scoring=metric_names,
    )

    return CVResult(
        fit_time=cv_results["fit_time"],
        score_time=cv_results["score_time"],
        scores=[
            Score(
                name=name,
                train_scores=cv_results[f"train_{name}"],
                test_scores=cv_results[f"test_{name}"],
            )
            for name in metric_names
        ],
    )


# Baseline: cross-validate a default RandomForestClassifier on all features
# (cv defaults to 10) and show the mean of every timing/metric column.
(
    random_forest_classifier_cross_validation(TRAIN, RandomForestClassifier())
    .to_dataframe()
    .mean()
)


fit_time          0.365546
score_time        0.062391
train_accuracy    0.841003
test_accuracy     0.815968
train_roc_auc     0.893394
test_roc_auc      0.847726
dtype: float64

### 特徴量を一つ削ってみる

In [6]:
# Cross-validate once per feature subset that leaves exactly one feature out.
# Keys are the string form of the kept-feature tuple; "Survived" is re-attached
# because the cross-validation helper expects the target column in the frame.
RESULTS_1_LESS_FEATURES = {
    str(kept): random_forest_classifier_cross_validation(
        TRAIN[[*kept, "Survived"]],
        RandomForestClassifier(),
    )
    for kept in itertools.combinations(TEST.columns, TEST.shape[1] - 1)
}


In [7]:
# One row per feature subset, one column per timing/metric, best test accuracy first.
# .mean() is used directly (as in the baseline cell) instead of computing a full
# describe() summary and keeping only its "mean" row.
display(
    pd.DataFrame(
        {
            features: cv_result.to_dataframe().mean()
            for features, cv_result in RESULTS_1_LESS_FEATURES.items()
        }
    )
    .transpose()
    .sort_values("test_accuracy", ascending=False)
)


Unnamed: 0,fit_time,score_time,train_accuracy,test_accuracy,train_roc_auc,test_roc_auc
"('IntSex', 'Pclass', 'Under10YearsOld', 'Parch', 'EmbarkedAtCherboug')",0.313116,0.122401,0.82741,0.814831,0.882465,0.857224
"('IntSex', 'Pclass', 'Under10YearsOld', 'IsPair', 'Parch')",0.301644,0.053059,0.829281,0.807016,0.88062,0.85383
"('IntSex', 'Pclass', 'Under10YearsOld', 'IsPair', 'EmbarkedAtCherboug')",0.270985,0.058361,0.822173,0.798015,0.879101,0.86751
"('IntSex', 'Under10YearsOld', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.5705,0.083873,0.811323,0.793496,0.842912,0.812124
"('IntSex', 'Pclass', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.63347,0.120328,0.822921,0.792397,0.875896,0.824862
"('Pclass', 'Under10YearsOld', 'IsPair', 'Parch', 'EmbarkedAtCherboug')",0.680552,0.140381,0.734382,0.701548,0.760819,0.723243


特徴量を一つ削っても test_accuracy はベースラインを上回らなかったので、特徴量を削る必要はなさそう

**特徴量はそのまま使うことにする**

というか、決定木ベースのアルゴリズムだと特徴量の削減はあまり意味ないか。。
学習過程で、与えられた特徴量の一部を使って分類を試し、最も良いものを選ぶ、といったことをやっているし。

### randomforestのパラメータをチューニングしてみよう

#### wikipediaのおすすめ
