In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [6]:
def process_titanic(df, type):
    '''
    Titanicのデータの前処理を行う

    @param  df    : input data (Data.Frame)
    @param  type  : process type (str, "train"/"test")
    @return id    : passenger ID (list)
    @return df_p  : processed data (Data.Frame)
    '''

    '''
    データ型と変換の方法
    @PassengerId : int、除外する
    @Survived    : int、目的変数として使う
    @Pclass      : int、ダミー変数化して使う
    @Name        : str、除外する
    @Sex         : str、欠損あり、ダミー変数化して使う
    @Age         : int、年代別 (10年) でカテゴリ変数化して使う
    @SibSp       : int、そのまま使う
    @Parch       : int、そのまま使う
    @Ticket      : str、除外する
    @Fare        : float、0を補完して使う
    @Cabin       : 欠損あり (78%)、除外する
    @Embarked    : 欠損あり
    '''

    id_ = df.PassengerId
    # remove
    df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)

    # Age
    age = df[["Age"]].fillna(np.nanmedian(df.Age))

    # Fare
    # 0のところは欠損に変換する
    # その後、補完

    # Sex (to dummy)
    sex_dummy = pd.get_dummies(df["Sex"])

    # Embarked
    emb = df.Embarked
    emb_comp = emb.fillna("S")
    emb_dummy = pd.get_dummies(emb_comp, prefix = "Embarked")
    
    # SibSp
    
    # Parch

    # concat
    df_p = pd.concat([age, sex_dummy, emb_dummy], axis = 1)

    if type == "train":
        df_p = pd.concat([df_p, df[["Survived"]]], axis = 1)

    return id_, df_p

In [41]:
def convert_input(df, y_name):
    '''
    pd.DataFrameからsklearnに入れる形に変換する

    @param  df     : input data (pd.Data.Frame)
    @param  y_name : column of objective variable (str)
    @return X      : feature variable (np.ndarray)
    @return y      : objective variable (np.array)
    '''
    
    if y_name is not None:
        y = df[[y_name]].values.flatten()
        df = df.drop([y_name], axis=1)
        X = df.values
        return X, y

    else:
        X = df.values
        return X

### process train set

In [13]:
train = pd.read_csv("data/train.csv")
id_train, df_train = process_titanic(train, "train")
X, y = convert_input(df_train, "Survived")

### train model

In [27]:
# あとでハイパーパラメータの設定を入れる
model = RandomForestClassifier()
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### predict test set

In [None]:
test = pd.read_csv("data/test.csv")
id_test, df_test = process_titanic(test, "test")
X_test = convert_input(df_test, None)
pred = model.predict(X_test)

### save result

In [42]:
result = pd.DataFrame({"PassengerId": id_test, "Survived": pred})
result.to_csv("result.csv", index = False)