In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns

from sklearn.model_selection import train_test_split
import seaborn as sns

exp_values = [
    'C(Pclass)',
    'C(Sex)',
    'C(Child)',
    'C(Embarked)', 
    'C(HasSib)',
    'C(HasParch)'
]
predict_value = 'Survived'
child_threthold = 16


def output_result(model, category_data, base_data):
    result = model.predict(category_data)
    with open('data/result.csv', 'w') as f:
        f.write('PassengerId,Survived\n')
        for r, pid in zip(result, base_data['PassengerId']):
            f.write('{},{}\n'.format(pid, str(int(r))))


def category_to_table(titanic, is_training_data=True):
    fomula = '+'.join(exp_values)
    if is_training_data:
        fomula = fomula + '+' + predict_value

    print('fomula:', fomula)

    c_df = patsy.dmatrix(fomula, data=titanic, return_type='dataframe')
    return c_df


def load_titanic_data(is_training_data=True):
    """
    データを読みこみ、
    必要なパラメータに絞る
    """
    def is_child(age):
        if pd.isnull(age):
            return False
        if age < child_threthold:
            return True
        return False
    
    titanic = None
    if is_training_data:
        titanic = pd.read_csv("data/train.csv")
    else:
        titanic = pd.read_csv("data/test.csv")

    if is_training_data:
        titanic = titanic.dropna(
            subset=['Age', 'Sex', 'Pclass']
        )
    titanic = titanic.assign(
        Child=titanic['Age'].map(is_child),
        HasSib = titanic['SibSp'].map(lambda x: x > 1),
        HasParch = titanic['Parch'].map(lambda x: x > 1)
    )
        
    return titanic


def hit_rate(predict, test):
    N = len(predict)
    hit = sum([1 if p == t else 0 for p, t in zip(predict, test)])
    return hit/N * 1.0


def main():
    titanic = load_titanic_data()
    titanic_category = category_to_table(titanic)

    x_columns = list(titanic_category.columns)
    x_columns.remove(predict_value)

    X_train, X_test, Y_train, Y_test = train_test_split(titanic[x_columns],
                                                        titanic[predict_value],
                                                        test_size=0.4,
                                                        random_state=0)

titanic_test = load_titanic_data(False)
test_data = category_to_table(titanic_test, False)

titanic = load_titanic_data()
titanic.head(2)

fomula: C(Pclass)+C(Sex)+C(Child)+C(Embarked)+C(HasSib)+C(HasParch)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child,HasParch,HasSib
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,False,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False,False,False


In [9]:
titanic.groupby(['Sex', 'Pclass', 'Child', 'Embarked'])[['Survived']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,count
Sex,Pclass,Child,Embarked,Unnamed: 4_level_2,Unnamed: 5_level_2
female,1,False,C,0.973684,38
female,1,False,Q,1.0,1
female,1,False,S,0.97561,41
female,1,True,S,0.666667,3
female,2,False,C,1.0,5
female,2,False,Q,1.0,1
female,2,False,S,0.896552,58
female,2,True,C,1.0,2
female,2,True,S,1.0,8
female,3,False,C,0.4,5


In [34]:
f_group = titanic[(titanic['Sex'] == "female") & (titanic['Pclass'] == 3) & (titanic['Child'] == False)]
# len(f_group) 72

f_group.groupby(['HasParch', 'HasSib'])[['Survived']].agg(['mean', 'count']) # 全然変化なし。。

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
HasParch,HasSib,Unnamed: 2_level_2,Unnamed: 3_level_2
False,False,0.444444,54
False,True,0.5,2
True,False,0.384615,13
True,True,0.333333,3


In [33]:
f_group.groupby("Survived")[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].agg(["mean", "count"])

Unnamed: 0_level_0,PassengerId,PassengerId,Pclass,Pclass,Age,Age,SibSp,SibSp,Parch,Parch,Fare,Fare
Unnamed: 0_level_1,mean,count,mean,count,mean,count,mean,count,mean,count,mean,count
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,443.463415,41,3,41,29.304878,41,0.609756,41,1.073171,41,16.743802,41
1,347.741935,31,3,31,25.935484,31,0.419355,31,0.645161,31,11.679971,31


あまり有用なデータがない・・・