In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns

# Use matplotlib's 'ggplot' style sheet for every figure and render figures
# inline in the notebook output.
plt.style.use('ggplot')
%matplotlib inline

def category_to_table(titanic):
    """
    Build a patsy design matrix from the categorical passenger columns.

    ``Pclass``, ``Sex``, ``Child`` and ``Embarked`` are wrapped in ``C(...)``
    so patsy encodes them as categorical terms; ``Survived`` is appended as a
    plain term so that it is carried along as a column of the result.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Passenger data containing every column named in the formula.

    Returns
    -------
    pandas.DataFrame
        The matrix produced by ``patsy.dmatrix`` for that formula.
    """
    categorical_terms = ('C(Pclass)', 'C(Sex)', 'C(Child)', 'C(Embarked)')
    target_term = 'Survived'

    # All terms, the target included, go on the right-hand side of the formula.
    formula = '+'.join([*categorical_terms, target_term])

    return patsy.dmatrix(formula, data=titanic, return_type='dataframe')


def load_titanic_data(path="data/train.csv", child_age_limit=17):
    """
    Load the Titanic passenger CSV and prepare it for the analysis.

    Rows with a missing value in any of ``Age``, ``Sex``, ``Pclass`` or
    ``Embarked`` are dropped, and a boolean ``Child`` column is added.

    Parameters
    ----------
    path : str or path-like, default "data/train.csv"
        CSV file to read. The default is the location the notebook has
        always used, so calling the function without arguments is unchanged.
    child_age_limit : int or float, default 17
        Exclusive upper bound on ``Age`` for a passenger to be flagged as a
        child (``Age < child_age_limit``).

    Returns
    -------
    pandas.DataFrame
        All original columns plus ``Child``, restricted to the rows that
        have the required columns filled in.
    """
    required_columns = ['Age', 'Sex', 'Pclass', 'Embarked']

    passengers = pd.read_csv(path).dropna(subset=required_columns)

    # Age has no missing values after the dropna above, so a plain vectorised
    # comparison replaces the former per-row function and its null check.
    return passengers.assign(Child=passengers['Age'] < child_age_limit)

# Load the prepared passenger data once for the cells below and preview the
# first two rows.
titanic = load_titanic_data()
titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False


In [32]:
## Convert to categorical data (design matrix built by category_to_table)
## and preview the first two rows
titanic_category = category_to_table(titanic)
titanic_category.head(2)

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.male],C(Child)[T.True],C(Embarked)[T.Q],C(Embarked)[T.S],Survived
0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
## Examine the survival rate with respect to each parameter:
## mean survival and passenger count by Sex / Pclass / Child
titanic.groupby(['Sex', 'Pclass', 'Child'])[['Survived']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count
Sex,Pclass,Child,Unnamed: 3_level_2,Unnamed: 4_level_2
female,1,False,0.974026,77
female,1,True,0.833333,6
female,2,False,0.90625,64
female,2,True,1.0,10
female,3,False,0.42029,69
female,3,True,0.545455,33
male,1,False,0.377551,98
male,1,True,1.0,3
male,2,False,0.068182,88
male,2,True,0.818182,11


In [34]:
## Examine the survival rate with respect to each parameter:
## mean survival and passenger count by Sex / Pclass / Child / Embarked
titanic.groupby(['Sex', 'Pclass', 'Child', 'Embarked'])[['Survived']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,count
Sex,Pclass,Child,Embarked,Unnamed: 4_level_2,Unnamed: 5_level_2
female,1,False,C,0.972973,37
female,1,False,Q,1.0,1
female,1,False,S,0.974359,39
female,1,True,C,1.0,1
female,1,True,S,0.8,5
female,2,False,C,1.0,5
female,2,False,Q,1.0,1
female,2,False,S,0.896552,58
female,2,True,C,1.0,2
female,2,True,S,1.0,8


In [35]:
## Examine the survival rate with respect to each parameter:
## mean survival and passenger count by Sex / Pclass / Embarked
titanic.groupby(['Sex', 'Pclass', 'Embarked'])[['Survived']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count
Sex,Pclass,Embarked,Unnamed: 3_level_2,Unnamed: 4_level_2
female,1,C,0.973684,38
female,1,Q,1.0,1
female,1,S,0.954545,44
female,2,C,1.0,7
female,2,Q,1.0,1
female,2,S,0.909091,66
female,3,C,0.6875,16
female,3,Q,0.5,10
female,3,S,0.407895,76
male,1,C,0.444444,36


In [36]:
# Preview the first rows only; displaying the whole frame bloats the saved
# notebook output.
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,False
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,False
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,True
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,False
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,True
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,True


In [62]:
def get_area(area):
    """
    Return the first element of ``area``.

    The notebook maps this over the ``Cabin`` column, so a cabin string such
    as ``'C85'`` yields its leading letter ``'C'``.
    """
    leading_char = area[0]
    return leading_char

# Keep only passengers with a recorded cabin and tag each one with its area
# letter (the first character of the Cabin value, via get_area).
cabin_data = (
    titanic
    .copy()
    .dropna(subset=['Cabin'])
    .assign(Area=lambda passengers: passengers['Cabin'].map(get_area))
)

# Passenger count and mean survival for each (Embarked, Area) pair.
cabin_data.groupby(['Embarked', 'Area'])['Survived'].agg(['count', 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
Embarked,Area,Unnamed: 2_level_1,Unnamed: 3_level_1
C,A,7,0.571429
C,B,21,0.761905
C,C,19,0.631579
C,D,13,0.846154
C,E,5,1.0
Q,C,2,0.5
S,A,5,0.6
S,B,22,0.727273
S,C,30,0.566667
S,D,18,0.666667
