In [87]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns

from sklearn.cross_validation import train_test_split
import seaborn as sns

exp_values = [
    'Pclass',
    #     'PassengerId',
    'C(Sex)',
    'C(Child)',
    'C(Embarked)',
    #     'SibSp',
]
predict_value = 'Survived'
child_threthold = 16

survived_ticket_set = {
    '110152',
    '113760',
    '13502',
    '1601',
    '24160',
    '2666',
    '29106',
    '347077',
    '347742',
    'PC 17572',
    'PC 17755',
    'PC 17757'
}

dead_ticket_set = {
    '3101295',
    '345773',
    '347082',
    '347088',
    '349909',
    '382652',
    'CA 2144',
    'LINE',
    'S.O.C. 14879',
    'W./C. 6608'
}


def output_result(model, category_data, base_data):
    result = model.predict(category_data)
    with open('data/result.csv', 'w') as f:
        f.write('PassengerId,Survived\n')
        for r, pid in zip(result, base_data['PassengerId']):
            f.write('{},{}\n'.format(pid, str(int(r))))


def category_to_table(titanic, is_training_data=True):
    fomula = '+'.join(exp_values)
    if is_training_data:
        fomula = fomula + '+' + predict_value

    print('fomula:', fomula)

    c_df = patsy.dmatrix(fomula, data=titanic, return_type='dataframe')
    return c_df


def load_titanic_data(is_training_data=True):
    """
    データを読みこみ、
    必要なパラメータに絞る
    """

    s_rate_ticket_dict = dict()
    s_count_ticket_dict = dict()

    def is_child(age):
        if pd.isnull(age):
            return False
        if age < child_threthold:
            return True
        return False
    
    def cabin_a(cabin):
        if pd.isnull(cabin):
            return None
        return cabin[0]

    def ticket_count_info(x):
        if x in s_count_ticket_dict:
            return s_count_ticket_dict[x]
        return 0

    def ticket_ratio_info(x):
        if x in s_rate_ticket_dict:
            return s_rate_ticket_dict[x]
        return None

    titanic = None
    if is_training_data:
        titanic = pd.read_csv("data/train.csv")
    else:
        titanic = pd.read_csv("data/test.csv")

    if is_training_data:
        subset_ = ['Age', 'Pclass']
        titanic = titanic.dropna(
            subset=subset_
        )

        for k, v in titanic.groupby('Ticket')[['Survived']].count().items():
            print(k)
            for kk, vv in v.items():
                s_count_ticket_dict[kk] = vv

        for k, v in titanic.groupby('Ticket')[['Survived']].mean().items():
            print(k)
            for kk, vv in v.items():
                s_rate_ticket_dict[kk] = vv

        titanic = titanic.assign(
            TicketCount=titanic['Ticket'].map(ticket_count_info),
            TicketSRate=titanic['Ticket'].map(ticket_ratio_info),
            Type="Train",
            Training=1,
            Test=0
        )
    else:
        titanic = titanic.assign(
            Surrvived=None,
            Type="Test",
            Training=0,
            Test=1
        )

    titanic = titanic.assign(
        Child=titanic['Age'].map(is_child),
        CabinAlpha=titanic['Cabin'].map(cabin_a),
    )
    return titanic


def hit_rate(predict, test):
    N = len(predict)
    hit = sum([1 if p == t else 0 for p, t in zip(predict, test)])
    return "{0:.3f}%".format(100 * hit / N * 1.0)


def main():
    titanic = load_titanic_data()
    titanic_category = category_to_table(titanic)

    x_columns = list(titanic_category.columns)
    x_columns.remove(predict_value)

    X_train, X_test, Y_train, Y_test = train_test_split(titanic[x_columns],
                                                        titanic[predict_value],
                                                        test_size=0.4,
                                                        random_state=0)


titanic = load_titanic_data()
titanic.head(2)

## カテゴリカルなデータに変換
titanic_category = category_to_table(titanic)
titanic_category.head(2)

test_data = load_titanic_data(False)
test_data_category = category_to_table(test_data, False)

full_data = pd.concat([titanic, test_data])


Survived
Survived
fomula: Pclass+C(Sex)+C(Child)+C(Embarked)+Survived
fomula: Pclass+C(Sex)+C(Child)+C(Embarked)


In [60]:
 test_data.groupby(['Sex', 'Pclass', 'Child', 'Embarked'])[['Ticket']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Ticket
Sex,Pclass,Child,Embarked,Unnamed: 4_level_1
female,1,False,C,28
female,1,False,Q,1
female,1,False,S,21
female,2,False,C,3
female,2,False,S,21
female,2,True,C,1
female,2,True,S,5
female,3,False,C,7
female,3,False,Q,23
female,3,False,S,35


In [61]:
full_data.groupby(['Sex', 'Pclass', 'Child', 'Embarked'])[['Test', 'Training']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Test,Training
Sex,Pclass,Child,Embarked,Unnamed: 4_level_1,Unnamed: 5_level_1
female,1,False,C,28,38
female,1,False,Q,1,1
female,1,False,S,21,41
female,1,True,S,0,3
female,2,False,C,3,5
female,2,False,Q,0,1
female,2,False,S,21,58
female,2,True,C,1,2
female,2,True,S,5,8
female,3,False,C,7,5


In [64]:
x_columns = list(titanic_category.columns)
x_columns.remove(predict_value)
x_columns

from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit


X_train, X_test, Y_train, Y_test = train_test_split(titanic_category[x_columns],
                                                                                        titanic_category[predict_value],
                                                                                        test_size=0.3,
                                                                                        random_state=50)

print(len(X_train), len(X_test))

train_user = set(X_train.index)

training_titanic = titanic.query('PassengerId in @train_user')
test_titanic = titanic.query('PassengerId not in @train_user')

## データに偏りがないかを調べる
pd.concat([X_train.sum(), X_test.sum()], axis=1)

498 214


Unnamed: 0,0,1
Intercept,498.0,214.0
C(Sex)[T.male],317.0,136.0
C(Child)[T.True],58.0,25.0
C(Embarked)[T.Q],19.0,9.0
C(Embarked)[T.S],389.0,165.0
Pclass,1110.0,485.0


In [84]:
import patsy
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

decision_tree_model = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=8)
decision_tree_model.fit(X_train, Y_train)

predict = decision_tree_model.predict(X_test)
print("hit_rate:", hit_rate(predict, Y_test) )

def output_result(model, category_data, base_data):
    with open('data/result.csv', 'w') as f:
        f.write('PassengerId,Survived\n')
        for r, pid in zip(result, base_data['PassengerId']):
            f.write('{},{}\n'.format(pid, str(int(r))))

            
result = decision_tree_model.predict(test_data_category)
for i in range(len(test_data)):
    ticket = test_data.iloc[i]['Ticket']
    if ticket in survived_ticket_set:
        result[i] = 1
    if ticket in dead_ticket_set:
        result[i] = 0
        
with open('data/result.csv', 'w') as f:
    f.write('PassengerId,Survived\n')
    for r, pid in zip(result, test_data['PassengerId']):
        f.write('{},{}\n'.format(pid, str(int(r))))

hit_rate: 80.841%


In [159]:
from collections import defaultdict
cabin_table = titanic.groupby(['Ticket', 'Fare', 'Cabin'])[['Survived']].count()

t_f_cabin = dict()
for k, v in cabin_table.items():
    for (ticket, fare, cabin), c in v.items():
        if (ticket, fare) in t_f_cabin:
            if t_f_cabin[(ticket, fare)] > cabin:
                t_f_cabin[(ticket, fare)] = cabin
        else:
            t_f_cabin[(ticket, fare)] = cabin
    
    

def filled_cabin(x):
    key = x.Ticket or x.Fare
    if not x.Ticket or x.Fare:        
        return np.nan
    key = (x.Ticket, x.Fare)    
    if key in t_f_cabin:
        return t_f_cabin[key]
    return np.nan

titanic_cabin = titanic.assign(
    CabinFill=titanic['Cabin']
)

for i in range(len(titanic)):
    row = titanic_cabin.iloc[i]
    t_ = row['Ticket']
    f_ = row['Fare']
    print (titanic_cabin['CabinFill'][i])

nan
C85
nan
C123
nan


KeyError: 5

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Test,TicketCount,TicketSRate,Training,Type,CabinAlpha,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,1,0.00,1,Train,,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,1,1.00,1,Train,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,1,1.00,1,Train,,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,2,0.50,1,Train,C,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,1,0.00,1,Train,,False
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0,1,0.00,1,Train,E,False
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,0,4,0.00,1,Train,,True
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0,3,1.00,1,Train,,False
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,2,0.50,1,Train,,True
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,0,2,1.00,1,Train,G,True
