# インポートとダウンロード

In [1]:
import pandas as pd
import numpy as np
from math import log, exp

from bokeh.io import output_notebook, show
output_notebook()
from bokeh.io import output_file, save, show

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Range1d, LinearAxis

from IPython.display import display

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# 基本的な可視化

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_train.shape

(891, 12)

In [5]:
# nanがある場合はnanもカテゴリとして数える
columns = df_train.columns
for column in columns:
    num_category = len(set(df_train[column]))
    column_type = df_train[column].dtypes
    print(str(column_type) + "\t" + str(num_category) + "\t" + column)

int64	891	PassengerId
int64	2	Survived
int64	3	Pclass
object	891	Name
object	2	Sex
float64	265	Age
int64	7	SibSp
int64	7	Parch
object	681	Ticket
float64	248	Fare
object	148	Cabin
object	4	Embarked


In [6]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df_train.isnull().sum(axis=1).value_counts().sort_index()

0    183
1    550
2    158
dtype: int64

# ラベルの可視化

In [8]:
def vis_label(df, label_column, unique_column):
    output_file("visData/label.html")
    plot_data = df[[label_column, unique_column]].groupby(label_column, as_index=False).count()
    plot_data = plot_data.astype({label_column:str})
    source = ColumnDataSource(plot_data)
    p = figure(x_range=plot_data[label_column], title="label:"+label_column)
    p.vbar(x=label_column, top=unique_column, source=source, width=0.5)
    save(p)

In [9]:
vis_label(df_train, "Survived", "PassengerId")

# 特徴量の可視化

In [5]:
def vis_data(df, column, label_column, unique_column, sort=False, output_form="save"):
    output_file("visData/" + column + ".html")
    bar_data = df[[column, unique_column]].groupby(column, as_index=False).count().rename({unique_column:"BarData"}, axis=1)
    line_data = df[df[label_column] == 1][[column, unique_column]].groupby(column, as_index=False).count().rename({unique_column:"LineData"}, axis=1)
    plot_data = pd.merge(bar_data, line_data, on=column)
    plot_data["LineData"] = plot_data["LineData"]/plot_data["BarData"]
    plot_data = plot_data.astype({column:str})
    if sort:
        plot_data = plot_data.sort_values("LineData", ascending=False)
    source = ColumnDataSource(plot_data)
    p = figure(x_range=plot_data[column], title=column)
    p.vbar(x=column, top="BarData", source=source, width=0.5)
    p.extra_y_ranges = {"LineAxis":Range1d(start=-0.05, end=1.05)}
    p.add_layout(LinearAxis(y_range_name="LineAxis"), "right")
    p.line(x=column, y="LineData", source=source, color="firebrick", y_range_name="LineAxis", line_width=3)
    if output_form == "save":
        save(p)
    elif output_form == "show":
        show(p)

## Pclass,Sex,Sibsp,Parch

In [11]:
vis_data(df_train, "Pclass", "Survived", "PassengerId")
vis_data(df_train, "Sex", "Survived", "PassengerId", sort=True)
vis_data(df_train, "SibSp", "Survived", "PassengerId")
vis_data(df_train, "Parch", "Survived", "PassengerId")

## Embarked
乗った順番はS→C→Qなのでその順番になるように名前を変更

In [12]:
tmp = df_train.copy()
tmp.loc[tmp["Embarked"] == "S", "Embarked"] = "1_Southampton"
tmp.loc[tmp["Embarked"] == "C", "Embarked"] = "2_Cherbourg"
tmp.loc[tmp["Embarked"] == "Q", "Embarked"] = "3_Queenstown"

In [13]:
set(tmp["Embarked"])

{'1_Southampton', '2_Cherbourg', '3_Queenstown', nan}

In [14]:
vis_data(tmp, "Embarked", "Survived", "PassengerId")

## Age

In [15]:
tmp = df_train.copy()
tmp.sort_values("Age")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [16]:
# 0-1の人7人しかいないし,みんな生き残ってるから全員0歳にする
tmp.loc[tmp["Age"] < 1, "Age"] = 0
tmp[["Age", "PassengerId"]].groupby("Age", as_index=False).count()

Unnamed: 0,Age,PassengerId
0,0.0,7
1,1.0,7
2,2.0,10
3,3.0,6
4,4.0,10
...,...,...
79,70.0,2
80,70.5,1
81,71.0,2
82,74.0,1


In [17]:
vis_data(tmp, "Age", "Survived", "PassengerId", output_form="show")

In [18]:
tmp[tmp["Age"]*2%2 == 1]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
57,58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
111,112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
122,123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C
123,124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13.0,E101,S
148,149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
203,204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
227,228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S


In [19]:
# xx.5の予測された人18人しかいないから落とす
tmp = tmp[tmp["Age"]*2%2 != 1]

In [20]:
vis_data(tmp, "Age", "Survived", "PassengerId", output_form="show")

In [21]:
# 5歳ごとにバンドリング
tmp["Age"] = tmp["Age"]//5*5

In [22]:
vis_data(tmp, "Age", "Survived", "PassengerId", output_form="show")

In [23]:
# 80歳越え一人しかいないから60歳にまとめる
tmp.loc[tmp["Age"] == 80.0, "Age"] = 60.0

In [24]:
vis_data(tmp, "Age", "Survived", "PassengerId", output_form="show")

In [25]:
# 10歳ごとだとどう？
tmp["Age"] = tmp["Age"]//10*10

In [26]:
vis_data(tmp, "Age", "Survived", "PassengerId", output_form="show")

In [27]:
# 5歳ごとと10歳ごとであんまり傾向変わらなさそうだからこっちで保存
vis_data(tmp, "Age", "Survived", "PassengerId")

## Ticket
- 681種類もあってこのままではどうしようもない
- Ticketの番号,接頭語の規則がわかれば使い道あるかも
- 現実では規則はわかってるはずだが,今回は説明なし

## Fare

In [3]:
tmp = df_train.copy()

In [29]:
tmp.sort_values("Fare")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0000,,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0000,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0000,,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0000,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0000,C23 C25 C27,S
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C


In [30]:
len(set(tmp["Fare"]))

248

In [31]:
# 意外と少ないからとりあえず見てみる
vis_data(tmp, "Fare", "Survived", "PassengerId", output_form="show")

In [32]:
# やっぱり見にくいから小数は無視してみる
len(set(tmp["Fare"]//1))

91

In [33]:
t = tmp.copy()
t["Fare"] = tmp["Fare"]//1
vis_data(t, "Fare", "Survived", "PassengerId", output_form="show")

In [34]:
# 下に固まってるからlog(x+1)してみる
t = tmp.copy()
t["Fare"] = t["Fare"].map(lambda x: log(x+1))
t["Fare"] = (t["Fare"]//1).map(lambda x: exp(x)-1)//1
vis_data(t, "Fare", "Survived", "PassengerId", output_form="show")

In [6]:
# まだ固まってるからもう一回log(x+1)
# 2以下だったから0.2ごとにわける
t = tmp.copy()
t["Fare"] = t["Fare"].map(lambda x: log(log(x+1)+1))
t["Fare"] = (t["Fare"]//0.2*0.2).map(lambda x: exp(exp(x)-1)-1)//1
vis_data(t, "Fare", "Survived", "PassengerId", output_form="show")

In [7]:
set(t["Fare"])

{0.0, 2.0, 4.0, 9.0, 20.0, 51.0, 154.0}

In [36]:
# 0.2ごとだと少ないから0.1ごと
t = tmp.copy()
t["Fare"] = t["Fare"].map(lambda x: log(log(x+1)+1))
t["Fare"] = (t["Fare"]//0.1*0.1).map(lambda x: exp(exp(x)-1)-1)//1
vis_data(t, "Fare", "Survived", "PassengerId", output_form="show")

In [37]:
# 0.2ごとが一番見やすそうだからこれで出力
t = tmp.copy()
t["Fare"] = t["Fare"].map(lambda x: log(log(x+1)+1))
t["Fare"] = (t["Fare"]//0.2*0.2).map(lambda x: exp(exp(x)-1)-1)//1
vis_data(t, "Fare", "Survived", "PassengerId")

## Cabin
- 欠損値687
- カテゴリ数147(101個は値が一つ,46個は値が複数)
- Ticketと合わせて家族を探す手掛かりになるかも

In [38]:
tmp.groupby("Cabin").count()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A10,1,1,1,1,1,1,1,1,1,1,1
A14,1,1,1,1,1,0,1,1,1,1,1
A16,1,1,1,1,1,1,1,1,1,1,1
A19,1,1,1,1,1,0,1,1,1,1,1
A20,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
F33,3,3,3,3,3,3,3,3,3,3,3
F38,1,1,1,1,1,0,1,1,1,1,1
F4,2,2,2,2,2,2,2,2,2,2,2
G6,4,4,4,4,4,4,4,4,4,4,4


In [39]:
t = tmp.groupby("Cabin", as_index=False).count()
t[t["PassengerId"] > 1].shape

(46, 12)