# インポート

In [8]:
import pandas as pd
from math import log, exp

# 一回目

## ダウンロード

In [3]:
# Read the train and test CSVs from the local data/ directory into DataFrames.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [4]:
# Preview the first rows of the freshly loaded training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Ageをバンドリングする
10歳ごとにバンドリングする
- 変数名：Age_by10
- [n, n+10) = n

In [5]:
# Floor each age to the lower edge of its 10-year band: [n, n+10) -> n.
df_train["Age_by10"] = df_train["Age"].floordiv(10).mul(10)

In [6]:
# Preview the training frame to check the new Age_by10 column.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_by10
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,20.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,30.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,20.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,30.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,30.0


In [7]:
# Same 10-year banding as the training frame: [n, n+10) -> n.
df_test["Age_by10"] = df_test["Age"].floordiv(10).mul(10)

## Fareをバンドリングする
x → log(log(x+1)+1)にして, 0.2ごとにバンドリング
- 変数名：Fare_banded
- t = log(log(x+1)+1) を 0.2 刻みで切り捨てた下端 b = (t//0.2)*0.2 を求め、exp(exp(b)-1)-1 で元の運賃スケールに戻して小数点以下を切り捨てた値（//1）をバンドのラベルとする（例：Fare = 7.25 → 4.0）

In [10]:
# Step 1: compress the fare with the double-log transform x -> log(log(x + 1) + 1).
tmp = df_train["Fare"].map(lambda fare: log(log(fare + 1) + 1))
# Step 2: snap each transformed value down to the lower edge of its 0.2-wide band.
band_floor = tmp // 0.2 * 0.2
# Step 3: invert the transform on the band edge and drop the fractional part,
# so every band is labelled by a whole-number fare.
df_train["Fare_banded"] = band_floor.map(lambda edge: exp(exp(edge) - 1) - 1) // 1

In [11]:
# Preview the training frame to check the new Fare_banded column.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_by10,Fare_banded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,20.0,4.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,30.0,51.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,20.0,4.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,30.0,51.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,30.0,4.0


In [16]:
# Step 1: compress the fare with the double-log transform x -> log(log(x + 1) + 1).
tmp = df_test["Fare"].map(lambda fare: log(log(fare + 1) + 1))
# Step 2: snap each transformed value down to the lower edge of its 0.2-wide band.
band_floor = tmp // 0.2 * 0.2
# Step 3: invert the transform on the band edge and drop the fractional part,
# so every band is labelled by a whole-number fare.
df_test["Fare_banded"] = band_floor.map(lambda edge: exp(exp(edge) - 1) - 1) // 1

## Embarkedを順序付きカテゴリカルデータに変換する
S → 1, C → 2, Q → 3（それ以外の値や欠損値は変換せずそのまま残す）
- 変数名：Embarked_int

In [17]:
def f(s):
    """Encode an embarkation port letter as an integer code.

    "S" -> 1, "C" -> 2, "Q" -> 3. Any other value is returned unchanged.
    """
    # Ports are listed in code order, so enumerate(start=1) yields the codes.
    for code, port in enumerate(("S", "C", "Q"), start=1):
        if s == port:
            return code
    # No match: hand the input back as-is.
    return s

In [20]:
# Apply f element-wise to Embarked and store the result as Embarked_int.
df_train["Embarked_int"] = df_train["Embarked"].map(f)

In [23]:
# Preview the training frame to check the new Embarked_int column.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_by10,Fare_banded,Embarked_int
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,20.0,4.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,30.0,51.0,2.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,20.0,4.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,30.0,51.0,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,30.0,4.0,1.0


In [24]:
# Apply f element-wise to Embarked and store the result as Embarked_int (test frame).
df_test["Embarked_int"] = df_test["Embarked"].map(f)

## 保存する

In [26]:
# Print the (rows, columns) shape of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep="\n")

(891, 15)
(418, 14)


In [39]:
# Write both frames to CSV under data/; index=False leaves the row index out of the files.
df_train.to_csv("data/cleansed_train.csv", index=False)
df_test.to_csv("data/cleansed_test.csv", index=False)