In [1]:
import pandas as pd

# 基本情報の確認
https://www.kaggle.com/c/titanic/data から train.csv をダウンロードし、`../../data/train.csv` に配置する

In [2]:
# Read the Titanic training CSV into `train` and print its column summary
# (non-null counts and dtypes) so missing values are visible up front.
train = pd.read_csv(filepath_or_buffer='../../data/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


・文字列データだと扱いにくいので数値データに変換

In [None]:
# Encode the string-valued columns as integers so they can be used numerically.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'C': 1, 'Q': 2, 'S': 3}  # 0 is used for a missing port

# Assign with `train[col] = ...` rather than `train.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc[:, col]` form writes into the existing object
# column in place, so the dtype stays `object` and `.astype(int)` has no effect.
# Values that are not in the mapping pass through unchanged, which keeps the
# cell safe to re-run after the columns are already encoded.
train['Sex'] = train['Sex'].map(lambda v: SEX_CODES.get(v, v)).astype(int)
train['Embarked'] = (
    train['Embarked']
    .map(lambda v: 0 if pd.isna(v) else EMBARKED_CODES.get(v, v))
    .astype(int)
)

train.info()

# 属性の選択
横軸:各属性のクラス、縦軸:生存率のグラフで可視化(破線は全体の生存率)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overall survival statistics for the whole training set.
# `survival_rate` is read by `visualize` as the height of its reference line.
is_survivor = train['Survived'] == 1
survival_num = is_survivor.sum()
survival_rate = survival_num / len(train)
print('Survival num: {:}'.format(survival_num))
print('Survival rate: {:.1f}%'.format(survival_rate * 100))

In [None]:
def visualize(attr, classes, survival_rates, baseline=None):
    """Scatter-plot per-class survival rates with a dashed reference line.

    Parameters
    ----------
    attr : str
        Attribute name; used as the plot title.
    classes : sequence of numbers
        X positions, one per class or bin.
    survival_rates : sequence of float
        Survival rate for each entry of ``classes``.
    baseline : float, optional
        Height of the dashed horizontal line. When omitted, the module-level
        ``survival_rate`` is used.
    """
    if baseline is None:
        baseline = survival_rate
    plt.title(attr)
    plt.ylim(0, 1)
    plt.scatter(classes, survival_rates)
    # min(classes)/max(classes) instead of min(*classes)/max(*classes): the
    # unpacked form raises TypeError when there is exactly one class.
    # With no classes there is no x-range to draw the line over, so skip it.
    if len(classes) > 0:
        plt.hlines(y=[baseline], xmin=min(classes), xmax=max(classes), linestyles='dashed')
    plt.show()

In [None]:
# For each integer-coded attribute, plot the survival rate of every class.
survived = train['Survived'] == 1  # computed once and reused for every class
for attr in ['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']:
    values = train[attr]
    # Every integer between the observed min and max is treated as a class,
    # so a class can have no passengers at all.
    classes = list(range(int(values.min()), int(values.max()) + 1))
    survival_rates = []
    lens = []
    for class_ in classes:
        surv = survived[values == class_]
        # survivors in the class / passengers in the class.
        # An empty class yields NaN explicitly instead of a 0 / 0 division.
        survival_rates.append(surv.mean() if len(surv) else float('nan'))
        lens.append('{}:{}'.format(class_, len(surv)))
    print(attr + ' ' + ' '.join(lens))
    visualize(attr, classes, survival_rates)

In [None]:
# Survival rate per age bin of width `step`; each bin is [start, start + step).
attr = 'Age'
step = 10
ages = train[attr]
# Bin starts run up to the last multiple of `step` that is <= the maximum age.
# Using `int(max) + step` as the bound would add an always-empty trailing bin
# whenever the maximum is not an exact multiple of `step`.
classes = list(range(0, int(ages.max()) + 1, step))
survival_rates = []
lens = []
for class_ in classes:
    # Rows with a missing age fail both comparisons and fall in no bin.
    in_bin = (ages >= class_) & (ages < class_ + step)
    surv = train.loc[in_bin, 'Survived'] == 1
    # An empty bin yields NaN explicitly instead of a 0 / 0 division.
    survival_rates.append(surv.mean() if len(surv) else float('nan'))
    lens.append('{}:{}'.format(class_, len(surv)))
print(attr + ' ' + ' '.join(lens))
visualize(attr, classes, survival_rates)

In [None]:
# Survival rate per fare bin of width 10; bin starts are 0, 10, ..., 90,
# so only fares in [0, 100) are covered by this plot.
attr = 'Fare'
step = 10
classes = list(range(0, 90 + step, step))
fares = train[attr]
survived = train['Survived'] == 1
# One boolean Series of survival flags per bin [lower, lower + step).
bins = [survived[(fares >= lower) & (fares < lower + step)] for lower in classes]
survival_rates = [members.sum() / len(members) for members in bins]
lens = ['{}:{}'.format(lower, len(members)) for lower, members in zip(classes, bins)]
print(attr + ' ' + ' '.join(lens))
visualize(attr, classes, survival_rates)

In [None]:
# Survival rate per fare bin of width 100, starting at 100; each bin is
# [start, start + step).
attr = 'Fare'
step = 100
fares = train[attr]
# Bin starts run up to the last multiple of `step` that is <= the maximum fare.
# Using `int(max) + step` as the bound would add an always-empty trailing bin
# whenever the maximum is not an exact multiple of `step`.
classes = list(range(100, int(fares.max()) + 1, step))
survival_rates = []
lens = []
for class_ in classes:
    in_bin = (fares >= class_) & (fares < class_ + step)
    surv = train.loc[in_bin, 'Survived'] == 1
    # Intermediate bins can still be empty; yield NaN explicitly instead of
    # dividing 0 by 0.
    survival_rates.append(surv.mean() if len(surv) else float('nan'))
    lens.append('{}:{}'.format(class_, len(surv)))
print(attr + ' ' + ' '.join(lens))
visualize(attr, classes, survival_rates)

In [None]:
# Survival rate of passengers whose name contains each letter a-z
# (case-insensitive). The x position of a letter is its index in `alphabets`.
attr = 'Name'
alphabets = 'abcdefghijklmnopqrstuvwxyz'
# Lower-case the names once up front instead of once per letter.
lowered_names = train[attr].str.lower()
survived = train['Survived'] == 1
classes = []
survival_rates = []
lens = []
for class_, char in enumerate(alphabets):
    # regex=False: plain substring match; na=False: a missing name matches nothing.
    has_char = lowered_names.str.contains(char, regex=False, na=False)
    surv = survived[has_char]
    # survivors among matching passengers / matching passengers.
    # A letter found in no name yields NaN explicitly instead of 0 / 0.
    survival_rates.append(surv.mean() if len(surv) else float('nan'))
    classes.append(class_)
    lens.append('{}({}):{}'.format(class_, char, len(surv)))
print(attr + ' ' + ' '.join(lens))
visualize(attr, classes, survival_rates)

# 欠損値の補完

In [None]:
# Fill missing values: keep the selected feature columns and replace missing
# ages with the median age.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
fill_train = train.loc[:, feature_columns]
med = fill_train.loc[:, 'Age'].median()
fill_train.loc[:, 'Age'] = fill_train.loc[:, 'Age'].fillna(value=med)
fill_train.info()

# 標準化

In [None]:
# Preview the first five rows of the imputed feature table.
fill_train.head(n=5)

In [None]:
# Standardize every column: subtract the column mean, then divide by the
# column standard deviation (pandas' default `std()`).
column_means = fill_train.mean()
column_stds = fill_train.std()
standardized_train = fill_train.sub(column_means).div(column_stds)
standardized_train.head()