In [22]:
%matplotlib inline
import pandas as pd
import numpy as np
train = pd.read_csv(r'E:\Mirror\GitHub\Predict-survival-on-the-Titanic\data\train.csv')
test = pd.read_csv(r'E:\Mirror\GitHub\Predict-survival-on-the-Titanic\data\test.csv')
full_data = [train, test]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## 1. Pclass ##

票类：经济地位的象征

序号 | 票类
---- | ----
1 | 头等舱
2 | 中等舱
3 | 末等舱

In [23]:
# One-hot编码
# train
train['P1'] = np.array(train['Pclass'] == 1).astype(np.int32)
train['P2'] = np.array(train['Pclass'] == 2).astype(np.int32)
train['P3'] = np.array(train['Pclass'] == 3).astype(np.int32)
# test
test['P1'] = np.array(test['Pclass'] == 1).astype(np.int32)
test['P2'] = np.array(test['Pclass'] == 2).astype(np.int32)
test['P3'] = np.array(test['Pclass'] == 3).astype(np.int32)
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,P1,P2,P3
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1


## 2. Sex ##

性别：男or女

Sex | label
---- | ----
male | 1
female | 0

In [24]:
# 把male/female转换成1/0
train['Sex'] = [1 if i == 'male' else 0 for i in train.Sex]
test['Sex'] = [1 if i == 'male' else 0 for i in test.Sex]
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,P1,P2,P3
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0,1


## 3. SibSp and Parch ##

- SibSp

the number of siblings/spouse：兄弟姐妹/配偶人数

- Parch

the number of children/parents：子女/父母人数


In [25]:
# 'FamilySize'：家庭成员人数
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,P1,P2,P3,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0,1,2


In [26]:
# 'IsAlone'：是否只身一人
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,P1,P2,P3,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0,1,2,0


## 4. Embarked ##

登船港口，有缺失值，先进行缺失值处理

C = Cherbourg, Q = Queenstown, S = Southampton

In [27]:
# 缺失值处理
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# One-hot编码
# train
train['E1'] = np.array(train['Embarked'] == 'S').astype(np.int32)
train['E2'] = np.array(train['Embarked'] == 'C').astype(np.int32)
train['E3'] = np.array(train['Embarked'] == 'Q').astype(np.int32)
# test
test['E1'] = np.array(test['Embarked'] == 'S').astype(np.int32)
test['E2'] = np.array(test['Embarked'] == 'C').astype(np.int32)
test['E3'] = np.array(test['Embarked'] == 'Q').astype(np.int32)
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,P1,P2,P3,FamilySize,IsAlone,E1,E2,E3
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0,1,2,0,1,0,0


## 5. Fare ##

乘客票价

In [34]:
# train
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
train['CategoricalFare'].cat.categories = [1, 2, 3, 4]
# one-hot编码
train['C1'] = np.array(train['CategoricalFare'] == 1).astype(np.int32)
train['C2'] = np.array(train['CategoricalFare'] == 2).astype(np.int32)
train['C3'] = np.array(train['CategoricalFare'] == 3).astype(np.int32)
train['C4'] = np.array(train['CategoricalFare'] == 4).astype(np.int32)

# test
test['CategoricalFare'] = pd.qcut(test['Fare'], 4)
test['CategoricalFare'].cat.categories = [1, 2, 3, 4]
# one-hot编码
test['C1'] = np.array(test['CategoricalFare'] == 1).astype(np.int32)
test['C2'] = np.array(test['CategoricalFare'] == 2).astype(np.int32)
test['C3'] = np.array(test['CategoricalFare'] == 3).astype(np.int32)
test['C4'] = np.array(test['CategoricalFare'] == 4).astype(np.int32)

train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,FamilySize,IsAlone,E1,E2,E3,CategoricalFare,C1,C2,C3,C4
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,...,2,0,1,0,0,1,1,0,0,0


## 6. Age ##

缺失值处理

In [39]:
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.qcut(train['Age'], 5)
train['CategoricalAge']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0        (19.0, 25.0]
1        (31.0, 40.0]
2        (25.0, 31.0]
3        (31.0, 40.0]
4        (31.0, 40.0]
5        (25.0, 31.0]
6        (40.0, 80.0]
7      (-0.001, 19.0]
8        (25.0, 31.0]
9      (-0.001, 19.0]
10     (-0.001, 19.0]
11       (40.0, 80.0]
12       (19.0, 25.0]
13       (31.0, 40.0]
14     (-0.001, 19.0]
15       (40.0, 80.0]
16     (-0.001, 19.0]
17       (31.0, 40.0]
18       (25.0, 31.0]
19     (-0.001, 19.0]
20       (31.0, 40.0]
21       (31.0, 40.0]
22     (-0.001, 19.0]
23       (25.0, 31.0]
24     (-0.001, 19.0]
25       (31.0, 40.0]
26       (19.0, 25.0]
27     (-0.001, 19.0]
28     (-0.001, 19.0]
29       (31.0, 40.0]
            ...      
861      (19.0, 25.0]
862      (40.0, 80.0]
863    (-0.001, 19.0]
864      (19.0, 25.0]
865      (40.0, 80.0]
866      (25.0, 31.0]
867      (25.0, 31.0]
868      (25.0, 31.0]
869    (-0.001, 19.0]
870      (25.0, 31.0]
871      (40.0, 80.0]
872      (31.0, 40.0]
873      (40.0, 80.0]
874      (25.0, 31.0]
875    (-0