In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,export_graphviz

读取数据train_data和test_data

In [2]:
# Load the two CSV files from the working directory in one pass.
train_data, test_data = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

查看数据集的详细信息，主要是查看是否有缺失值的情况。

In [3]:
# Stack the training and test rows into one frame so that cleaning is applied
# to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Row labels are deliberately kept (no
# ignore_index), so the labels of both inputs are preserved as in the
# recorded output ("1309 entries, 0 to 417").
# sort=True orders the columns alphabetically when the two column sets differ,
# which matches the column order shown in the recorded outputs below.
data = pd.concat([train_data, test_data], sort=True)

In [4]:
# Print the dtype and non-null count of every column in the combined frame;
# columns whose count is below the total row count contain missing values.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


## 查看数据的详细情况
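让我们来明确一下各个数据字段的含义：
PassengerId: 乘客的序号
Survived: 是否存活。0表示没有存活，1表示存活下来了。
Pclass: 1表示第一阶层，2表示第二阶层，3表示第三阶层
Name: 乘客名字
Sex: 乘客性别
Age: 年龄
SibSp: 船上兄弟姐妹和配偶的数量
Parch: 船上父母和子女的数量
Ticket: 船票编号
Fare: 票价
Cabin: 客舱号
Embarked: 登船港口（C = Cherbourg，Q = Queenstown，S = Southampton）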

In [5]:
# Display the last rows of the training frame to eyeball the raw values.
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the combined frame.
data.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.881138,33.295479,0.385027,655.0,2.294882,0.498854,0.383838
std,14.413493,51.758668,0.86556,378.020061,0.837836,1.041658,0.486592
min,0.17,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,982.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,1309.0,3.0,8.0,1.0


# 数据清洗
这个环节是整个项目中最令人头疼的地方。

In [7]:
# Replace missing Age and Fare values with the mean of the respective column
# (computed over the combined frame), then print each column's non-null count
# to confirm that no gaps remain.
for col in ('Age', 'Fare'):
    data[col] = data[col].fillna(data[col].mean())
for col in ('Age', 'Fare'):
    print(data[col].count())

1309
1309


Age和Fare的非空值数量都变成了1309，说明缺失值填充成功。

接下来处理Cabin和Embarked。

In [8]:
# Keep existing cabin labels and put the placeholder 'N' wherever the value
# is missing, then print how many non-null entries the column now has.
data['Cabin'] = data['Cabin'].where(data['Cabin'].notna(), 'N')
print(data['Cabin'].notna().sum())

1309


In [9]:
# Most frequent Embarked value; the next cell uses it to fill the missing
# entries.
data['Embarked'].mode()

0    S
dtype: object

In [10]:
# Fill the missing Embarked entries with 'S' (the mode shown in the previous
# cell) and print the resulting number of non-null values.
data['Embarked'] = data['Embarked'].fillna(value='S')
print(data['Embarked'].notna().sum())

1309


In [11]:
# Re-check the non-null counts after filling Age, Fare, Cabin and Embarked.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


可以看到缺失值已经填充完毕，接下来进行特征提取。
我们继续对数据进行转换。可以使用如下处理方法：1. 对于数值型数据直接使用；2. 对于时间序列可以转换成年、月、日；3. 对于分类数据用one-hot编码转换成数字。

1. 性别数据Sex：可以将原始数据中的male、female分别用0、1代替。

In [12]:
# Encode Sex numerically: male -> 0, female -> 1.
map_dict = {'male': 0, 'female': 1}
# Series.map sends every value that is not a key of the dict to NaN, so
# re-running this cell on the already-encoded 0/1 column would silently wipe
# it out. Only apply the mapping while the column still holds the raw labels.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = data['Sex'].map(map_dict)
data.tail()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
413,29.881138,N,S,8.05,"Spector, Mr. Woolf",0,1305,3,0,0,,A.5. 3236
414,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1306,1,1,0,,PC 17758
415,38.5,N,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,0,0,,SOTON/O.Q. 3101262
416,29.881138,N,S,8.05,"Ware, Mr. Frederick",0,1308,3,0,0,,359309
417,29.881138,N,C,22.3583,"Peter, Master. Michael J",1,1309,3,0,1,,2668


Parch、SibSp数据处理：SibSp表示船上兄弟姐妹和配偶的数量，Parch表示船上父母和子女的数量。

In [None]:
pd