In [34]:
import pandas as pd

### 获取数据

In [35]:
train_data = pd.read_csv("../../../datas/titanic/train.csv")
test_data = pd.read_csv("../../../datas/titanic/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 选取特征属性

In [36]:
x_train = train_data[["Pclass","Age","Sex"]]
y_train = train_data["Survived"]
x_test = test_data[["Pclass","Age","Sex"]]
y_test = pd.read_csv("../../../datas/titanic/gender_submission.csv")["Survived"]
x_train.head()

Unnamed: 0,Pclass,Age,Sex
0,3,22.0,male
1,1,38.0,female
2,3,26.0,female
3,1,35.0,female
4,3,35.0,male


### 数据处理

#### 缺失值处理

In [37]:
x_train["Age"].fillna(x_train["Age"].mean(),inplace=True)
x_test["Age"].fillna(x_test["Age"].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train["Age"].fillna(x_train["Age"].mean(),inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["Age"].fillna(x_test["Age"].mean(),inplace=True)


#### 转换为字典

In [39]:
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
x_train

[{'Pclass': 3, 'Age': 22.0, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 38.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 26.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 35.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 35.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 54.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 2.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 27.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 14.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 4.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 58.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 20.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 39.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 14.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 55.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 2.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 31.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 35.0, 'Sex': 'male'},
 {'Pclass': 2, 'Ag

#### 字典特征抽取

In [49]:
from sklearn.feature_extraction import DictVectorizer
transer = DictVectorizer(sparse = False) # 默认sparse = True 返回稀疏矩阵 
# 2. 调用fit_transform()
x_train = transer.fit_transform(x_train)
x_test = transer.transform(x_test)
print(transer.get_feature_names())
print(x_test)

['Age', 'Pclass', 'Sex=female', 'Sex=male']
[[34.5         3.          0.          1.        ]
 [47.          3.          1.          0.        ]
 [62.          2.          0.          1.        ]
 ...
 [38.5         3.          0.          1.        ]
 [30.27259036  3.          0.          1.        ]
 [30.27259036  3.          0.          1.        ]]




### 决策树预估

In [51]:
# 3. 使用决策树预估器
from sklearn.tree import DecisionTreeClassifier
# entropy 以信息增益为依划分据 默认为gini
# max_depth 树深度设置，使不会过拟合
estimator = DecisionTreeClassifier(criterion="entropy",max_depth=4) 
estimator.fit(x_train,y_train)
# 4. 模型评估
# 法一 直接比对真实值和预测值
y_predict =  estimator.predict(x_test)
print("y_predict:\n",y_predict)
print("直接比对真实值和预测值：\n",y_predict == y_test)

# 法二 计算准确率
score = estimator.score(x_test,y_test)
print("准确率：\n",score)

y_predict:
 [0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
直接比对真实值和预测值：
 0       True
1      False
2       True
3       True
4       True
       ...  
413     True
414     True
415     True
416     

### 随机森林
集成学习的一种方法  
集成学习：训练多个分类器，由多个分类器的众数决定  
随机：  
1. 训练集随机
    * bootstrap:随机有放回抽样
2. 特征随机
    * 从大M个特征中随机抽取m个特征  (M>>m)
    * 能实现降维
    * 能使好的结果脱颖而出

适用场景：
1. 在当前所有算法中具有极好的准确率
2. 能够有效运行在大数据集上，处理高维特征输入样本，而且不需要降维
3. 能评估各个特征在分类问题上的重要性

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [55]:
estimator2 = RandomForestClassifier()
# 5. 加入网格搜索和交叉验证
param_dict = {
    "n_estimators" : [120,200,300,500,800,1200],
    "max_depth" : [4,5,8,15,25,30]
}
estimator2 = GridSearchCV(estimator2,param_grid=param_dict,cv=10)  # cv: 交叉验证的轮数
estimator2.fit(x_train,y_train)

# 5. 模型评估
# 法一 直接比对真实值和预测值
y_predict =  estimator2.predict(x_test)
print("y_predict:\n",y_predict)
print("直接比对真实值和预测值：\n",y_predict == y_test)

# 法二 计算准确率
score = estimator2.score(x_test,y_test)
print("准确率：\n",score)

y_predict:
 [0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0
 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0
 0 1 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0
 0 0 0 0 1 0 0 1 0 0 0]
直接比对真实值和预测值：
 0       True
1      False
2      False
3      False
4       True
       ...  
413     True
414     True
415     True
416     