## 集成模型（分类）

In [2]:
import pandas as pd

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

#人工选取pclass/age/sex作为判别乘客是否能够生还的特征
X=titanic[['pclass','age','sex']]
y=titanic['survived']

X['age'].fillna(X['age'].mean(),inplace=True)   #inplace=True不创建新的对象，直接对原始对象进行修改；

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 33)

#对类别特征进行转化，成为特征向量
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse = False)
X_train = vec.fit_transform(X_train.to_dict(orient = 'record'))
X_test = vec.transform(X_test.to_dict(orient = 'record'))

#使用单一决策树进行模型训练以及预测分析
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
dtc_y_pred = dtc.predict(X_test)

#使用随机森林分类器进行集成模型的训练以及预测分析
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
rfc_y_pred = rfc.predict(X_test)

#使用梯度提升决策树进行集成模型的训练以及预测分析
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)
gbc_y_pred = gbc.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [3]:
from sklearn.metrics import classification_report

print('The accuracy of dtc is',dtc.score(X_test,y_test))
print(classification_report(dtc_y_pred,y_test))

print('The accuracy of rfc is',rfc.score(X_test,y_test))
print(classification_report(rfc_y_pred,y_test))

print('The accuracy of gtb is',gbc.score(X_test,y_test))
print(classification_report(gbc_y_pred,y_test))

The accuracy of dtc is 0.7811550151975684
             precision    recall  f1-score   support

          0       0.91      0.78      0.84       236
          1       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329

The accuracy of rfc is 0.78419452887538
             precision    recall  f1-score   support

          0       0.91      0.78      0.84       235
          1       0.59      0.80      0.68        94

avg / total       0.82      0.78      0.79       329

The accuracy of gtb is 0.790273556231003
             precision    recall  f1-score   support

          0       0.92      0.78      0.84       239
          1       0.58      0.82      0.68        90

avg / total       0.83      0.79      0.80       329



In [None]:
#集成模型可以说是实战应用中最为常见的，相比于其他单一的学习模型，集成模型可以整合多种模型，或者多次就一种类型的模型进行建模。由于模型估计参数的过程同样受到概率的影响，具有一定的不确定性；因此，集成模型虽然在训练过程中要耗费更多的时间，但是得到的综合模型往往具有更高的表现性能和更好的稳定性。