# 优化等级标注预测详细流程

In [1]:
# 引入工程所需要的库
from pyecharts import online
online() # 使用远程 jshost

In [2]:
import numpy as np
import pandas as pd 

In [3]:
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# 1. 数据总览

## 1.1 加载数据

In [4]:
# 原始数据路径
train_path = 'H://train_data.csv'
test_path = 'H://test_data.csv'
importance_feature_path = 'H://importance_feature_data.csv'
columns = pd.read_csv(importance_feature_path)

# 读取原始数据
# train_data-final数据
train_data = pd.read_csv(train_path)
train_data = train_data[list(columns['importance_feature'].values)+['hand']]
test_data = pd.read_csv(test_path)
test_data = test_data[columns['importance_feature'].values]

In [5]:
train_data.shape

(29710, 44)

In [6]:
test_data.shape

(1000000, 43)

观察前几行的源数据：

In [7]:
train_data.head()

Unnamed: 0,rankCount1,rankCount2,rankCount3,suitCount3,suitCount2,suitCount1,suitCount4,diff8_3,rankCount4,diff9_3,...,rank5,diff4_2,diff11_5,diff11_2,rank12,diff5_4,diff4_3,diff12_11,diff12_1,hand
0,5,0,0,5,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1,5,0,0,0,0,0,5,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2,5,0,0,0,5,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,5,0,0,0,0,5,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,5,0,0,0,0,5,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1


In [8]:
test_data.head()

Unnamed: 0,rankCount1,rankCount2,rankCount3,suitCount3,suitCount2,suitCount1,suitCount4,diff8_3,rankCount4,diff9_3,...,diff9_6,rank5,diff4_2,diff11_5,diff11_2,rank12,diff5_4,diff4_3,diff12_11,diff12_1
0,1,2,0,0,2,1,2,0,0,2,...,1,0,2,0,2,0,0,0,0,0
1,3,1,0,1,1,1,2,1,0,0,...,1,0,0,0,0,1,0,1,1,1
2,3,1,0,1,1,2,1,0,0,0,...,0,1,1,1,0,1,0,1,1,1
3,3,1,0,2,2,1,0,2,0,1,...,1,0,0,0,0,0,0,2,0,0
4,5,0,0,3,1,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0


数据信息总览：

In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29710 entries, 0 to 29709
Data columns (total 44 columns):
rankCount1    29710 non-null int64
rankCount2    29710 non-null int64
rankCount3    29710 non-null int64
suitCount3    29710 non-null int64
suitCount2    29710 non-null int64
suitCount1    29710 non-null int64
suitCount4    29710 non-null int64
diff8_3       29710 non-null int64
rankCount4    29710 non-null int64
diff9_3       29710 non-null int64
diff9_4       29710 non-null int64
diff9_2       29710 non-null int64
diff10_5      29710 non-null int64
diff9_8       29710 non-null int64
diff10_9      29710 non-null int64
diff8_4       29710 non-null int64
diff11_3      29710 non-null int64
diff8_2       29710 non-null int64
diff3_1       29710 non-null int64
diff3_2       29710 non-null int64
diff7_1       29710 non-null int64
diff13_8      29710 non-null int64
diff12_7      29710 non-null int64
diff11_6      29710 non-null int64
diff7_2       29710 non-null int64
diff6_1       29

In [10]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 43 columns):
rankCount1    1000000 non-null int64
rankCount2    1000000 non-null int64
rankCount3    1000000 non-null int64
suitCount3    1000000 non-null int64
suitCount2    1000000 non-null int64
suitCount1    1000000 non-null int64
suitCount4    1000000 non-null int64
diff8_3       1000000 non-null int64
rankCount4    1000000 non-null int64
diff9_3       1000000 non-null int64
diff9_4       1000000 non-null int64
diff9_2       1000000 non-null int64
diff10_5      1000000 non-null int64
diff9_8       1000000 non-null int64
diff10_9      1000000 non-null int64
diff8_4       1000000 non-null int64
diff11_3      1000000 non-null int64
diff8_2       1000000 non-null int64
diff3_1       1000000 non-null int64
diff3_2       1000000 non-null int64
diff7_1       1000000 non-null int64
diff13_8      1000000 non-null int64
diff12_7      1000000 non-null int64
diff11_6      1000000 non-null int64


从上面的数据我们可以看出training-final（train_data）有11列，共6000条数据，无缺失数据；Semifinal-testing-final（test_data）有10列，共1000000条数据，无缺失数据。

In [11]:
train_data_X = train_data.drop(['hand'],axis=1)
train_data_Y = train_data.hand
test_data_X = test_data


# 2. 模型的融合及预测

## 2.1 模型融合

常见的模型融合方法有：Bagging、Boosting、Stacking、Blending。

### 2.1.1 Bagging

Bagging将多个模型，也就是基学习器的预测结果进行简单的加权平均或者投票。它的好处是可以并行地训练基学习器。Random Forest就用到了Bagging的思想。

### 2.1.2 Boosting

Boosting的思想有点像知错能改，每个基学习器是在上一个基学习器学习的基础上，对上一个基学习器的错误进行弥补。我们将会用到的AdaBoost，Gradient Boost就用到了这种思想。

### 2.1.3 Stacking

Stacking是用新的次学习器去学习如何组合上一层的基学习器。如果把Bagging看作是多个基分类器的线性组合，那么Stacking就是多个基分类器的非线性组合。Stacking可以将学习器一层一层地堆砌起来，形成一个网状的结构。
相比来说Stacking的融合框架相对前面二者来说在精度上确实有一定的提升，所以在下面的模型融合上，我们也使用Stacking方法。

### 2.1.4 Blending

Blending和Stacking很相似，但同时它可以防止信息泄露的问题。

Stacking框架融合：这里我们使用了两层的模型融合

Level 1使用了：Random Forest、AdaBoost、ExtraTrees、GBDT、Decision Tree、KNN、SVM，一共7个模型

Level 2使用了XGBoost，使用第一层预测的结果作为特征对最终的结果进行预测。

Level 1：

Stacking框架是堆叠使用基础分类器的预测作为对二级模型的训练的输入。然而，我们不能简单地在全部训练数据上训练基本模型，产生预测，输出用于第二层的训练。如果我们在Train Data上训练，然后在Train Data上预测，就会造成标签。为了避免标签，我们需要对每个基学习器使用K-fold，将Kge模型对Valid Set的预测结果拼起来，作为下一层学习器的输入。

所以这里我们建立输出fold预测方法：

In [12]:
from sklearn.model_selection import KFold
 
# Some useful parameters which will come in handy later on
ntrain = train_data_X.shape[0]
ntest = test_data_X.shape[0]
SEED = 0 #for reproducibility
NFOLDS = 7 # set folds for out-of-fold prediction
kf = KFold(n_splits = NFOLDS,random_state=SEED,shuffle=False)
 
def get_out_fold(clf,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i, (train_index,test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr,y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i,:] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)

构建不同的基学习器，这里我们使用了RandomForest、AdaBoost、ExtraTrees、DecisionTree、SVM七个基学习器：（这里的模型可以使用如上面的GridSearch方法对模型的超参数进行搜索选择

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50, 50, 15), solver='adam', activation='tanh', alpha = 0.0001, learning_rate_init=0.001, max_iter=400)

rf = RandomForestClassifier(max_depth=32,min_samples_split=3,min_samples_leaf=2,verbose=0)

et = ExtraTreesClassifier(n_estimators=150,max_depth=15,min_samples_leaf=2,verbose=0)

dt = DecisionTreeClassifier(max_depth=17)

svm = SVC(kernel='rbf', gamma=0.1, C=1)


将pandas转换为arrays

In [14]:
# Create Numpy arrays of train,test and target(Survived) dataframes to feed into our models
x_train = train_data_X.values   #Creates an array of the train data
x_test =  test_data_X.values   #Creates an array of the test data
y_train = train_data_Y

In [15]:
# Create our OOF train and test predictions.These base result will be used as new featurs
mlp_oof_train,mlp_oof_test = get_out_fold(mlp,x_train,y_train,x_test)  # MLP
rf_oof_train,rf_oof_test = get_out_fold(rf,x_train,y_train,x_test)  # Random Forest
et_oof_train,et_oof_test = get_out_fold(et,x_train,y_train,x_test)  # Extra Trees
dt_oof_train,dt_oof_test = get_out_fold(dt,x_train,y_train,x_test)  #Decision Tree
svm_oof_train,svm_oof_test = get_out_fold(svm,x_train,y_train,x_test)  # Support Vector
 
print("Training is complete")


Training is complete


## 2.2 预测并生成提交文件

Level 2： 我们利用XGBoost，使用第一层预测的结果作为特征对最终的结果进行预测。

In [16]:
x_train = np.concatenate((mlp_oof_train,rf_oof_train,et_oof_train,dt_oof_train,svm_oof_train),axis=1)
x_test =np.concatenate((mlp_oof_test,rf_oof_test,et_oof_test,dt_oof_test,svm_oof_test),axis=1)
 

In [25]:
from xgboost import XGBClassifier
 
gbm = XGBClassifier(objective='multi:softmax ', num_class='multi:softprob')
gbm.fit(x_train,y_train)
predictions = gbm.predict(x_test)

  if diff:


In [26]:
result = pd.DataFrame(predictions)

In [27]:
result

Unnamed: 0,0
0,4
1,8
2,8
3,8
4,0
5,0
6,0
7,8
8,0
9,8


In [28]:
result_1 = result[0].apply(int)

In [29]:
result_2 = pd.DataFrame(result_1)

In [30]:
# 将预测的结果写入txt文件当中
result_2.to_csv('H://data.txt', index=False, header=False)