# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 16 21:25:20 2020

@author: wanghaochen
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb  # 使用XGBoost特征选择及处理多分类
from sklearn.decomposition import PCA
from xgboost import plot_importance
from sklearn.svm import SVC  # 使用SVM处理多分类
from sklearn.neighbors import KNeighborsClassifier  # 使用K-近邻分类处理多分类
from sklearn.linear_model import LogisticRegression  # 使用逻辑回归处理多分类
from sklearn.linear_model import SGDClassifier  # 使用随机梯度下降处理多分类
from sklearn.tree import DecisionTreeClassifier  # 使用决策树处理多分类

from sklearn.preprocessing import MinMaxScaler, StandardScaler  # 归一化，标准化

# Directory that holds the input workbook, relative to the working directory.
path = './Dataset/'

# Read the 'All' sheet, then separate the target column from the candidate
# features (every column from the third one onwards).
Indicators_all = pd.read_excel(io=path + 'Indicators_all.xlsx', sheet_name='All')
Result = Indicators_all.loc[:, ['Result']]  # kept as a one-column DataFrame
Indicators = Indicators_all.iloc[:, 2:]

# Use XGBoost to pick the features that have the largest influence on the
# match result. The booster is trained through the native xgb.train API, so
# the number of trees is set by num_boost_round; the scikit-learn wrapper key
# 'n_estimators' has no effect there and is deliberately not part of params.
params = {'max_depth': 7, 'learning_rate': 0.01, 'nthread': 4,
          'subsample': 1.0, 'colsample_bytree': 0.5, 'min_child_weight': 3, 'seed': 1301}
xgtrain = xgb.DMatrix(Indicators, label=Result)
model = xgb.train(params, xgtrain, num_boost_round=100)

# Importance types exposed by Booster.get_score:
# * "weight" is the number of times a feature appears in a tree
# * "gain" is the average gain of splits which use the feature
# * "cover" is the average coverage of splits which use the feature

# Plot the importance ranking of the features.
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, title='Feature Importance', ax=ax,
                max_num_features=Indicators.shape[1])
plt.show()

# Rank the features by their "weight" importance, highest first.
# NOTE(review): per the XGBoost documentation, features that are never used
# in a split are omitted from get_score, so the ranking (and the 40% cut
# below) is taken over the features that were actually used - confirm this is
# the intended base.
importance = model.get_score(importance_type='weight')
score = pd.DataFrame({'Feature': list(importance),
                      'Score': list(importance.values())})
score = score.sort_values(by='Score', ascending=False)
score_var = list(score['Feature'])
score_val = list(score['Score'])

# Keep the top 40% of the ranked features and drop the remaining 60%.
_n = int(0.4 * len(score_var))
selected_var = score_var[:_n]
Indicators_selected = Indicators[selected_var]
# Reduce the data to two principal components with PCA.
pca = PCA(n_components=2)

# Fit the same PCA object twice and print the summed explained-variance ratio
# each time: first on the full feature set (for comparison only), then on the
# XGBoost-selected features. The projection from the last pass is the one
# that is kept.
for _features in (Indicators, Indicators_selected):
    Indicators_reduced = pca.fit_transform(_features)
    print(sum(pca.explained_variance_ratio_))

# pca.components_ holds the per-feature weights of each component, if needed.
Indicators_reduced = pd.DataFrame(data=Indicators_reduced,
                                  columns=['comp_0', 'comp_1'])

# Split point for a 0.7 : 0.3 train/test partition. Rows are split in their
# existing order (no shuffling). It is computed first because every slice
# below depends on it.
train_num = int(0.7 * Indicators_reduced.shape[0])

# Standardise both components (zero mean, unit variance per column) in one
# call, so that `scaler` ends up fitted on the full two-column layout.
# NOTE(review): the scaler is fitted on all rows, test rows included; fit it
# on the training rows only if test-set leakage matters for the evaluation.
scaler = StandardScaler()
Indicators_reduced_scaled = pd.DataFrame(
    scaler.fit_transform(Indicators_reduced),
    columns=Indicators_reduced.columns,
    index=Indicators_reduced.index,
)
Indicator_train_scaled = Indicators_reduced_scaled.iloc[:train_num, :]
Indicator_test_scaled = Indicators_reduced_scaled.iloc[train_num:, :]

# Unscaled training and test sets, using the same split point.
Indicator_train = Indicators_reduced.iloc[:train_num, :]
Result_train = Result.iloc[:train_num, :]

Indicator_test = Indicators_reduced.iloc[train_num:, :]
Result_test = Result.iloc[train_num:, :]
# Support vector machine prediction (RBF kernel) on the unscaled components.
clf_svm = SVC(kernel='rbf', gamma=0.05, C=1, probability=False, tol=0.0001,
              decision_function_shape='ovo', max_iter=-1, random_state=None)
# Labels are passed as a 1-D vector; a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning.
clf_svm.fit(Indicator_train, Result_train['Result'].values.ravel())
pred_clf_svm_train = clf_svm.predict(Indicator_train)
pred_clf_svm_test = clf_svm.predict(Indicator_test)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_svm_train = np.mean(
    np.asarray(pred_clf_svm_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_svm_test = np.mean(
    np.asarray(pred_clf_svm_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)
# XGBoost prediction (native API, multi-class softmax).
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class problem; predict() returns class labels
    'num_class': 3,  # number of classes, required by multi:softmax
    'gamma': 0.3,  # controls post-pruning; larger is more conservative (typically 0.1-0.2)
    'max_depth': 10,  # tree depth; larger values overfit more easily
    'lambda': 1,  # L2 regularisation on weights; larger values reduce overfitting
    'subsample': 0.7,  # row subsampling ratio of the training samples
    'colsample_bytree': 0.7,  # column subsampling ratio when building each tree
    'min_child_weight': 2,
    'verbosity': 0,  # 0 = silent; replaces the deprecated 'silent' flag
    'eta': 0.6,  # acts like a learning rate
    'seed': 1,
    'nthread': 1,  # number of CPU threads
}
dtrain = xgb.DMatrix(Indicator_train, label=Result_train)
dtest = xgb.DMatrix(Indicator_test)  # labels are not needed for prediction
num_rounds = 500
clf_xgb = xgb.train(params, dtrain, num_rounds)

pred_clf_xgb_train = clf_xgb.predict(dtrain)
pred_clf_xgb_test = clf_xgb.predict(dtest)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_xgb_train = np.mean(
    np.asarray(pred_clf_xgb_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_xgb_test = np.mean(
    np.asarray(pred_clf_xgb_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)
# k-nearest-neighbours prediction on the unscaled components.
# NOTE(review): with n_neighbors=1 every training row is its own nearest
# neighbour, so the training accuracy is expected to be close to 1 and says
# little about generalisation.
clf_knc = KNeighborsClassifier(n_neighbors=1)
# Labels are passed as a 1-D vector; a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning.
clf_knc.fit(Indicator_train, Result_train['Result'].values.ravel())

pred_clf_knc_train = clf_knc.predict(Indicator_train)
pred_clf_knc_test = clf_knc.predict(Indicator_test)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_knc_train = np.mean(
    np.asarray(pred_clf_knc_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_knc_test = np.mean(
    np.asarray(pred_clf_knc_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)

# Logistic-regression prediction on the standardised components.
# (Author's note: this model was observed to perform very poorly.)
clf_lor = LogisticRegression(penalty='l2', C=10, multi_class='multinomial', solver='saga')
# Labels are passed as a 1-D vector; a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning.
clf_lor.fit(Indicator_train_scaled, Result_train['Result'].values.ravel())

pred_clf_lor_train = clf_lor.predict(Indicator_train_scaled)
pred_clf_lor_test = clf_lor.predict(Indicator_test_scaled)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_lor_train = np.mean(
    np.asarray(pred_clf_lor_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_lor_test = np.mean(
    np.asarray(pred_clf_lor_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)

# Stochastic-gradient-descent classifier on the standardised components.
# NOTE(review): an earlier remark here said the training fit was poor but the
# test prediction fairly good; that was observed while predictions were made
# on unscaled inputs, so it should be re-checked.
clf_sgdv = SGDClassifier(max_iter=1000, random_state=100)
# Labels are passed as a 1-D vector; a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning.
clf_sgdv.fit(Indicator_train_scaled, Result_train['Result'].values.ravel())

# Predict on the same standardised features the model was fitted on; feeding
# the unscaled components would put the inputs on a different scale than the
# learned coefficients.
pred_clf_sgdv_train = clf_sgdv.predict(Indicator_train_scaled)
pred_clf_sgdv_test = clf_sgdv.predict(Indicator_test_scaled)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_sgdv_train = np.mean(
    np.asarray(pred_clf_sgdv_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_sgdv_test = np.mean(
    np.asarray(pred_clf_sgdv_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)
# Decision-tree prediction on the standardised components.
clf_tree = DecisionTreeClassifier()  # criterion='entropy' is an alternative
# Labels are passed as a 1-D vector, the shape scikit-learn expects for y.
clf_tree.fit(Indicator_train_scaled, Result_train['Result'].values.ravel())

# Predict with the tree itself (not the SGD classifier) and on the same
# standardised features it was fitted on, so the reported accuracies really
# belong to this model.
pred_clf_tree_train = clf_tree.predict(Indicator_train_scaled)
pred_clf_tree_test = clf_tree.predict(Indicator_test_scaled)

# Accuracy = fraction of predictions equal to the true label. Both sides are
# compared as floats; np.asarray(..., dtype=float) is used because
# np.asfarray no longer exists in NumPy 2.0.
accuracy_tree_train = np.mean(
    np.asarray(pred_clf_tree_train, dtype=float)
    == np.asarray(Result_train['Result'], dtype=float)
)
accuracy_tree_test = np.mean(
    np.asarray(pred_clf_tree_test, dtype=float)
    == np.asarray(Result_test['Result'], dtype=float)
)
