In [1]:
# import packages

# data processing
import pandas as pd
import numpy as np
from datetime import timedelta, datetime


import re

# data visualization
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Layout
from plotly import offline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (20, 10)

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号

# change text color
import colorama
from colorama import Fore, Style

# IPython
from IPython.display import IFrame

from sklearn.feature_selection import mutual_info_classif

%matplotlib inline

<div class="alert alert-block alert-warning">
<center><b>【第六阶段】项目日志</b></center>
    
    
第六阶段：【数据建模全解析】
   
第六阶段核心目的：全方面掌握数据建模的实用手段和适用场景（线性和树型）
    
难度（最高5星）：⭐⭐⭐⭐

Good Luck!
</div>

对于不平衡数据，我们不能用传统的accuracy来衡量模型的好坏，本项目给出了一个归一基尼系数，作为本项目的唯一指标。

第一阶段已经讲过了！


参考：
- https://www.kaggle.com/batzner/gini-coefficient-an-intuitive-explanation
- https://stats.stackexchange.com/questions/306287/why-use-normalized-gini-score-instead-of-auc-as-evaluation

## 评估指标

<div class="alert alert-block alert-success"><b>Step 1</b>: 
    
完成下面题目：

1. normalized gini coefficient 的应用场景
2. 请写出该指标的算法实现（写个函数`gini`）
    
</div>

In [2]:
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient of a binary ranking (equal to 2 * AUC - 1).

    Samples are ranked by ``y_prob``. For every positive sample we count the
    negatives ranked above it; the total number of such mis-ordered
    (positive, negative) pairs, divided by the number of all such pairs, gives
    the fraction of discordant pairs ``d`` and the score is ``1 - 2 * d``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_prob : array-like of scores, same length as ``y_true`` (higher means
        "more likely positive")

    Returns
    -------
    float
        1.0 for a perfect ranking, -1.0 for a perfectly inverted one.
        ``nan`` when the score is undefined, i.e. the input is empty or
        contains a single class.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` differ in length.
    """
    # float64 keeps the pair counts exact; float32 labels (e.g. coming from
    # an XGBoost DMatrix) would otherwise lose precision on large folds.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_prob = np.asarray(y_prob)
    n = y_true.shape[0]
    if y_prob.shape[0] != n:
        raise ValueError("y_true and y_prob must have the same length")

    # Highest score first: the reverse of the ascending argsort order.
    ordered = y_true[np.argsort(y_prob)][::-1]

    ntrue = ordered.sum()
    n_pairs = ntrue * (n - ntrue)
    if n_pairs == 0:
        # No (positive, negative) pair exists, so the ranking cannot be scored.
        return float('nan')

    negatives = 1 - ordered
    # Exclusive running count: negatives ranked strictly above each sample.
    negatives_above = np.cumsum(negatives) - negatives
    discordant = np.dot(ordered, negatives_above)
    return 1 - 2 * discordant / n_pairs

def gini_xgb(preds, dtrain):
    """Custom evaluation metric for XGBoost based on the normalized Gini.

    Reads the true labels from ``dtrain`` via ``get_label()``, scores
    ``preds`` against them with ``eval_gini`` and returns the NEGATED score
    under the metric name ``'gini'``.
    NOTE(review): the negation presumably exists so that a minimizing
    early-stopping criterion prefers a higher Gini; confirm against the
    XGBoost version in use.
    """
    negated_gini = -eval_gini(dtrain.get_label(), preds)
    return [('gini', negated_gini)]

## 过拟合和欠拟合

对于机器学习算法来说，我们不仅要求它对训练数据集有很好的拟合（训练误差），同时也希望它可以对未知数据集（测试集）有很好的拟合结果（泛化能力），所产生的测试误差被称为泛化误差。

> 利用模型对数据进行拟合，学习的目的并非是对有限训练集进行正确预测，而是对未曾在训练集合出现的样本能够正确预测。

那么如何衡量泛化能力的好坏呢？

也就是模型的过拟合（overfitting）和欠拟合（underfitting）。

![](https://miro.medium.com/max/1125/1*_7OPgojau8hkiPUiHoGK_w.png)

建议观看：https://www.youtube.com/watch?v=EuBBz3bI-aA

<div class="alert alert-block alert-success"><b>Step 2</b>: 
    
完成下面题目：

1. 过拟合欠拟合出现的原因
2. 怎么处理过拟合和欠拟合？
  
</div>

**欠拟合出现的原因**

- 模型复杂度过低
- 特征量过少

可以尝试：
1. 增加新特征（特征组合、特征交互）enhancement的过程
2. 增加数据量
3. 减少正则化参数，简单来说，正则化参数是避免过拟合的
4. 尝试用非线性模型提高模型的复杂度
5. 增加模型训练时间，调整cost function的tolerance

**过拟合出现的原因**

- 样本太少
- 噪音干扰大
- 参数过多，模型过于复杂
- 假设条件不成立

可以尝试：
1. 加正则项
2. 增加数据量
3. Dropout（神经网络也适用于bagging）
4. Early stopping
5. Pruning(树形）
6. cross validation

## 交叉验证

交叉验证（Cross validation)，用于防止模型过于复杂而引起的**过拟合**。

交叉验证是在机器学习建立模型和验证模型参数时常用的办法。交叉验证，顾名思义，就是**重复的使用数据**，把得到的样本数据进行切分，组合为不同的训练集和测试集，用训练集来训练模型，用测试集来评估模型预测的好坏。在此基础上可以得到多组不同的训练集和测试集，某次训练集中的某样本在下次可能成为测试集中的样本，即所谓“交叉”。　

那么什么时候才需要交叉验证呢？

交叉验证用在**数据不是很充足的时候**。比如在我日常项目里面，对于普通适中问题，如果数据样本量小于一万条，我们就会采用交叉验证来训练优化选择模型。如果样本大于一万条的话，我们一般随机的把数据分成三份，一份为训练集（Training Set），一份为验证集（Validation Set），最后一份为测试集（Test Set）。用训练集来训练模型，用验证集来评估模型预测的好坏和选择模型及其对应的参数。把最终得到的模型再用于测试集，最终决定使用哪个模型以及对应参数。

![d9812a29f03c28c3b87694aa0cad613.jpg](https://img.wang.232232.xyz/img/2022/06/18/d9812a29f03c28c3b87694aa0cad613.jpg)

交叉验证：
https://www.youtube.com/watch?v=fSytzGwwBVw&ab_channel=StatQuestwithJoshStarmer

<div class="alert alert-block alert-success"><b>Step 2</b>: 
    
完成下面题目：

1. 交叉验证的用途有哪些？
2. 交叉验证的方法有哪些?
3. 时间序列的交叉验证方法和普通数据的区别
4. 验证集和测试集的区别
5. 完善下列代码
  
</div>

交叉验证的用途有哪些？

- 模型选择，也可以称为超参数选择
- 模型评估，用交叉验证的方法来对模型的performance进行评估

交叉验证的方法有哪些?


- 第一种是简单交叉验证

> 所谓的简单，是和其他交叉验证方法相对而言的。首先，我们随机的将样本数据分为两部分（比如： 70%的训练集，30%的测试集），然后用训练集来训练模型，在测试集上验证模型及参数。接着，我们再把样本打乱，重新选择训练集和测试集，继续训练数据和检验模型。最后我们选择损失函数评估最优的模型和参数。　

- 第二种是S折交叉验证（S-Fold Cross Validation）

> 和第一种方法不同，S折交叉验证会把样本数据随机的分成S份，每次随机的选择S-1份作为训练集，剩下的1份做测试集。当这一轮完成后，重新随机选择S-1份来训练数据。若干轮（小于S）之后，选择损失函数评估最优的模型和参数。

- 第三种是留一交叉验证（Leave-one-out Cross Validation）

> 它是第二种情况的特例，此时S等于样本数N，这样对于N个样本，每次选择N-1个样本来训练数据，留一个样本来验证模型预测的好坏。此方法主要用于样本量非常少的情况，比如对于普通适中问题，N小于50时，我一般采用留一交叉验证。
    
    
时间序列的交叉验证方法和普通数据的区别

在处理时间序列，我们不能选择随机样本并将它们分配给测试集或训练集，因为使用来自未来的值来预测过去的值是没有意义的。时间序列数据之间存在依赖性。

`from sklearn.model_selection import TimeSeriesSplit`

![](https://miro.medium.com/max/1204/1*qvdnPF8ETV9mFdMT0Y_BBA.png)
    
验证集和测试集的区别

- 验证集是在训练过程中用于检验模型的训练情况，从而确定合适的超参数；
- 测试集是在训练结束之后，测试模型的泛化能力。

In [3]:
from sklearn.model_selection import KFold

# Number of cross-validation folds; XGB_gini also divides the accumulated
# test-set predictions by this value to average them.
K = 10
# Shuffled K-fold splitter with a fixed random_state so fold membership is
# the same on every run.
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
# Seed NumPy's global random number generator.
np.random.seed(1996)

## 线性算法


![](https://saedsayad.com/images/LogReg_1.png)

逻辑回归是什么？为什么要引入？

> 一句话概括：逻辑回归假设数据服从伯努利分布，通过极大似然化函数的方法，运用梯度下降来求解参数，从而达到二分类的目的。

首先逻辑回归虽名为回归，但并不是一个**回归方法**，它主要用来解决二分类问题，面对分类问题，线性回归显得无能为力，从下图可以看到，线性回归在对作为离散变量存在的y时，只能用直线进行拟合，并且所预测的y可能会超过0到1的范围，并且对异常值非常敏感。

那么我们想引入非线性元素，来解决以上两个问题，我们希望输出的值$p(y=1|X;\theta)$保证在 0 到 1 之间，并且对异常值不再敏感。


![](https://static.javatpoint.com/tutorial/machine-learning/images/linear-regression-vs-logistic-regression.png)

参考：https://zg104.github.io/Logistic_regression

<div class="alert alert-block alert-success"><b>Step 2</b>: 
    
完成下面题目：

1. Why we use logistic regression, not linear regression? What are the disadvantages of linear regression for classification?

2. What type of datasets is most suited for logistic regression?

3. Can you explain or interpret the hypothesis output of logistic regression?

4. Why we define the sigmoid function, create a new version of cost function, and applied MLE to derive logistic regression?

5. What are the disadvantage of logistic regression?

6. How to deal with overfitting?
  
</div>

**Why we use logistic regression, not linear regression? What are the disadvantages of linear regression for classification?**

- Linear regression can give us the values which are not between 0 and 1.

- Also, linear regression is sensitive to the outliers. However, the sigmoid function restricts the values to between 0 and 1, which can be interpreted as the conditional probability of assigning the data to a particular class given the data, parametrized by theta.

![](https://static.javatpoint.com/tutorial/machine-learning/images/linear-regression-vs-logistic-regression.png)

**What type of datasets is most suited for logistic regression?**

- Logistic regression likes overlapping data, instead of well separated data.
- Linear Discriminant Analysis will perform better for well separated data since the decision boundary is linear.

**Can you explain or interpret the hypothesis output of logistic regression?**

- We try to set a threshold to determine which class each data point should be assigned based on the conditional probability (I have clarified in Q1) derived from the sigmoid function.
- Typically, we set the threshold to be 0.5. However, it can be adjusted between 0 and 1 for personal specification, such as restriction on TPR (True Positive Rate).


**Why we define the sigmoid function, create a new version of cost function, and applied MLE to derive logistic regression?**

- Sigmoid function helps transform the linear estimation into a non-linear one.

- If we reused the mean squared error from linear regression as the cost function, composing it with the sigmoid would make the cost non-convex in theta, so gradient descent could get stuck in a local minimum. The log-loss (cross-entropy) cost is convex for logistic regression, which is why we define a new cost function. It still has no closed-form minimizer, so we minimize it iteratively with gradient descent.

- You might wonder why the cost function of logistic regression is like this! That is because we applied the MLE to maximize the probability to make the model the most plausible for all data points. You always minimize the loss function, which is just the negative form of the loglikelihood after MLE.

**How to deal with overfitting?**

- It can be pretty easy for every machine learning method to be overfitting. It is not a big deal!

- A regularization term is added to the cost function where the first part is loss function, and the second is the penalty term.

![](https://miro.medium.com/max/3232/1*vwhvjVQiEgLcssUPX6vxig.png)

**What are the disadvantage of logistic regression?**

- You should use k-fold cross validation to determine the highest polynomial of the features if the decision boundary is non-linear. It can be easy for this to overfit.

- Logistic regression is unstable when dealing with well separated datasets.

- Logistic regression requires relatively large datasets for training.

- Logistic regression is not that popular for multiclassification problems. Sigmoid function should be upgraded to Softmax function (You may hear about it if you know about Neural Networks).

In [4]:
# Load the processed training and test tables; the first CSV column becomes
# the row index (XGB_gini expects final_train to contain a 'target' column).
final_train = pd.read_csv('final_train.csv',index_col=0)
final_test = pd.read_csv('final_test.csv',index_col=0)

<span id="5"></span>In order to get results between 0 and 1, a function, which is called **sigmoid**, is used to transform our hypothesis function. It is defined as
$$ $$
$$h_{\theta}(x) = g(\theta^{T} x)$$ 
$$ $$
where $h_{\theta}(x)$ is the hypothesis function, $x$ is a single record and 
$$ $$
$$g(z)=\dfrac{1}{1+e^{-z}}$$
$$ $$
By using $g(\theta^{T} x)$, we obtain the probability and if $h_{\theta}(x) \geq 0.5$, we get $y=1$; if $h_{\theta}(x) < 0.5$, we get $y=0$. Further, when $z \geq 0$, $g(z) \geq 0.5$ is another detail. Thus, if the $\theta^{T} x \geq 0$, then $y=1$.
 
By the definition, I defined the below ***sigmoid*** function.<span id="5"></span>

We can't use the same cost function that we use for linear regression because the Logistic Function will cause the output to be wavy, causing many local optima. In other words, it will not be a convex function. That's why we need to define a different cost function for logistic regression. It is simply defined as
$$ $$
$$J(\theta) = \dfrac{1}{m} \sum^{m}_{i=1}Cost(h_{\theta}(x^{(i)}), y^{(i)})$$ 
$$ $$
where 
$$ $$
$$Cost(h_{\theta}(x^{(i)}), y^{(i)})=-y^{(i)} \; log(h_{\theta}(x^{(i)}))-(1-y^{(i)}) \; log(1-h_{\theta}(x^{(i)}))$$
$$ $$
As the sanity check, $J(\theta)$ can be plotted or printed as a function of the number of iterations to be sure that $J(\theta)$ is **decreasing on every iteration**, which shows that it is converging correctly. At this point, the choice of $\alpha$ is important: if $\alpha$ is too large, the updates can overshoot and fail to converge; if it is too small, convergence becomes very slow.<span id="6"></span>



In [5]:
# prepare the data

# 1. 写出Sigmoid function

def sigmoid(z):
    """Logistic function g(z) = 1 / (1 + e^(-z)), applied element-wise.

    Maps any real input into the open interval (0, 1); g(0) is exactly 0.5.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator
    
# 2. loss function 
def loss(h, y, eps=1e-15):
    """Mean binary cross-entropy (log-loss) between predictions and labels.

    Parameters
    ----------
    h : predicted probabilities, i.e. the output of ``sigmoid``
    y : true labels (0 or 1), same shape as ``h``
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking the
        logarithm. Without this, a prediction of exactly 0 or 1 yields
        ``log(0) = -inf`` and the mean becomes ``inf`` or ``nan``.

    Returns
    -------
    float
        ``mean(-y * log(h) - (1 - y) * log(1 - h))``.
    """
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

## 树型算法

XGB视频：https://www.youtube.com/watch?v=GrJP9FLV3FE

什么是XGB?

XGBoost是陈天奇等人开发的一个开源机器学习项目，高效地实现了GBDT算法并进行了算法和工程上的许多改进，堪称表格型数据大杀器！

那么说到XGB不得不说一下gbdt。

GBDT(Gradient Boosting Decision Tree)，全名叫梯度提升决策树，使用的是Boosting的思想。

那么啥是boosting思想？那就得先说说bagging（集成）

如果你对随机森林有点了解，你应该知道它是个集成算法，也就是【人多力量大】的思想，也叫bagging。

Bagging 的思路是所有基础模型都一致对待，每个基础模型手里都只有一票。然后使用民主投票的方式得到最终的结果。

大部分情况下，经过 bagging 得到的结果方差（variance）更小。

具体过程：

1. 从原始样本集中抽取训练集。每轮从原始样本集中使用Bootstraping的方法抽取n个训练样本（在训练集中，有些样本可能被多次抽取到，而有些样本可能一次都没有被抽中）。共进行k轮抽取，得到k个训练集。（k个训练集之间是相互独立的）
2. 每次使用一个训练集得到一个模型，k个训练集共得到k个模型。（注：这里并没有具体的分类算法或回归方法，我们可以根据具体问题采用不同的分类或回归方法，如决策树、感知器等）
3. 对分类问题：将上步得到的k个模型采用投票的方式得到分类结果；对回归问题，计算上述模型的均值作为最后的结果。（所有模型的重要性相同）

ok所以你应该了解什么是bagging了，那么boosting的思想是什么？

那就是【精英筛选】

Boosting 和 bagging 最本质的差别在于他对基础模型不是一致对待的，而是经过不停的考验和筛选来挑选出「精英」，然后给精英更多的投票权，表现不好的基础模型则给较少的投票权，然后综合所有人的投票得到最终结果。

大部分情况下，经过 boosting 得到的结果偏差（bias）更小。

具体过程：

1. 通过加法模型将基础模型进行线性的组合。
2. 每一轮训练都提升那些错误率小的基础模型权重，同时减小错误率高的模型权重。
3. 在每一轮改变训练数据的权值或概率分布，通过提高那些在前一轮被弱分类器分错样例的权值，减小前一轮分对样例的权值，来使得分类器对误分的数据有较好的效果。

---

那么再回到gbdt来，原理很简单，就是所有弱分类器的结果相加等于预测值，然后下一个弱分类器去拟合误差函数对预测值的残差。

举一个非常简单的例子，比如我今年30岁了，但计算机或者模型GBDT并不知道我今年多少岁，那GBDT咋办呢？

1. 它会在第一个弱分类器（或第一棵树中）随便用一个年龄比如20岁来拟合，然后发现误差有10岁；
2. 接下来在第二棵树中，用6岁去拟合剩下的损失，发现差距还有4岁；
3. 接着在第三棵树中用3岁拟合剩下的差距，发现差距只有1岁了；
4. 最后在第四棵树中用1岁拟合剩下的残差，完美。
5. 最终，四棵树的结论加起来，就是真实年龄30岁（实际工程中，gbdt是计算负梯度，用负梯度近似残差）。

> 学习并不断改正错误，直至残差小于阈值。

那么XGB其实是gbdt的优化版本，XGBoost还在工程实现上做了大量的优化。总的来说，两者之间的区别和联系可以总结成以下几个方面。

1. GBDT是机器学习算法，XGBoost是该算法的工程实现。
2. XGBoost显式地加入了正则项来控制模型的复杂度，有利于防止过拟合，从而提高模型的泛化能力。
3. GBDT在模型训练时只使用了代价函数的一阶导数信息，XGBoost对代价函数进行二阶泰勒展开，可以同时使用一阶和二阶导数。
4. XGBoost支持多种类型的基分类器
5. 传统的GBDT在每轮迭代时使用全部的数据，XGBoost则采用了与随机森林相似的策略，支持对数据进行采样（相当于dropout）避免过拟合
6. 传统的GBDT没有设计对缺失值进行处理，XGBoost能够自动学习出缺失值的处理策略。
7. xgboost工具支持特征颗粒度下的并行

> 决策树的学习最耗时的一个步骤就是对特征的值进行排序（因为要确定最佳分割点），xgboost在训练之前，预先对数据进行了排序，然后保存为block结构，后面的迭代中重复地使用这个结构，大大减小计算量。
>
>这个block结构也使得并行成为了可能，在进行节点的分裂时，需要计算每个特征的增益，最终选增益最大的那个特征去做分裂，那么各个特征的增益计算就可以开多线程进行。树节点在进行分裂时，我们需要计算每个特征的每个分割点对应的增益，即用贪心法枚举所有可能的分割点。
>
>当数据无法一次载入内存或者在分布式情况下，贪心算法效率就会变得很低，所以xgboost还提出了一种可并行的近似直方图算法，用于高效地生成候选的分割点。

![](https://dzone.com/storage/temp/13069535-xgboost-features.png)

### 参数调节

- n_estimators: The number of trees or rounds. Adding more trees will be at the risk of overfitting. The reason is in the way that the boosted tree model is constructed, sequentially where each new tree attempts to model and correct for the errors made by the sequence of previous trees. Quickly, the model reaches a point of diminishing returns.

- max_depth: The maximum depth of a tree. It is also used to control overfitting as higher depth will allow model to learn relations very specific to a particular sample. Typically, it should be chosen from 3 to 10 and tuned using CV.

- objective: The loss function to be minimized. binary:logistic is for binary classification, which will return predicted probability (NOT CLASS).

- learning_rate: The convergence control parameter in gradient descent. It is intuitive that XGB will not reach its minimum if both n_estimators and learning_rate are very small.

- subsample: The fraction of observations to be randomly chosen for each tree. Lower values make the algorithm more conservative and prevents overfitting, but too small values might lead to underfitting. So, be careful to choose and the typical values are between 0.5 and 1.

- min_child_weight: The minimum sum of weights all observations required in child. It is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees, but again, more likely to overfit.

- colsample_bytree: The fraction of features to use. By default it is set to 1 meaning that we will use all features. But in order to avoid the number of highly correlated trees is getting too big, we would like to use a sample of all the features for training to avoid overfitting.

- scale_pos_weight: The parameter that controls the balance of positive and negative weights, useful for unbalanced classes. This dataset is unbalanced as we have seen, so we should be careful to tune it. The typical value to consider: sum(negative instances) / sum(positive instances).

- gamma: The minimum loss reduction required to make a split. A node is split only when the resulting split reduces the loss by at least gamma. The larger gamma is, the more conservative the algorithm will be (fewer splits, hence less prone to overfitting). The values can vary depending on the loss function and should be tuned.

- reg_alpha: L1 regularization term on weights. Increasing this value will make model more conservative.

- reg_lambda: L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples.

模型的参数有很多，如何选择合适的参数，是个很头疼的问题，所以建模的玄学就在这里，一些超参数必须主动调节，可以固定筛选范围，一个一个试，看哪个表现更好，但是这么做成本巨大，下面这一堆代码，运行完，大概要16min。

In [6]:
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5, 10],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

In [7]:
# xgb = XGBClassifier(learning_rate=0.06, n_estimators=300, objective='binary:logistic',nthread=4)

In [8]:
# from datetime import datetime
# def timer(start_time=None):
#     if not start_time:
#         start_time = datetime.now()
#         return start_time
#     elif start_time:
#         thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
#         tmin, tsec = divmod(temp_sec, 60)
#         print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
        
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# from sklearn.model_selection import StratifiedKFold


# folds = 3
# param_comb = 5

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )

# # Here we go
# start_time = timer(None)
# random_search.fit(X, y)
# timer(start_time) 

In [9]:
# print('\n All results:')
# print(random_search.cv_results_)
# print('\n Best estimator:')
# print(random_search.best_estimator_)
# print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# print(random_search.best_score_ * 2 - 1)
# print('\n Best hyperparameters:')
# print(random_search.best_params_)
# results = pd.DataFrame(random_search.cv_results_)
# results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

### 优化迭代

In [10]:
# After tuning: a reasonably good combination of hyperparameters

from xgboost import XGBClassifier
MAX_ROUNDS = 400  # number of boosting rounds, passed to the model as n_estimators
OPTIMIZE_ROUNDS = False  # when True, XGB_gini fits with a validation eval_set and early stopping
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50  # early-stopping patience; only read when OPTIMIZE_ROUNDS is True

# Single estimator instance shared by all folds; XGB_gini calls model.fit once per fold.
model = XGBClassifier(    
                        n_estimators=MAX_ROUNDS,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=LEARNING_RATE, 
                        subsample=.8,
                        min_child_weight=6,
                        colsample_bytree=.8,
                        scale_pos_weight=1.6,
                        gamma=10,
                        reg_alpha=8,
                        reg_lambda=1.3,
                     )

In [11]:
def XGB_gini(df_train,tar_enc = True,pca = False):
    '''
    Cross-validate the module-level XGBoost ``model`` and score it with the
    normalized Gini coefficient.

    For every fold produced by the module-level ``kf`` splitter, ``model`` is
    refit on the training part, out-of-fold probabilities are stored for the
    validation part, and probabilities for ``final_test`` are accumulated so
    that they can be averaged over the folds.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Processed training data; must contain a ``target`` column.
    tar_enc : bool, default True
        Whether to target-encode the categorical columns (names containing
        ``'_cat'``). The encoding is fitted on the training part of each fold
        only, and the raw categorical columns are dropped afterwards. When
        False the features are used unchanged. Ignored when ``pca`` is true.
    pca : bool, default False
        Whether to replace the features by their first 20 principal
        components, fitted on the training part of each fold.

    Returns
    -------
    tuple
        ``(y_test_pred, gini)``: the fold-averaged predicted probability for
        each row of ``final_test`` and the normalized Gini coefficient of the
        out-of-fold predictions on ``df_train``.
    '''

    y = df_train.target
    X = df_train.drop('target',axis=1)

    # Float buffer for the out-of-fold probabilities. (`0*y` would inherit
    # the integer dtype of the labels and force an upcast on assignment.)
    y_valid_pred = pd.Series(np.zeros(len(y)), index=y.index)
    y_test_pred = 0
    n_folds = 0

    for i, (train_index, valid_index) in enumerate(kf.split(X)):
        n_folds += 1

        # Split into training part, validation part and a fresh test copy
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[valid_index]
        X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[valid_index,:].copy()
        X_test = final_test.copy()

        if pca:
            from sklearn.decomposition import PCA

            n_comp = 20
            print('\nPCA执行中...')
            # Keep the estimator in its own name: rebinding the `pca` flag
            # would disable this branch from the second fold onwards.
            pca_model = PCA(n_components=n_comp, svd_solver='full', random_state=1001)
            X_train = pd.DataFrame(pca_model.fit_transform(X_train))
            X_valid = pd.DataFrame(pca_model.transform(X_valid))
            X_test = pd.DataFrame(pca_model.transform(X_test))
        print( f"\n{i}折交叉验证： ")

        if not pca and tar_enc:
            from target_encoding import target_encode

            f_cat = [f for f in X.columns if '_cat' in f and 'tar_enc' not in  f]
            for f in f_cat:
                # Statistics come from this fold's training part only, so the
                # validation and test rows never see their own targets.
                X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                    trn_series=X_train[f],
                    val_series=X_valid[f],
                    tst_series=X_test[f],
                    target=y_train,
                    min_samples_leaf=100,
                    smoothing=10,
                    noise_level=0
                )

            # The raw categorical columns are replaced by their encodings.
            # Dropping only here keeps `f_cat` defined whenever it is used.
            X_train.drop(f_cat,axis=1,inplace=True)
            X_valid.drop(f_cat,axis=1,inplace=True)
            X_test.drop(f_cat,axis=1,inplace=True)

        # Fit XGBoost on the current fold
        if OPTIMIZE_ROUNDS:
            eval_set=[(X_valid,y_valid)]
            fit_model = model.fit( X_train, y_train,
                                   eval_set=eval_set,
                                   eval_metric=gini_xgb,
                                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                                   verbose=False
                                 )
            print( "  Best N trees = ", model.best_ntree_limit )
            print( "  Best gini = ", model.best_score )
        else:
            fit_model = model.fit( X_train, y_train )

        # Out-of-fold predictions for the validation part
        pred = fit_model.predict_proba(X_valid)[:,1]
        print( "  normalized gini coefficent = ", eval_gini(y_valid, pred) )
        y_valid_pred.iloc[valid_index] = pred

        # Accumulate the test-set predictions of this fold
        y_test_pred += fit_model.predict_proba(X_test)[:,1]

        # Release the per-fold copies before the next fold allocates new ones
        del X_test, X_train, X_valid, y_train

    # Average over the folds that actually ran
    y_test_pred /= n_folds

    final_gini = eval_gini(y, y_valid_pred)
    print( "\n整个训练集（合并）的normalized gini coefficent:" )
    print( "  final normalized gini coefficent = ", final_gini )

    return y_test_pred,final_gini

In [12]:
%%time
# Run the cross-validated XGBoost pipeline on final_train with target encoding
# enabled; keeps the averaged test predictions and the out-of-fold Gini score.
y_test_pred, gini_score = XGB_gini(df_train=final_train,tar_enc=True)


0折交叉验证： 
  normalized gini coefficent =  0.2538658383235931

1折交叉验证： 
  normalized gini coefficent =  0.30728575577060424

2折交叉验证： 
  normalized gini coefficent =  0.28508943739615844

3折交叉验证： 
  normalized gini coefficent =  0.2829810430876034

4折交叉验证： 
  normalized gini coefficent =  0.29113416245470913

5折交叉验证： 
  normalized gini coefficent =  0.2940775662258974

6折交叉验证： 
  normalized gini coefficent =  0.2818130027819927

7折交叉验证： 
  normalized gini coefficent =  0.27666764206685135

8折交叉验证： 
  normalized gini coefficent =  0.2723987713495525

9折交叉验证： 
  normalized gini coefficent =  0.3070892136897676

整个训练集（合并）的normalized gini coefficent:
  final normalized gini coefficent =  0.28507826718380913
Wall time: 18min 55s


In [13]:
# Build the submission table (test-set index as `id`, averaged probability as
# `target`) and write it to disk with six decimal places.
submission = pd.DataFrame({
    'id': final_test.index.values,
    'target': y_test_pred,
})
submission.to_csv('xgb_submit.csv', float_format='%.6f', index=False)