# 排序模型
通过召回的操作， 我们已经进行了问题规模的缩减， 对于每个用户， 选择出了N篇文章作为了候选集，并基于召回的候选集构建了与用户历史相关的特征，以及用户本身的属性特征，文章本省的属性特征，以及用户与文章之间的特征，下面就是使用机器学习模型来对构造好的特征进行学习，然后对测试集进行预测，得到测试集中的每个候选集用户点击的概率，返回点击概率最大的topk个文章，作为最终的结果。

排序阶段选择了三个比较有代表性的排序模型，它们分别是：

1. LGB的排序模型
2. LGB的分类模型
3. 深度学习的分类模型DIN

得到了最终的排序模型输出的结果之后，还选择了两种比较经典的模型集成的方法：

1. 输出结果加权融合
2. Staking（将模型的输出结果再使用一个简单模型进行预测）

In [75]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

## 读取排序特征

In [76]:
data_path = './data_raw/'
save_path = './tmp_results/'
offline = False

In [77]:
# 重新读取数据的时候，发现click_article_id是一个浮点数，所以将其转换成int类型
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None
    
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# 做特征的时候为了方便，给测试集也打上了一个无效的标签，这里直接删掉就行
del tst_user_item_feats_df['label']

## 返回排序后的结果

In [78]:
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 判断是不是每个用户都有5篇文章及以上
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    # 按照提交格式定义列名
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)

In [79]:
# 排序结果归一化
def norm_sim(sim_df, weight=0.0):
    # print(sim_df.head())
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # plus one
    return sim_df

## LGB排序模型

In [80]:
# 防止中间出错之后重新读取数据
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

In [81]:
# 定义特征列
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts','words_count']

In [82]:
# 排序模型分组
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

In [83]:
# 排序模型定义
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  

In [84]:
# 排序模型训练
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)

In [85]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)

In [86]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

In [87]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id','label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    # 定义模型
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  
    # 训练模型
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 对输出结果进行归一化
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)

[1]	valid_0's ndcg@1: 0.91205	valid_0's ndcg@2: 0.963487	valid_0's ndcg@3: 0.966062	valid_0's ndcg@4: 0.96646	valid_0's ndcg@5: 0.966537
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.91785	valid_0's ndcg@2: 0.966069	valid_0's ndcg@3: 0.968294	valid_0's ndcg@4: 0.968649	valid_0's ndcg@5: 0.968765
[3]	valid_0's ndcg@1: 0.92255	valid_0's ndcg@2: 0.968166	valid_0's ndcg@3: 0.970179	valid_0's ndcg@4: 0.970491	valid_0's ndcg@5: 0.970588
[4]	valid_0's ndcg@1: 0.924475	valid_0's ndcg@2: 0.968829	valid_0's ndcg@3: 0.970867	valid_0's ndcg@4: 0.97119	valid_0's ndcg@5: 0.971287
[5]	valid_0's ndcg@1: 0.925025	valid_0's ndcg@2: 0.969127	valid_0's ndcg@3: 0.971114	valid_0's ndcg@4: 0.971427	valid_0's ndcg@5: 0.971514
[6]	valid_0's ndcg@1: 0.9268	valid_0's ndcg@2: 0.969908	valid_0's ndcg@3: 0.971783	valid_0's ndcg@4: 0.972117	valid_0's ndcg@5: 0.972214
[7]	valid_0's ndcg@1: 0.926825	valid_0's ndcg@2: 0.969902	valid_0's ndcg@3: 0.971802	valid_0's ndcg@4: 0.972125	

[63]	valid_0's ndcg@1: 0.944025	valid_0's ndcg@2: 0.976739	valid_0's ndcg@3: 0.978376	valid_0's ndcg@4: 0.978645	valid_0's ndcg@5: 0.978694
[64]	valid_0's ndcg@1: 0.94375	valid_0's ndcg@2: 0.976621	valid_0's ndcg@3: 0.978271	valid_0's ndcg@4: 0.978541	valid_0's ndcg@5: 0.978589
[65]	valid_0's ndcg@1: 0.94395	valid_0's ndcg@2: 0.976695	valid_0's ndcg@3: 0.978345	valid_0's ndcg@4: 0.978614	valid_0's ndcg@5: 0.978663
[66]	valid_0's ndcg@1: 0.944025	valid_0's ndcg@2: 0.976739	valid_0's ndcg@3: 0.978376	valid_0's ndcg@4: 0.978645	valid_0's ndcg@5: 0.978694
[67]	valid_0's ndcg@1: 0.944275	valid_0's ndcg@2: 0.976815	valid_0's ndcg@3: 0.978465	valid_0's ndcg@4: 0.978734	valid_0's ndcg@5: 0.978783
[68]	valid_0's ndcg@1: 0.94445	valid_0's ndcg@2: 0.976848	valid_0's ndcg@3: 0.978536	valid_0's ndcg@4: 0.978794	valid_0's ndcg@5: 0.978843
[69]	valid_0's ndcg@1: 0.9444	valid_0's ndcg@2: 0.97683	valid_0's ndcg@3: 0.978517	valid_0's ndcg@4: 0.978776	valid_0's ndcg@5: 0.978824
[70]	valid_0's ndcg@1: 0.9

[35]	valid_0's ndcg@1: 0.939475	valid_0's ndcg@2: 0.974823	valid_0's ndcg@3: 0.97651	valid_0's ndcg@4: 0.976866	valid_0's ndcg@5: 0.976953
[36]	valid_0's ndcg@1: 0.93955	valid_0's ndcg@2: 0.974835	valid_0's ndcg@3: 0.976572	valid_0's ndcg@4: 0.976895	valid_0's ndcg@5: 0.976982
[37]	valid_0's ndcg@1: 0.940475	valid_0's ndcg@2: 0.97516	valid_0's ndcg@3: 0.97691	valid_0's ndcg@4: 0.977233	valid_0's ndcg@5: 0.97732
[38]	valid_0's ndcg@1: 0.94035	valid_0's ndcg@2: 0.975114	valid_0's ndcg@3: 0.976877	valid_0's ndcg@4: 0.977189	valid_0's ndcg@5: 0.977276
[39]	valid_0's ndcg@1: 0.94025	valid_0's ndcg@2: 0.975093	valid_0's ndcg@3: 0.976843	valid_0's ndcg@4: 0.977155	valid_0's ndcg@5: 0.977242
[40]	valid_0's ndcg@1: 0.940775	valid_0's ndcg@2: 0.975271	valid_0's ndcg@3: 0.977034	valid_0's ndcg@4: 0.977346	valid_0's ndcg@5: 0.977433
[41]	valid_0's ndcg@1: 0.941275	valid_0's ndcg@2: 0.975519	valid_0's ndcg@3: 0.977231	valid_0's ndcg@4: 0.977554	valid_0's ndcg@5: 0.977632
[42]	valid_0's ndcg@1: 0.94

[1]	valid_0's ndcg@1: 0.9122	valid_0's ndcg@2: 0.964157	valid_0's ndcg@3: 0.966195	valid_0's ndcg@4: 0.966658	valid_0's ndcg@5: 0.966725
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.9159	valid_0's ndcg@2: 0.965854	valid_0's ndcg@3: 0.967791	valid_0's ndcg@4: 0.968147	valid_0's ndcg@5: 0.968224
[3]	valid_0's ndcg@1: 0.9218	valid_0's ndcg@2: 0.968031	valid_0's ndcg@3: 0.970081	valid_0's ndcg@4: 0.970351	valid_0's ndcg@5: 0.970409
[4]	valid_0's ndcg@1: 0.9247	valid_0's ndcg@2: 0.969196	valid_0's ndcg@3: 0.971196	valid_0's ndcg@4: 0.971433	valid_0's ndcg@5: 0.971501
[5]	valid_0's ndcg@1: 0.9246	valid_0's ndcg@2: 0.969128	valid_0's ndcg@3: 0.971153	valid_0's ndcg@4: 0.971401	valid_0's ndcg@5: 0.971459
[6]	valid_0's ndcg@1: 0.9262	valid_0's ndcg@2: 0.969908	valid_0's ndcg@3: 0.97182	valid_0's ndcg@4: 0.972046	valid_0's ndcg@5: 0.972114
[7]	valid_0's ndcg@1: 0.9275	valid_0's ndcg@2: 0.970545	valid_0's ndcg@3: 0.972358	valid_0's ndcg@4: 0.972573	valid_0'

[70]	valid_0's ndcg@1: 0.945025	valid_0's ndcg@2: 0.977628	valid_0's ndcg@3: 0.979103	valid_0's ndcg@4: 0.9792	valid_0's ndcg@5: 0.979249
[71]	valid_0's ndcg@1: 0.945475	valid_0's ndcg@2: 0.977731	valid_0's ndcg@3: 0.979256	valid_0's ndcg@4: 0.979353	valid_0's ndcg@5: 0.979402
[72]	valid_0's ndcg@1: 0.945475	valid_0's ndcg@2: 0.977747	valid_0's ndcg@3: 0.979272	valid_0's ndcg@4: 0.979358	valid_0's ndcg@5: 0.979407
[73]	valid_0's ndcg@1: 0.9458	valid_0's ndcg@2: 0.977867	valid_0's ndcg@3: 0.979405	valid_0's ndcg@4: 0.97948	valid_0's ndcg@5: 0.979528
[74]	valid_0's ndcg@1: 0.94585	valid_0's ndcg@2: 0.977885	valid_0's ndcg@3: 0.979423	valid_0's ndcg@4: 0.979498	valid_0's ndcg@5: 0.979547
[75]	valid_0's ndcg@1: 0.946	valid_0's ndcg@2: 0.977957	valid_0's ndcg@3: 0.979482	valid_0's ndcg@4: 0.979557	valid_0's ndcg@5: 0.979605
[76]	valid_0's ndcg@1: 0.946275	valid_0's ndcg@2: 0.978058	valid_0's ndcg@3: 0.979583	valid_0's ndcg@4: 0.979658	valid_0's ndcg@5: 0.979707
[77]	valid_0's ndcg@1: 0.9462

[35]	valid_0's ndcg@1: 0.9386	valid_0's ndcg@2: 0.974642	valid_0's ndcg@3: 0.976454	valid_0's ndcg@4: 0.976648	valid_0's ndcg@5: 0.976687
[36]	valid_0's ndcg@1: 0.938625	valid_0's ndcg@2: 0.974698	valid_0's ndcg@3: 0.976473	valid_0's ndcg@4: 0.976667	valid_0's ndcg@5: 0.976696
[37]	valid_0's ndcg@1: 0.939425	valid_0's ndcg@2: 0.974962	valid_0's ndcg@3: 0.97675	valid_0's ndcg@4: 0.976954	valid_0's ndcg@5: 0.976983
[38]	valid_0's ndcg@1: 0.9394	valid_0's ndcg@2: 0.974969	valid_0's ndcg@3: 0.976744	valid_0's ndcg@4: 0.976948	valid_0's ndcg@5: 0.976977
[39]	valid_0's ndcg@1: 0.939125	valid_0's ndcg@2: 0.974914	valid_0's ndcg@3: 0.976652	valid_0's ndcg@4: 0.976857	valid_0's ndcg@5: 0.976886
[40]	valid_0's ndcg@1: 0.939675	valid_0's ndcg@2: 0.975133	valid_0's ndcg@3: 0.976858	valid_0's ndcg@4: 0.977063	valid_0's ndcg@5: 0.977102
[41]	valid_0's ndcg@1: 0.939475	valid_0's ndcg@2: 0.975059	valid_0's ndcg@3: 0.976784	valid_0's ndcg@4: 0.976989	valid_0's ndcg@5: 0.977028
[42]	valid_0's ndcg@1: 0.

[1]	valid_0's ndcg@1: 0.910825	valid_0's ndcg@2: 0.963949	valid_0's ndcg@3: 0.965824	valid_0's ndcg@4: 0.966244	valid_0's ndcg@5: 0.966302
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@1: 0.915775	valid_0's ndcg@2: 0.965918	valid_0's ndcg@3: 0.967831	valid_0's ndcg@4: 0.968132	valid_0's ndcg@5: 0.968171
[3]	valid_0's ndcg@1: 0.918325	valid_0's ndcg@2: 0.967206	valid_0's ndcg@3: 0.968819	valid_0's ndcg@4: 0.969142	valid_0's ndcg@5: 0.96918
[4]	valid_0's ndcg@1: 0.922075	valid_0's ndcg@2: 0.968527	valid_0's ndcg@3: 0.970177	valid_0's ndcg@4: 0.9705	valid_0's ndcg@5: 0.970539
[5]	valid_0's ndcg@1: 0.923675	valid_0's ndcg@2: 0.969007	valid_0's ndcg@3: 0.970757	valid_0's ndcg@4: 0.97108	valid_0's ndcg@5: 0.971109
[6]	valid_0's ndcg@1: 0.925	valid_0's ndcg@2: 0.969701	valid_0's ndcg@3: 0.971301	valid_0's ndcg@4: 0.971603	valid_0's ndcg@5: 0.971651
[7]	valid_0's ndcg@1: 0.92725	valid_0's ndcg@2: 0.97039	valid_0's ndcg@3: 0.972077	valid_0's ndcg@4: 0.972411	va

[65]	valid_0's ndcg@1: 0.94435	valid_0's ndcg@2: 0.977079	valid_0's ndcg@3: 0.978529	valid_0's ndcg@4: 0.978809	valid_0's ndcg@5: 0.978858
[66]	valid_0's ndcg@1: 0.9443	valid_0's ndcg@2: 0.977077	valid_0's ndcg@3: 0.978514	valid_0's ndcg@4: 0.978805	valid_0's ndcg@5: 0.978853
[67]	valid_0's ndcg@1: 0.944675	valid_0's ndcg@2: 0.977215	valid_0's ndcg@3: 0.978653	valid_0's ndcg@4: 0.978943	valid_0's ndcg@5: 0.978992
[68]	valid_0's ndcg@1: 0.9448	valid_0's ndcg@2: 0.977293	valid_0's ndcg@3: 0.978693	valid_0's ndcg@4: 0.979005	valid_0's ndcg@5: 0.979044
[69]	valid_0's ndcg@1: 0.94495	valid_0's ndcg@2: 0.977332	valid_0's ndcg@3: 0.978745	valid_0's ndcg@4: 0.979057	valid_0's ndcg@5: 0.979096
[70]	valid_0's ndcg@1: 0.945025	valid_0's ndcg@2: 0.97736	valid_0's ndcg@3: 0.978773	valid_0's ndcg@4: 0.979085	valid_0's ndcg@5: 0.979124
[71]	valid_0's ndcg@1: 0.9452	valid_0's ndcg@2: 0.977409	valid_0's ndcg@3: 0.978834	valid_0's ndcg@4: 0.979146	valid_0's ndcg@5: 0.979185
[72]	valid_0's ndcg@1: 0.9453

In [88]:
# 预测结果重新排序, 及生成提交结果
# 单模型生成提交结果
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

## LGB分类模型

In [89]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  

In [90]:
# 模型训练
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                    eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])

[LightGBM] [Info] Number of positive: 64190, number of negative: 226880
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046243
[LightGBM] [Debug] init for col-wise cost 0.000150 seconds, init for row-wise cost 0.010592 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4130
[LightGBM] [Info] Number of data points in the train set: 291070, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220531 -> initscore=-1.262574
[LightGBM] [Info] Start training from score -1.262574
[LightGBM] [Debug] Re-bagging, using 203746 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203356 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203636 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203579 dat

[LightGBM] [Debug] Re-bagging, using 203552 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203343 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203529 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203833 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203863 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203958 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 203654 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203833 data to train
[LightGBM] [Debug] Trained a tree with leaves =

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 203770 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 203906 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203862 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 16
[LightGBM] [Debug] Re-bagging, using 203737 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 203734 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 203766 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203365 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203485 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[LightGBM] [Debug] Re-bagging, using 204036 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 203448 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203792 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 204069 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 203389 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 204182 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203686 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 204038 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203689 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[LightGBM] [Debug] Re-bagging, using 203693 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 203819 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 203890 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203804 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 17
[LightGBM] [Debug] Re-bagging, using

[LightGBM] [Debug] Re-bagging, using 203266 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 14
[LightGBM] [Debug] Re-bagging, using 203591 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 16
[LightGBM] [Debug] Re-bagging, using 203839 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203506 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203668 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 14
[LightGBM] [Debug] Re-bagging, using 203771 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 14
[LightGBM] [Debug] Re-bagging, using 203991 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 203798 data to train
[LightGBM] [Debug] Trained a tree with leaves =

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203305 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 203870 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 204070 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[LightGBM] [Debug] Re-bagging, using 203756 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 203543 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[LightGBM] [Debug] Re-bagging, using 203422 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, using 204080 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[LightGBM] [Debug] Re-bagging, usin

In [91]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)

In [92]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

In [93]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 模型及参数的定义
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
    # 训练模型
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], 
                          eval_metric=['auc', ],early_stopping_rounds=50, )
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], 
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
    # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], 
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)

[LightGBM] [Info] Number of positive: 51564, number of negative: 181388
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046205
[LightGBM] [Debug] init for col-wise cost 0.000121 seconds, init for row-wise cost 0.008484 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4132
[LightGBM] [Info] Number of data points in the train set: 232952, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221350 -> initscore=-1.257815
[LightGBM] [Info] Start training from score -1.257815
[LightGBM] [Debug] Re-bagging, using 163211 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[1]	valid_0's auc: 0.763708	valid_0's binary_logloss: 0.521805
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Re-bagging, using 162839 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[2]	valid_0's auc: 0.777195	valid_0's binary_loglos

[46]	valid_0's auc: 0.810714	valid_0's binary_logloss: 0.473597
[LightGBM] [Debug] Re-bagging, using 162961 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[47]	valid_0's auc: 0.810831	valid_0's binary_logloss: 0.47269
[LightGBM] [Debug] Re-bagging, using 162636 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[48]	valid_0's auc: 0.810988	valid_0's binary_logloss: 0.471914
[LightGBM] [Debug] Re-bagging, using 163156 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[49]	valid_0's auc: 0.811313	valid_0's binary_logloss: 0.471482
[LightGBM] [Debug] Re-bagging, using 163027 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[50]	valid_0's auc: 0.811468	valid_0's binary_logloss: 0.471081
[LightGBM] [Debug] Re-bagging, using 162822 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[51]	valid_0's auc: 0.811767	valid_0's bi

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[96]	valid_0's auc: 0.818128	valid_0's binary_logloss: 0.443627
[LightGBM] [Debug] Re-bagging, using 163058 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[97]	valid_0's auc: 0.818555	valid_0's binary_logloss: 0.443217
[LightGBM] [Debug] Re-bagging, using 162727 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[98]	valid_0's auc: 0.818603	valid_0's binary_logloss: 0.44271
[LightGBM] [Debug] Re-bagging, using 162827 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[99]	valid_0's auc: 0.81869	valid_0's binary_logloss: 0.442437
[LightGBM] [Debug] Re-bagging, using 162752 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[100]	valid_0's auc: 0.818853	valid_0's binary_logloss: 0.44209
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.818853	valid_0's binary_loglo

[LightGBM] [Debug] Re-bagging, using 162460 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[48]	valid_0's auc: 0.812115	valid_0's binary_logloss: 0.476207
[LightGBM] [Debug] Re-bagging, using 162973 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[49]	valid_0's auc: 0.812476	valid_0's binary_logloss: 0.475743
[LightGBM] [Debug] Re-bagging, using 162839 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[50]	valid_0's auc: 0.812737	valid_0's binary_logloss: 0.475328
[LightGBM] [Debug] Re-bagging, using 162654 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[51]	valid_0's auc: 0.81301	valid_0's binary_logloss: 0.474477
[LightGBM] [Debug] Re-bagging, using 162531 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[52]	valid_0's auc: 0.813051	valid_0's binary_logloss: 0.473674
[LightGBM] [Debug] Re-bagging, using 16309

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[97]	valid_0's auc: 0.820949	valid_0's binary_logloss: 0.446827
[LightGBM] [Debug] Re-bagging, using 162595 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[98]	valid_0's auc: 0.821008	valid_0's binary_logloss: 0.446319
[LightGBM] [Debug] Re-bagging, using 162664 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[99]	valid_0's auc: 0.821086	valid_0's binary_logloss: 0.446032
[LightGBM] [Debug] Re-bagging, using 162594 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[100]	valid_0's auc: 0.821279	valid_0's binary_logloss: 0.445665
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.821279	valid_0's binary_logloss: 0.445665
[LightGBM] [Info] Number of positive: 51395, number of negative: 181604
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046232
[LightGBM] [Debug] init for

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[46]	valid_0's auc: 0.810674	valid_0's binary_logloss: 0.477112
[LightGBM] [Debug] Re-bagging, using 163017 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[47]	valid_0's auc: 0.810689	valid_0's binary_logloss: 0.476206
[LightGBM] [Debug] Re-bagging, using 162679 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[48]	valid_0's auc: 0.810641	valid_0's binary_logloss: 0.47544
[LightGBM] [Debug] Re-bagging, using 163187 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[49]	valid_0's auc: 0.810967	valid_0's binary_logloss: 0.474997
[LightGBM] [Debug] Re-bagging, using 163061 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[50]	valid_0's auc: 0.81107	valid_0's binary_logloss: 0.474599
[LightGBM] [Debug] Re-bagging, using 162859 data to train
[LightGBM] [Debug] Trained a tree with lea

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[93]	valid_0's auc: 0.818897	valid_0's binary_logloss: 0.4478
[LightGBM] [Debug] Re-bagging, using 163386 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[94]	valid_0's auc: 0.818953	valid_0's binary_logloss: 0.447296
[LightGBM] [Debug] Re-bagging, using 163089 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[95]	valid_0's auc: 0.819089	valid_0's binary_logloss: 0.446794
[LightGBM] [Debug] Re-bagging, using 163245 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[96]	valid_0's auc: 0.819465	valid_0's binary_logloss: 0.446417
[LightGBM] [Debug] Re-bagging, using 163120 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[97]	valid_0's auc: 0.819902	valid_0's binary_logloss: 0.445984
[LightGBM] [Debug] Re-bagging, using 162777 data to train
[LightGBM] [Debug] Trained a tree with leav

[LightGBM] [Debug] Re-bagging, using 162542 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[48]	valid_0's auc: 0.809541	valid_0's binary_logloss: 0.476946
[LightGBM] [Debug] Re-bagging, using 163031 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[49]	valid_0's auc: 0.809866	valid_0's binary_logloss: 0.476505
[LightGBM] [Debug] Re-bagging, using 162901 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[50]	valid_0's auc: 0.810083	valid_0's binary_logloss: 0.4761
[LightGBM] [Debug] Re-bagging, using 162714 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[51]	valid_0's auc: 0.810318	valid_0's binary_logloss: 0.475301
[LightGBM] [Debug] Re-bagging, using 162579 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 8
[52]	valid_0's auc: 0.810439	valid_0's binary_logloss: 0.4745
[LightGBM] [Debug] Re-bagging, using 163153 da

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[97]	valid_0's auc: 0.818385	valid_0's binary_logloss: 0.448006
[LightGBM] [Debug] Re-bagging, using 162649 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 14
[98]	valid_0's auc: 0.818415	valid_0's binary_logloss: 0.447509
[LightGBM] [Debug] Re-bagging, using 162723 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 16
[99]	valid_0's auc: 0.818594	valid_0's binary_logloss: 0.447216
[LightGBM] [Debug] Re-bagging, using 162649 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[100]	valid_0's auc: 0.818719	valid_0's binary_logloss: 0.446879
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.818719	valid_0's binary_logloss: 0.446879
[LightGBM] [Info] Number of positive: 51252, number of negative: 181568
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046238
[LightGBM] [Debug] init for

[47]	valid_0's auc: 0.810783	valid_0's binary_logloss: 0.478019
[LightGBM] [Debug] Re-bagging, using 162561 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[48]	valid_0's auc: 0.810804	valid_0's binary_logloss: 0.47724
[LightGBM] [Debug] Re-bagging, using 163060 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[49]	valid_0's auc: 0.811116	valid_0's binary_logloss: 0.476816
[LightGBM] [Debug] Re-bagging, using 162927 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[50]	valid_0's auc: 0.811289	valid_0's binary_logloss: 0.476411
[LightGBM] [Debug] Re-bagging, using 162720 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[51]	valid_0's auc: 0.811776	valid_0's binary_logloss: 0.475544
[LightGBM] [Debug] Re-bagging, using 162577 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[52]	valid_0's auc: 0.811767	valid_0's bi

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[97]	valid_0's auc: 0.820351	valid_0's binary_logloss: 0.447664
[LightGBM] [Debug] Re-bagging, using 162645 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[98]	valid_0's auc: 0.820375	valid_0's binary_logloss: 0.447184
[LightGBM] [Debug] Re-bagging, using 162736 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[99]	valid_0's auc: 0.820459	valid_0's binary_logloss: 0.446892
[LightGBM] [Debug] Re-bagging, using 162662 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[100]	valid_0's auc: 0.820618	valid_0's binary_logloss: 0.44654
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.820618	valid_0's binary_logloss: 0.44654


In [94]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

## DIN模型

### 用户的历史点击行为列表
这个是为后面的DIN模型服务的

In [95]:
if offline:
    all_data = pd.read_csv('./data_raw/train_click_log.csv')
else:
    trn_data = pd.read_csv('./data_raw/train_click_log.csv')
    tst_data = pd.read_csv('./data_raw/testA_click_log.csv')
    all_data = trn_data.append(tst_data)

In [96]:
from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()
all_data['click_article_id'] = lbe.fit_transform(all_data['click_article_id'].values)

hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']

In [97]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()

In [98]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

### DIN模型简介
我们下面尝试使用DIN模型， DIN的全称是Deep Interest Network， 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型， 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性，来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元，通过软搜索历史行为的相关部分来关注相关的用户兴趣，并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重，并支配着用户兴趣。该表示向量在不同广告上有所不同，大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合， 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下：

![image-20201116201646983](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png)


我们这里直接调包来使用这个模型， 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用：deepctr的函数原型如下：
> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,
>        dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice",
>       att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,
>        task='binary'):
> 
> * dnn_feature_columns: 特征列， 包含数据所有特征的列表
> * history_feature_list: 用户历史行为列， 反应用户历史行为的特征的列表
> * dnn_use_bn: 是否使用BatchNormalization
> * dnn_hidden_units: 全连接层网络的层数和每一层神经元的个数， 一个列表或者元组
> * dnn_activation_relu: 全连接网络的激活单元类型
> * att_hidden_size: 注意力层的全连接网络的层数和每一层神经元的个数
> * att_activation: 注意力层的激活单元类型
> * att_weight_normalization: 是否归一化注意力得分
> * l2_reg_dnn: 全连接网络的正则化系数
> * l2_reg_embedding: embedding向量的正则化稀疏
> * dnn_dropout: 全连接网络的神经元的失活概率
> * task: 任务， 可以是分类， 也可是是回归

在具体使用的时候， 我们必须要传入特征列和历史行为列， 但是再传入之前， 我们需要进行一下特征列的预处理。具体如下：

1. 首先，我们要处理数据集， 得到数据， 由于我们是基于用户过去的行为去预测用户是否点击当前文章， 所以我们需要把数据的特征列划分成数值型特征， 离散型特征和历史行为特征列三部分， 对于每一部分， DIN模型的处理会有不同
    1. 对于离散型特征， 在我们的数据集中就是那些类别型的特征， 比如user_id这种， 这种类别型特征， 我们首先要经过embedding处理得到每个特征的低维稠密型表示， 既然要经过embedding， 那么我们就需要为每一列的类别特征的取值建立一个字典，并指明embedding维度， 所以在使用deepctr的DIN模型准备数据的时候， 我们需要通过SparseFeat函数指明这些类别型特征, 这个函数的传入参数就是列名， 列的唯一取值(建立字典用)和embedding维度。
    2. 对于用户历史行为特征列， 比如文章id， 文章的类别等这种， 同样的我们需要先经过embedding处理， 只不过和上面不一样的地方是，对于这种特征， 我们在得到每个特征的embedding表示之后， 还需要通过一个Attention_layer计算用户的历史行为和当前候选文章的相关性以此得到当前用户的embedding向量， 这个向量就可以基于当前的候选文章与用户过去点击过得历史文章的相似性的程度来反应用户的兴趣， 并且随着用户的不同的历史点击来变化，去动态的模拟用户兴趣的变化过程。这类特征对于每个用户都是一个历史行为序列， 对于每个用户， 历史行为序列长度会不一样， 可能有的用户点击的历史文章多，有的点击的历史文章少， 所以我们还需要把这个长度统一起来， 在为DIN模型准备数据的时候， 我们首先要通过SparseFeat函数指明这些类别型特征， 然后还需要通过VarLenSparseFeat函数再进行序列填充， 使得每个用户的历史序列一样长， 所以这个函数参数中会有个maxlen，来指明序列的最大长度是多少。
    3. 对于连续型特征列， 我们只需要用DenseFeat函数来指明列名和维度即可。
2. 处理完特征列之后， 我们把相应的数据与列进行对应，就得到了最后的数据。

下面根据具体的代码感受一下， 逻辑是这样， 首先我们需要写一个数据准备函数， 在这里面就是根据上面的具体步骤准备数据， 得到数据和特征列， 然后就是建立DIN模型并训练， 最后基于模型进行测试。

In [99]:
# 导入deepctr
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
import tensorflow as tf

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [100]:
# 数据准备函数
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):
    """
    数据准备函数:
    df: 数据集
    dense_fea: 数值型特征列
    sparse_fea: 离散型特征列
    behavior_fea: 用户的候选行为特征列
    his_behavior_fea: 用户的历史行为特征列
    embedding_dim: embedding的维度， 这里为了简单， 统一把离散型特征列采用一样的隐向量维度
    max_len: 用户序列的最大长度
    """
    
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) 
                              for feat in sparse_fea]
    
    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]
    
    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
                                    embedding_dim=emb_dim, embedding_name='click_article_id'), maxlen=max_len) 
                           for feat in hist_behavior_fea]
    
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    
    # 建立x, x是一个字典的形式
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # 这是历史行为序列
            his_list = [l for l in df[name]]
            x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')      # 二维数组
        else:
            lbe = LabelEncoder()
            x[name] = lbe.fit_transform(df[name].values)
            
#             x[name] = df[name].values
    
    return x, dnn_feature_columns

In [101]:
# 把特征分开
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

behavior_fea = ['click_article_id']

hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'words_hbo','words_count']

In [102]:
# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理
mm = MinMaxScaler()

# 下面是做一些特殊处理，当在其他的地方出现无效值的时候，不处理无法进行归一化，刚开始可以先把他注释掉，在运行了下面的代码
# 之后如果发现报错，应该先去想办法处理如何不出现inf之类的值
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])
    
    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])
    
    tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])

In [103]:
# 准备训练数据
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values

if offline:
    # 准备验证数据
    x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = val_user_item_feats_df_din_model['label'].values
    
dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)

In [104]:
# 建立模型
model = DIN(dnn_feature_columns, behavior_fea)

# 查看模型结构
model.summary()

# 模型编译
model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])

The following Variables were used a Lambda layer's call (lambda_2), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer_2/local_activation_unit_2/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer_2/local_activation_unit_2/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_article_id (InputLayer)   [(None, 1)]       

In [105]:
for k, v in x_trn.items():
    print(k)
    print(v.shape)
    print()

# len(y_trn)

# type(x_trn)

user_id
(291070,)

click_article_id
(291070,)

category_id
(291070,)

click_environment
(291070,)

click_deviceGroup
(291070,)

click_os
(291070,)

click_country
(291070,)

click_region
(291070,)

click_referrer_type
(291070,)

is_cat_hab
(291070,)

sim0
(291070,)

time_diff0
(291070,)

word_diff0
(291070,)

sim_max
(291070,)

sim_min
(291070,)

sim_sum
(291070,)

sim_mean
(291070,)

score
(291070,)

rank
(291070,)

click_size
(291070,)

time_diff_mean
(291070,)

active_level
(291070,)

user_time_hob1
(291070,)

user_time_hob2
(291070,)

words_hbo
(291070,)

words_count
(291070,)

hist_click_article_id
(291070, 50)



In [106]:
x_trn['hist_click_article_id']

array([[ 2342, 16104,     0, ...,     0,     0,     0],
       [  241, 30021, 16049, ...,     0,     0,     0],
       [  241, 30021, 16049, ...,     0,     0,     0],
       ...,
       [ 3299,  3177, 15226, ...,     0,     0,     0],
       [ 6869,  6972,     0, ...,     0,     0,     0],
       [27293, 29946,     0, ...,     0,     0,     0]], dtype=int32)

In [107]:
lbe = LabelEncoder()
r = lbe.fit_transform(x_tst['click_article_id'])

len(pd.Series(r).value_counts())

16426

In [108]:
print(pd.Series(y_trn).value_counts())

0.0    226880
1.0     64190
dtype: int64


In [109]:
print(len(y_trn))

291070


In [110]:
# 模型训练
if offline:
    history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)
else:
    # 也可以使用上面的语句用自己采样出来的验证集
    # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)

Epoch 1/2


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[165,0] = 33113 is not in [0, 16427)
	 [[node functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup_2 (defined at <ipython-input-110-a1ddaec98eb8>:7) ]]
  (1) Invalid argument:  indices[165,0] = 33113 is not in [0, 16427)
	 [[node functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup_2 (defined at <ipython-input-110-a1ddaec98eb8>:7) ]]
	 [[gradient_tape/functional_5/sparse_emb_user_id/embedding_lookup/VariableShape/_10]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_15714]

Errors may have originated from an input operation.
Input Source operations connected to node functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup_2:
 functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup/13949 (defined at /home/wangxs/anaconda3/lib/python3.6/contextlib.py:81)

Input Source operations connected to node functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup_2:
 functional_5/sparse_seq_emb_hist_click_article_id/embedding_lookup/13949 (defined at /home/wangxs/anaconda3/lib/python3.6/contextlib.py:81)

Function call stack:
train_function -> train_function


In [None]:
# 模型预测
tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)

In [None]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='din')

In [None]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_din_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 准备训练数据
    x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, 
                                                       sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_trn = train_idx['label'].values

    # 准备验证数据
    x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = valid_idx['label'].values
    
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val) , batch_size=256)
    
    # 预测验证集结果
    valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256)   
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0]   
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均,将预测的score和对应的rank特征保存，可以用于后面的staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)

# 模型融合

## 加权融合

In [None]:
# 读取多个模型的排序结果文件
lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')
lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')
din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')

# 这里也可以换成交叉验证输出的测试结果进行加权融合

In [None]:
rank_model = {'lgb_ranker': lgb_ranker, 
              'lgb_cls': lgb_cls, 
              'din_ranker': din_ranker}

In [None]:
def get_ensumble_predict_topk(rank_model, topk=5):
    final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])
    rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))
    
    final_recall = final_recall.append(rank_model['lgb_ranker'])
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()
    
    submit(final_recall, topk=topk, model_name='ensemble_fuse')

In [None]:
get_ensumble_predict_topk(rank_model)

## Staking

In [None]:
# 读取多个模型的交叉验证生成的结果文件
# 训练集
trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')
trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')
trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')

# 测试集
tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')

In [None]:
# 将多个模型输出的特征进行拼接

finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]

for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_trn_ranker_feats[col_name] = trn_model[feat]

for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_tst_ranker_feats[col_name] = tst_model[feat]

In [None]:
# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测
# 这里需要注意的是，在做交叉验证的时候可以构造多一些与输出预测值相关的特征，来丰富这里简单模型的特征
from sklearn.linear_model import LogisticRegression

feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']

trn_x = finall_trn_ranker_feats[feat_cols]
trn_y = finall_trn_ranker_feats['label']

tst_x = finall_tst_ranker_feats[feat_cols]

# 定义模型
lr = LogisticRegression()

# 模型训练
lr.fit(trn_x, trn_y)

# 模型预测
finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]

In [None]:
# 预测结果重新排序, 及生成提交结果
rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='ensumble_staking')

# 总结
本章主要学习了三个排序模型，包括LGB的Rank， LGB的Classifier还有深度学习的DIN模型， 当然，对于这三个模型的原理部分，我们并没有给出详细的介绍， 请大家课下自己探索原理，也欢迎大家把自己的探索与所学分享出来，我们一块学习和进步。最后，我们进行了简单的模型融合策略，包括简单的加权和Stacking。

关于Datawhale： Datawhale是一个专注于数据科学与AI领域的开源组织，汇集了众多领域院校和知名企业的优秀学习者，聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner，和学习者一起成长”为愿景，鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案，赋能人才培养，助力人才成长，建立起人与人，人与知识，人与企业和人与未来的联结。 本次数据挖掘路径学习，专题知识将在天池分享，详情可关注Datawhale：

![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)