# 简介

本文我们介绍一种能够较为稳定提升回归精度的方法,方法的思路非常简单,主要就是捕捉边缘值的预测,我们知道平方损失（MSE）的函数对于较大值的惩罚较大,而优化MAE的指标则比MSE更加注重较小值的优化,所以本文我们就以《消费者人群画像—信用智能评分》比赛为例，给出该方法的线下&线上的实践，来一起见证一下该想法是否正确。


# 工具包导入&数据读取
## 工具包导入

In [1]:
## 数据工具包
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import tqdm,tqdm_notebook 

## 字符串处理工具包
import string
import re
import gensim
from collections import Counter
import pickle
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence 

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
from functools import partial

import os 
import gc
import joblib
from scipy import stats 
from scipy.sparse import vstack  
import time
import datetime
import multiprocessing as mp
import seaborn as sns 
tqdm.pandas() 
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 数据读取

In [2]:
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')

In [3]:
train.head()

Unnamed: 0,用户编码,用户实名制是否通过核实,用户年龄,是否大学生客户,是否黑名单客户,是否4G不健康客户,用户网龄（月）,用户最近一次缴费距今时长（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,...,当月是否景点游览,当月是否体育场馆消费,当月网购类应用使用次数,当月物流快递类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月飞机类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数,信用分
0,a4651f98c82948b186bdcdc8108381b4,1,44,0,0,0,186,1,99.8,163.86,...,1,1,713,0,2740,7145,0,0,30,664
1,aeb10247db4e4d67b2550bbc42ff9827,1,18,0,0,1,5,1,29.94,153.28,...,0,0,414,0,2731,44862,0,0,0,530
2,5af23a1e0e77410abb25e9a7eee510aa,1,47,0,0,0,145,1,49.9,109.64,...,0,0,3391,0,0,4804,0,0,1,643
3,43c64379d3c24a15b8478851b22049e4,1,55,0,0,0,234,1,99.8,92.97,...,1,1,500,0,1931,3141,0,0,5,649
4,f1687f3b8a6f4910bd0b13eb634056e2,1,40,0,0,0,76,1,49.9,95.47,...,1,0,522,0,64,59,0,0,0,648


# 简单特征工程

下面的特征很简单，我也没对比做不做特征好不好,就是随意写了一些简单的一眼能看出来的特征,具体有没有用，大家自己验证......


In [4]:
def _simple_features(df_):
    df = df_.copy() 
    df['次数'] = df['当月网购类应用使用次数'] +  df['当月物流快递类应用使用次数'] +  df['当月金融理财类应用使用总次数'] + df['当月视频播放类应用使用次数']\
                 + df['当月飞机类应用使用次数'] + df['当月火车类应用使用次数'] + df['当月旅游资讯类应用使用次数']  + 1
        
    for col in ['当月金融理财类应用使用总次数','当月旅游资讯类应用使用次数']: # 这两个比较积极向上一点
        df[col + '百分比'] = df[col].values / df['次数'].values 
    
    
    df['当月通话人均话费'] = df['用户账单当月总费用（元）'].values / (df['当月通话交往圈人数'].values + 1)
    df['上个月费用'] = df['用户当月账户余额（元）'].values + df['用户账单当月总费用（元）'].values
     
    df['用户上网年龄'] = df['用户年龄'] - df['用户网龄（月）']
    df['用户上网年龄百分比'] = df['用户网龄（月）'] / (df['用户年龄'] + 1)
     
    df['近似总消费'] = df['用户近6个月平均消费值（元）'] / 6 * df['用户网龄（月）']
    return df
    

In [6]:
train_fea = _simple_features(train)
test_fea  = _simple_features(test)

In [7]:

fea_cols = [col for col in train_fea.columns if train_fea[col].dtypes!='object' and train_fea[col].dtypes != '<M8[ns]' and col!='用户编码' and\
            col!='信用分']   

len(fea_cols)

36

# 线下训练&验证
## 线下验证函数

- 我们认为MSE对于两端极值的预测较为准确,而MAE则对于中间的预测更为准确,所以我们对函数预测的极值附近进行简单的加权修正。

In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import mean_squared_error
def _get_values_lgbregresser_models(df_fea, df_label,  feature_names):
    kf = KFold(n_splits=5,shuffle=False)#,random_state=1)
     
    models  = [] 
    models_1 = []
    models_2 = []
    
    importances = pd.DataFrame() 
    
    lgb_params = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
#          'objective':'mae',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         'n_estimators': 10000,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 50,
         "verbosity": -1}


    lgb_params1 = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'mae',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         'n_estimators': 10000,
         "bagging_seed": 11,
         "lambda_l1": 0.1,
         "nthread": 50,
         "verbosity": -1}
    
    min_val = np.min(df_label)
    print(min_val)
    for fold_, (trn_, val_) in enumerate(kf.split(df_fea)): 
        trn_x, trn_y= df_fea[trn_,:], df_label[trn_]#, df_label1[trn_] 
        val_x, val_y = df_fea[val_,:], df_label[val_]#, df_label1[val_] 
        tmp = pd.DataFrame()
         
        
        model = lgb.LGBMRegressor(**lgb_params1)
        model.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     
        tmp['target'] = val_y
        tmp['pred1'] = model.predict(val_x)
        models.append(model)
        
        model1 = lgb.LGBMRegressor(**lgb_params)
        model1.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     
        tmp['pred2'] = model1.predict(val_x)
        models_1.append(model1)
  
        tmp = tmp.sort_values('pred1')
        tmp['ranks'] = list(range(tmp.shape[0]))
        tmp['preds'] = tmp['pred1'].values
        tmp.loc[tmp.ranks<2000,'preds']  = tmp.loc[tmp.ranks< 2000,'pred2'].values *0.4 + tmp.loc[tmp.ranks< 2000,'pred1'].values * 0.6
        tmp.loc[tmp.ranks>8000,'preds']  = tmp.loc[tmp.ranks> 8000,'pred2'].values *0.4 + tmp.loc[tmp.ranks> 8000,'pred1'].values * 0.6
         
        print('*' * 100)
        print('MAE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred1'] ))))
        print('MSE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred2'] ))))
        print('Merge Model12', 1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['preds'] )))) 
        
        imp_df = pd.DataFrame()
        imp_df['feature'] = feature_names
        imp_df['gain'] = model.feature_importances_
        imp_df['fold'] = fold_ + 1
        
        importances = pd.concat([importances, imp_df], axis=0)
        
        gc.collect() 
    return models,models_1,importances 

## 模型训练

In [11]:
models_mae, models_mse, importances   = _get_values_lgbregresser_models(train_fea[fea_cols].values, train_fea['信用分'].values, feature_names=fea_cols)

422
Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 27.6401	valid_1's l1: 27.8644
[100]	valid_0's l1: 23.8974	valid_1's l1: 24.1098
[150]	valid_0's l1: 21.2366	valid_1's l1: 21.481
[200]	valid_0's l1: 19.3635	valid_1's l1: 19.6343
[250]	valid_0's l1: 18.0437	valid_1's l1: 18.3464
[300]	valid_0's l1: 17.1246	valid_1's l1: 17.4756
[350]	valid_0's l1: 16.478	valid_1's l1: 16.8828
[400]	valid_0's l1: 16.0189	valid_1's l1: 16.4667
[450]	valid_0's l1: 15.6864	valid_1's l1: 16.1697
[500]	valid_0's l1: 15.4399	valid_1's l1: 15.9506
[550]	valid_0's l1: 15.2488	valid_1's l1: 15.783
[600]	valid_0's l1: 15.0962	valid_1's l1: 15.653
[650]	valid_0's l1: 14.9725	valid_1's l1: 15.5524
[700]	valid_0's l1: 14.8674	valid_1's l1: 15.4702
[750]	valid_0's l1: 14.7777	valid_1's l1: 15.4038
[800]	valid_0's l1: 14.6983	valid_1's l1: 15.3473
[850]	valid_0's l1: 14.6263	valid_1's l1: 15.2988
[900]	valid_0's l1: 14.5648	valid_1's l1: 15.2578
[950]	valid_0's l1: 14.5098	valid_1's 

Early stopping, best iteration is:
[7882]	valid_0's l1: 12.3762	valid_1's l1: 14.9055
Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 28.1121	valid_1's l1: 28.3035
[100]	valid_0's l1: 24.1254	valid_1's l1: 24.3496
[150]	valid_0's l1: 21.2909	valid_1's l1: 21.5635
[200]	valid_0's l1: 19.3152	valid_1's l1: 19.6053
[250]	valid_0's l1: 17.9673	valid_1's l1: 18.276
[300]	valid_0's l1: 17.0483	valid_1's l1: 17.3863
[350]	valid_0's l1: 16.4168	valid_1's l1: 16.7916
[400]	valid_0's l1: 15.9745	valid_1's l1: 16.3805
[450]	valid_0's l1: 15.6597	valid_1's l1: 16.0976
[500]	valid_0's l1: 15.4273	valid_1's l1: 15.8955
[550]	valid_0's l1: 15.2502	valid_1's l1: 15.7401
[600]	valid_0's l1: 15.1103	valid_1's l1: 15.624
[650]	valid_0's l1: 14.9953	valid_1's l1: 15.532
[700]	valid_0's l1: 14.9014	valid_1's l1: 15.4591
[750]	valid_0's l1: 14.8181	valid_1's l1: 15.3942
[800]	valid_0's l1: 14.7436	valid_1's l1: 15.3366
[850]	valid_0's l1: 14.6789	valid_1's l1: 15.2896
[900]

[2900]	valid_0's l1: 13.5369	valid_1's l1: 14.7742
[2950]	valid_0's l1: 13.5201	valid_1's l1: 14.7727
[3000]	valid_0's l1: 13.503	valid_1's l1: 14.7714
[3050]	valid_0's l1: 13.4873	valid_1's l1: 14.7711
[3100]	valid_0's l1: 13.4708	valid_1's l1: 14.7696
[3150]	valid_0's l1: 13.454	valid_1's l1: 14.7692
[3200]	valid_0's l1: 13.4379	valid_1's l1: 14.7685
[3250]	valid_0's l1: 13.422	valid_1's l1: 14.7669
[3300]	valid_0's l1: 13.4062	valid_1's l1: 14.7661
[3350]	valid_0's l1: 13.3907	valid_1's l1: 14.7647
[3400]	valid_0's l1: 13.3738	valid_1's l1: 14.7631
[3450]	valid_0's l1: 13.3583	valid_1's l1: 14.7619
[3500]	valid_0's l1: 13.3418	valid_1's l1: 14.7607
[3550]	valid_0's l1: 13.3263	valid_1's l1: 14.7588
[3600]	valid_0's l1: 13.3107	valid_1's l1: 14.7579
[3650]	valid_0's l1: 13.2956	valid_1's l1: 14.7563
[3700]	valid_0's l1: 13.2802	valid_1's l1: 14.7549
[3750]	valid_0's l1: 13.2645	valid_1's l1: 14.7539
[3800]	valid_0's l1: 13.25	valid_1's l1: 14.7533
[3850]	valid_0's l1: 13.2355	valid_1

[50]	valid_0's l1: 27.6975	valid_1's l1: 27.7059
[100]	valid_0's l1: 23.9586	valid_1's l1: 23.9808
[150]	valid_0's l1: 21.2922	valid_1's l1: 21.3326
[200]	valid_0's l1: 19.4109	valid_1's l1: 19.4846
[250]	valid_0's l1: 18.0956	valid_1's l1: 18.2037
[300]	valid_0's l1: 17.1809	valid_1's l1: 17.331
[350]	valid_0's l1: 16.5384	valid_1's l1: 16.7181
[400]	valid_0's l1: 16.0768	valid_1's l1: 16.2856
[450]	valid_0's l1: 15.7432	valid_1's l1: 15.9724
[500]	valid_0's l1: 15.4977	valid_1's l1: 15.7419
[550]	valid_0's l1: 15.3099	valid_1's l1: 15.5686
[600]	valid_0's l1: 15.159	valid_1's l1: 15.4287
[650]	valid_0's l1: 15.0378	valid_1's l1: 15.3206
[700]	valid_0's l1: 14.9355	valid_1's l1: 15.2317
[750]	valid_0's l1: 14.845	valid_1's l1: 15.1584
[800]	valid_0's l1: 14.7658	valid_1's l1: 15.0963
[850]	valid_0's l1: 14.6951	valid_1's l1: 15.0427
[900]	valid_0's l1: 14.634	valid_1's l1: 14.9992
[950]	valid_0's l1: 14.5805	valid_1's l1: 14.9624
[1000]	valid_0's l1: 14.531	valid_1's l1: 14.9303
[1050

[8150]	valid_0's l1: 12.3976	valid_1's l1: 14.5737
[8200]	valid_0's l1: 12.389	valid_1's l1: 14.5736
[8250]	valid_0's l1: 12.3799	valid_1's l1: 14.573
[8300]	valid_0's l1: 12.3705	valid_1's l1: 14.5723
[8350]	valid_0's l1: 12.3619	valid_1's l1: 14.572
[8400]	valid_0's l1: 12.3534	valid_1's l1: 14.5718
[8450]	valid_0's l1: 12.3452	valid_1's l1: 14.572
[8500]	valid_0's l1: 12.3369	valid_1's l1: 14.5717
[8550]	valid_0's l1: 12.3295	valid_1's l1: 14.5717
[8600]	valid_0's l1: 12.3217	valid_1's l1: 14.5715
[8650]	valid_0's l1: 12.3137	valid_1's l1: 14.5715
[8700]	valid_0's l1: 12.3059	valid_1's l1: 14.5715
[8750]	valid_0's l1: 12.2982	valid_1's l1: 14.5715
[8800]	valid_0's l1: 12.2909	valid_1's l1: 14.5713
[8850]	valid_0's l1: 12.2833	valid_1's l1: 14.5713
[8900]	valid_0's l1: 12.2781	valid_1's l1: 14.5715
[8950]	valid_0's l1: 12.2725	valid_1's l1: 14.5717
[9000]	valid_0's l1: 12.2659	valid_1's l1: 14.5718
[9050]	valid_0's l1: 12.2597	valid_1's l1: 14.572
Early stopping, best iteration is:
[

[50]	valid_0's l1: 27.7947	valid_1's l1: 27.2948
[100]	valid_0's l1: 24.0329	valid_1's l1: 23.6702
[150]	valid_0's l1: 21.3545	valid_1's l1: 21.0899
[200]	valid_0's l1: 19.4604	valid_1's l1: 19.2782
[250]	valid_0's l1: 18.1247	valid_1's l1: 18.0238
[300]	valid_0's l1: 17.1965	valid_1's l1: 17.1578
[350]	valid_0's l1: 16.541	valid_1's l1: 16.5618
[400]	valid_0's l1: 16.0744	valid_1's l1: 16.1419
[450]	valid_0's l1: 15.7358	valid_1's l1: 15.8459
[500]	valid_0's l1: 15.4887	valid_1's l1: 15.6298
[550]	valid_0's l1: 15.2981	valid_1's l1: 15.4734
[600]	valid_0's l1: 15.1451	valid_1's l1: 15.3541
[650]	valid_0's l1: 15.021	valid_1's l1: 15.2612
[700]	valid_0's l1: 14.9177	valid_1's l1: 15.1886
[750]	valid_0's l1: 14.8263	valid_1's l1: 15.1278
[800]	valid_0's l1: 14.7476	valid_1's l1: 15.0792
[850]	valid_0's l1: 14.6788	valid_1's l1: 15.0401
[900]	valid_0's l1: 14.6158	valid_1's l1: 15.0067
[950]	valid_0's l1: 14.5587	valid_1's l1: 14.9763
[1000]	valid_0's l1: 14.5067	valid_1's l1: 14.951
[10

Training until validation scores don't improve for 250 rounds.
[50]	valid_0's l1: 28.2731	valid_1's l1: 27.8112
[100]	valid_0's l1: 24.2584	valid_1's l1: 23.9191
[150]	valid_0's l1: 21.3975	valid_1's l1: 21.1752
[200]	valid_0's l1: 19.3991	valid_1's l1: 19.2712
[250]	valid_0's l1: 18.0264	valid_1's l1: 17.973
[300]	valid_0's l1: 17.0909	valid_1's l1: 17.1071
[350]	valid_0's l1: 16.4526	valid_1's l1: 16.5151
[400]	valid_0's l1: 16.0062	valid_1's l1: 16.104
[450]	valid_0's l1: 15.6889	valid_1's l1: 15.8203
[500]	valid_0's l1: 15.4541	valid_1's l1: 15.6212
[550]	valid_0's l1: 15.2765	valid_1's l1: 15.4802
[600]	valid_0's l1: 15.1318	valid_1's l1: 15.3719
[650]	valid_0's l1: 15.0164	valid_1's l1: 15.2915
[700]	valid_0's l1: 14.9179	valid_1's l1: 15.2238
[750]	valid_0's l1: 14.8322	valid_1's l1: 15.1673
[800]	valid_0's l1: 14.759	valid_1's l1: 15.1213
[850]	valid_0's l1: 14.6942	valid_1's l1: 15.0839
[900]	valid_0's l1: 14.6367	valid_1's l1: 15.0547
[950]	valid_0's l1: 14.5839	valid_1's l1:

[3600]	valid_0's l1: 13.3081	valid_1's l1: 14.8713
[3650]	valid_0's l1: 13.2928	valid_1's l1: 14.8702
[3700]	valid_0's l1: 13.2764	valid_1's l1: 14.8686
[3750]	valid_0's l1: 13.2611	valid_1's l1: 14.8675
[3800]	valid_0's l1: 13.2459	valid_1's l1: 14.8664
[3850]	valid_0's l1: 13.2323	valid_1's l1: 14.8663
[3900]	valid_0's l1: 13.2177	valid_1's l1: 14.8652
[3950]	valid_0's l1: 13.2031	valid_1's l1: 14.864
[4000]	valid_0's l1: 13.1898	valid_1's l1: 14.8636
[4050]	valid_0's l1: 13.176	valid_1's l1: 14.8621
[4100]	valid_0's l1: 13.161	valid_1's l1: 14.8613
[4150]	valid_0's l1: 13.1467	valid_1's l1: 14.8602
[4200]	valid_0's l1: 13.1336	valid_1's l1: 14.8596
[4250]	valid_0's l1: 13.121	valid_1's l1: 14.8586
[4300]	valid_0's l1: 13.1086	valid_1's l1: 14.8581
[4350]	valid_0's l1: 13.0945	valid_1's l1: 14.8569
[4400]	valid_0's l1: 13.081	valid_1's l1: 14.8556
[4450]	valid_0's l1: 13.0689	valid_1's l1: 14.8553
[4500]	valid_0's l1: 13.0565	valid_1's l1: 14.8553
[4550]	valid_0's l1: 13.045	valid_1'

[4100]	valid_0's l1: 13.1836	valid_1's l1: 14.7917
[4150]	valid_0's l1: 13.1677	valid_1's l1: 14.7913
[4200]	valid_0's l1: 13.1518	valid_1's l1: 14.7915
[4250]	valid_0's l1: 13.1356	valid_1's l1: 14.7907
[4300]	valid_0's l1: 13.1195	valid_1's l1: 14.7898
[4350]	valid_0's l1: 13.1041	valid_1's l1: 14.7893
[4400]	valid_0's l1: 13.0882	valid_1's l1: 14.7889
[4450]	valid_0's l1: 13.0727	valid_1's l1: 14.7886
[4500]	valid_0's l1: 13.058	valid_1's l1: 14.7886
[4550]	valid_0's l1: 13.0423	valid_1's l1: 14.7882
[4600]	valid_0's l1: 13.0271	valid_1's l1: 14.7879
[4650]	valid_0's l1: 13.0118	valid_1's l1: 14.7879
[4700]	valid_0's l1: 12.9967	valid_1's l1: 14.7874
[4750]	valid_0's l1: 12.9819	valid_1's l1: 14.7866
[4800]	valid_0's l1: 12.9668	valid_1's l1: 14.7863
[4850]	valid_0's l1: 12.9519	valid_1's l1: 14.7856
[4900]	valid_0's l1: 12.937	valid_1's l1: 14.7853
[4950]	valid_0's l1: 12.9221	valid_1's l1: 14.786
[5000]	valid_0's l1: 12.9067	valid_1's l1: 14.7854
[5050]	valid_0's l1: 12.8918	valid

## 模型验证结果
### 直接使用MAE指标的线下5-fold验证结果(0.06349584)

In [13]:
np.mean([0.06287124227830537,0.06356345258514168,0.06422149465677383, 0.06367346108002223,0.06314959550420497])

0.06349584922088962

### 使用MAE+MSE结合的线下5-fold验证结果(0.0636108)

In [14]:
np.mean([0.063039062547442,0.06359245145019841,0.06436835293198238, 0.0637306555814696, 0.06332385690501188])

0.06361087588322085

# 模型测试&提交
## MAE提交（0.06354）

In [16]:
pred_mae = 0
for i,model in enumerate(models_mae): 
    pred_mae += model.predict(test_fea[fea_cols]) * 0.2
test_fea['pred_mae'] = pred_mae

In [18]:
pred_mse = 0
for i,model in enumerate(models_mse): 
    pred_mse += model.predict(test_fea[fea_cols]) * 0.2
test_fea['pred_mse'] = pred_mse

In [19]:
submit_mae = pd.DataFrame()
submit_mae['id']    = test_fea['用户编码'].values
submit_mae['score'] = test_fea['pred_mae'].values 
submit_mae['score'] = submit_mae['score'].astype(int)
submit_mae[['id','score']].to_csv('baseline_mae.csv',index = None)

In [23]:
submit_mae['score'].describe()

count    50000.000000
mean       618.780040
std         37.730256
min        476.000000
25%        598.000000
50%        628.000000
75%        646.000000
max        696.000000
Name: score, dtype: float64


## MSE提交（0.06359）

In [24]:
test_fea = test_fea.sort_values('pred_mae')
test_fea['ranks'] = list(range(test_fea.shape[0]))
test_fea['score'] = test_fea['pred_mae'].values
test_fea.loc[test_fea.ranks<10000,'score']  = test_fea.loc[test_fea.ranks< 10000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks< 10000,'pred_mae'].values * 0.6
test_fea.loc[test_fea.ranks>40000,'score']  = test_fea.loc[test_fea.ranks> 40000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks> 40000,'pred_mae'].values * 0.6
         

In [27]:
submit_mae_mse = pd.DataFrame()
submit_mae_mse['id']    = test_fea['用户编码'].values
submit_mae_mse['score'] = test_fea['score'].values 
submit_mae_mse['score'] = submit_mae_mse['score'].astype(int)
submit_mae_mse[['id','score']].to_csv('baseline_mae_mse.csv',index = None)

In [28]:
submit_mae_mse['score'].describe()

count    50000.000000
mean       618.586220
std         37.896524
min        471.000000
25%        598.000000
50%        628.000000
75%        646.000000
max        695.000000
Name: score, dtype: float64

# 结论

结论不多说,我们看下面的分数就行,和我们线下验证的结果是一致的,如果这边文章有帮助到您,欢迎<font color=red>转发or喜欢作者</font>.....

![](./pic/compare.png)