# 数据处理

In [1]:
# Load Env
import numpy as np
import pandas as pd 
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the prepared dataset with an explicit dtype for every typed column:
# region / plate codes are read as strings, flags and small categorical fields as
# integers, and `label` as float.
string_columns = [
    'plate_city_code',
    'warehouse_province_code',
    'warehouse_city_code',
    'warehouse_district_code',
    'shop_province_code',
    'shop_city_code',
    'shop_district_code',
]
integer_columns = [
    'warehouse_type',
    'is_same_province',
    'is_same_city',
    'is_same_district',
    'if_gps',
    'if_pay',
    'match_hour',
]

column_dtypes = dict.fromkeys(string_columns, str)
column_dtypes.update(dict.fromkeys(integer_columns, int))
column_dtypes['label'] = float

data = pd.read_csv('df_final.csv', dtype=column_dtypes)

In [3]:
def _code_name_pairs(frame, code_col, name_col, code_alias, name_alias):
    """Return the (code, name) columns of `frame` under unified column names."""
    return frame[[code_col, name_col]].rename(
        columns={code_col: code_alias, name_col: name_alias}).copy()


def _build_code_map(pair_frames, code_col, map_col):
    """Stack (code, name) frames and number the distinct codes 0..n-1 in sorted order.

    Duplicates are dropped on the code column alone, so every code appears exactly
    once even when the same code is paired with different (or missing) names; the
    first name encountered is kept. A map with repeated codes would make the left
    merges below duplicate rows.
    """
    code_map = (pd.concat(pair_frames)
                .drop_duplicates(subset=code_col)
                .sort_values(by=code_col)
                .reset_index(drop=True))
    code_map[map_col] = np.arange(len(code_map))
    return code_map


def _attach_map_code(frame, code_map, key_col, code_col, map_col, out_col):
    """Left-join the integer code for `frame[key_col]` onto `frame` as column `out_col`."""
    lookup = code_map[[code_col, map_col]].rename(
        columns={code_col: key_col, map_col: out_col})
    return frame.merge(lookup, how='left', on=key_col)


# Province map: every province code seen on the warehouse side or the shop side.
pro_map1 = _code_name_pairs(data, 'warehouse_province_code', 'warehouse_province_name',
                            'province_code', 'province_name')
pro_map2 = _code_name_pairs(data, 'shop_province_code', 'shop_province_name',
                            'province_code', 'province_name')
pro_map = _build_code_map([pro_map1, pro_map2], 'province_code', 'province_map_code')

# City map: every city code seen on the warehouse side, the shop side or the plate.
city_map1 = _code_name_pairs(data, 'warehouse_city_code', 'warehouse_city_name',
                             'city_code', 'city_name')
city_map2 = _code_name_pairs(data, 'shop_city_code', 'shop_city_name',
                             'city_code', 'city_name')
city_map3 = _code_name_pairs(data, 'plate_city_code', 'plate_city_name',
                             'city_code', 'city_name')
city_map = _build_code_map([city_map1, city_map2, city_map3], 'city_code', 'city_map_code')

# Replace each raw code column by its consecutive integer code.
df = data
for key_col, code_map, code_col, map_col, out_col in [
    ('plate_city_code', city_map, 'city_code', 'city_map_code', 'plate_city'),
    ('warehouse_province_code', pro_map, 'province_code', 'province_map_code', 'warehouse_province'),
    ('warehouse_city_code', city_map, 'city_code', 'city_map_code', 'warehouse_city'),
    ('shop_province_code', pro_map, 'province_code', 'province_map_code', 'shop_province'),
    ('shop_city_code', city_map, 'city_code', 'city_map_code', 'shop_city'),
]:
    df = _attach_map_code(df, code_map, key_col, code_col, map_col, out_col)

# A left join against a map with unique keys must neither add nor drop rows.
assert len(df) == len(data), 'code-map merge changed the number of rows'

# Keep only the encoded location columns, the numeric/flag features and the target.
df = df[['plate_city', 'warehouse_province', 'warehouse_city', 'warehouse_type',
         'shop_province', 'shop_city', 'if_gps', 'if_pay', 'match_hour',
         'distance', 'is_same_province', 'is_same_city', 'label']].copy()

# 手动调出来的模型

In [4]:
# Select needed data by column name rather than by position, so the selection keeps
# picking the intended columns even if the column order of `df` changes.
# `match_hour` is present in `df` but is not used as a feature.
feature_cols = ['plate_city', 'warehouse_province', 'warehouse_city', 'warehouse_type',
                'shop_province', 'shop_city', 'if_gps', 'if_pay',
                'distance', 'is_same_province', 'is_same_city']
target_col = 'label'

X = df[feature_cols].copy()
Y = df[target_col].copy()

# Set categorical features: every feature except `distance`, referenced by its
# position within X.
cat_feats_ind = [c for c, col in enumerate(X.columns) if col != 'distance']

# Split data into train (90%) and valid (10%) with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.1, random_state=101)

In [17]:
# Hand-tuned model (values found with random search followed by grid search).
# NOTE(review): the original note here said training runs on the CPU because the GPU
# supports at most 256 bins, yet `device` below is 'gpu' -- confirm which is intended.

params = dict(
    boosting_type='gbdt',            # boosting method
    metric='rmse',                   # metric used to evaluate the model
    is_unbalance='false',            # unbalanced-data switch | only relevant for classification

    # Leaf-wise tree
    max_depth=-1,                    # maximum tree depth | -1 removes the limit
    num_leaves=48,                   # number of leaves per tree

    # For better accuracy
    max_bin=255,                     # maximum number of bins
    learning_rate=0.048,             # learning rate
    n_estimators=100000,             # upper bound on boosting iterations

    # Solving over-fitting
    min_data_in_leaf=20,             # minimum amount of data in one leaf
    min_sum_hessian_in_leaf=1e-3,    # minimum sum of hessians in one leaf
    bagging_fraction=0.7,            # fraction of the data used for bagging
    bagging_freq=1,                  # bag every `value` iterations | 0 disables bagging
    feature_fraction=0.6,            # fraction of features randomly selected per iteration
    lambda_l1=1,                     # L1 regularisation
    lambda_l2=0.8,                   # L2 regularisation
    min_data_per_group=838,          # minimum data per categorical group
    cat_smooth=32,                   # smoothing for categorical features with few samples

    # Device settings
    device='gpu',                    # training device | 'cpu' / 'gpu'
    gpu_platform_id=0,               # GPU platform id
    gpu_device_id=1,                 # GPU device id
    n_jobs=12,
)

# Every entry of `params` is forwarded to the estimator under its own name.
model = lgb.LGBMRegressor(**params)

# Evaluate on both the training and the validation split while fitting.
eval_pairs = [(X_train, Y_train), (X_valid, Y_valid)]

model.fit(
    X_train,
    Y_train,
    eval_set=eval_pairs,
    eval_metric='rmse',
    categorical_feature=cat_feats_ind,
    early_stopping_rounds=300,
    verbose=100,
)

Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 2.65035	valid_1's rmse: 2.66994
[200]	training's rmse: 2.60621	valid_1's rmse: 2.65507
[300]	training's rmse: 2.58259	valid_1's rmse: 2.6487
[400]	training's rmse: 2.56515	valid_1's rmse: 2.64547
[500]	training's rmse: 2.55086	valid_1's rmse: 2.64172
[600]	training's rmse: 2.53995	valid_1's rmse: 2.64206
[700]	training's rmse: 2.53104	valid_1's rmse: 2.64042
[800]	training's rmse: 2.52319	valid_1's rmse: 2.64002
[900]	training's rmse: 2.51536	valid_1's rmse: 2.64028
[1000]	training's rmse: 2.50943	valid_1's rmse: 2.64101
[1100]	training's rmse: 2.50392	valid_1's rmse: 2.64228
Early stopping, best iteration is:
[816]	training's rmse: 2.52169	valid_1's rmse: 2.63921


LGBMRegressor(bagging_fraction=0.7, bagging_freq=1, boosting_type='gbdt',
              cat_smooth=32, class_weight=None, colsample_bytree=1.0,
              device='gpu', feature_fraction=0.6, gpu_device_id=1,
              gpu_platform_id=0, importance_type='split', is_unbalance='false',
              lambda_l1=1, lambda_l2=0.8, learning_rate=0.048, max_bin=255,
              max_depth=-1, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_data_in_leaf=20,
              min_data_per_group=838, min_split_gain=0.0,
              min_sum_hessian_in_leaf=0.001, n_estimators=100000, n_jobs=12,
              num_leaves=48, objective=None, random_state=None, reg_alpha=0.0, ...)

# TPE - hyperopt

In [7]:
from hyperopt import fmin, tpe, hp

In [8]:
# Wrap both splits as LightGBM Datasets. The raw data is kept (free_raw_data=False),
# and the validation set is tied to the training set through `reference`.
train = lgb.Dataset(data=X_train, label=Y_train, free_raw_data=False)
valid = lgb.Dataset(data=X_valid, label=Y_valid, reference=train, free_raw_data=False)

In [9]:
# Search-space conventions:
#   * integer-valued parameter -> hp.quniform(label, low, high, step); the sampled value
#     must additionally be wrapped in int() where the training parameters are built
#   * real-valued parameter    -> hp.uniform(label, low, high)
#   * categorical parameter    -> hp.choice(label, [options])
#       e.g. 'boosting_type': hp.choice('boosting_type',
#                                       [{'boosting_type': 'gbdt',
#                                         'subsample': hp.uniform('gbdt_subsample', 0.5, 1)},
#                                        {'boosting_type': 'dart',
#                                         'subsample': hp.uniform('dart_subsample', 0.5, 1)},
#                                        {'boosting_type': 'goss'}
#                                       ])
#       i.e. choosing 'gbdt' also requires choosing a subsample value, whereas 'goss'
#       has no subsample parameter, so nothing further is chosen for it.
#
# learning_rate is not part of the search; its candidate entry
# hp.uniform('learning_rate', 0.001, 0.1) is disabled.

space = dict(
    num_leaves=hp.quniform('num_leaves', 30, 151, 1),
    max_bin=hp.quniform('max_bin', 32, 256, 1),
    feature_fraction=hp.uniform('feature_fraction', 0.4, 1),
    bagging_fraction=hp.uniform('bagging_fraction', 0.4, 1),
    bagging_freq=hp.quniform('bagging_freq', 0, 100, 1),
    lambda_l1=hp.uniform('lambda_l1', 0, 5),
    lambda_l2=hp.uniform('lambda_l2', 0, 5),
    min_data_per_group=hp.quniform('min_data_per_group', 50, 1000, 1),
    cat_smooth=hp.uniform('cat_smooth', 5, 100),
)

In [12]:
def objective(space):
    """Hyperopt objective: cross-validated RMSE of LightGBM for one sampled configuration.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space. The values for `num_leaves`,
        `max_bin`, `bagging_freq` and `min_data_per_group` are cast to int before
        being handed to LightGBM; the remaining values are used as sampled.

    Returns
    -------
    float
        The smallest mean RMSE over the boosting rounds of a 3-fold `lgb.cv` run
        (at most 100000 rounds, early stopping after 100 rounds).

    Raises
    ------
    ValueError
        If the shared `train` Dataset no longer holds its raw data, so a per-trial
        Dataset cannot be rebuilt from it.
    """
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'is_unbalance': 'false',

        # Leaf-wise tree
        'max_depth': -1,
        'num_leaves': int(space['num_leaves']),

        # For better accuracy
        'max_bin': int(space['max_bin']),
        'learning_rate': 0.05,              # fixed, not searched

        # Solving over-fitting
        'min_data_in_leaf': 20,
        'min_sum_hessian_in_leaf': 1e-3,
        'bagging_fraction': space['bagging_fraction'],
        'bagging_freq': int(space['bagging_freq']),
        'feature_fraction': space['feature_fraction'],
        'lambda_l1': space['lambda_l1'],
        'lambda_l2': space['lambda_l2'],
        'min_data_per_group': int(space['min_data_per_group']),
        'cat_smooth': space['cat_smooth'],

        # Device settings
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 1,
        'n_jobs': 12
    }

    # A Dataset is binned once, on first use. Re-using the shared `train` object for
    # every trial can therefore leave later trials running with the first trial's
    # `max_bin` (the exact behaviour depends on the LightGBM version). Building a
    # fresh Dataset from the same raw data guarantees that this trial's `max_bin`
    # is the one actually applied.
    if train.data is None:
        raise ValueError('`train` must be created with free_raw_data=False '
                         'so that a per-trial Dataset can be rebuilt from its raw data')
    trial_set = lgb.Dataset(train.data, label=train.get_label(), free_raw_data=False)

    cv_results = lgb.cv(
        params=params,
        train_set=trial_set,
        num_boost_round=100000,
        nfold=3,
        stratified=False,  # plain (non-stratified) folds
        early_stopping_rounds=100,
        categorical_feature=cat_feats_ind,
        verbose_eval=None
    )

    return min(cv_results['rmse-mean'])

In [13]:
# TODO: use a hyperopt Trials object to add early stopping of the search and plotting
# while it is running; to be tried next time.

# Run 5000 TPE evaluations of `objective` over `space` and keep the best sample.
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=5000)

100%|██████████| 5000/5000 [32:34:04<00:00, 23.45s/it, best loss: 2.6812662515756034]   


In [16]:
# Refit with fixed hyper-parameter values.
# NOTE(review): these values look like a rounded copy of the hyperopt result `best`;
# confirm against the search output.
params = dict(
    boosting_type='gbdt',
    metric='rmse',
    is_unbalance='false',

    # Leaf-wise tree
    max_depth=-1,
    num_leaves=143,

    # For better accuracy
    max_bin=74,
    learning_rate=0.05,
    n_estimators=10000,

    # Solving over-fitting
    min_data_in_leaf=20,
    min_sum_hessian_in_leaf=1e-3,
    bagging_fraction=0.98,
    bagging_freq=9,
    feature_fraction=0.7,
    lambda_l1=4.8,
    lambda_l2=3.7,
    min_data_per_group=273,
    cat_smooth=85.2,

    # Device settings
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=1,
    n_jobs=12,
)

# Every entry of `params` is forwarded to the estimator under its own name.
model = lgb.LGBMRegressor(**params)

# Evaluate on both the training and the validation split while fitting.
eval_pairs = [(X_train, Y_train), (X_valid, Y_valid)]

model.fit(
    X_train,
    Y_train,
    eval_set=eval_pairs,
    eval_metric='rmse',
    categorical_feature=cat_feats_ind,
    early_stopping_rounds=300,
    verbose=100,
)

Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 2.61517	valid_1's rmse: 2.65507
[200]	training's rmse: 2.56058	valid_1's rmse: 2.64473
[300]	training's rmse: 2.53353	valid_1's rmse: 2.64477
[400]	training's rmse: 2.51553	valid_1's rmse: 2.64764
[500]	training's rmse: 2.50134	valid_1's rmse: 2.65154
Early stopping, best iteration is:
[270]	training's rmse: 2.5402	valid_1's rmse: 2.64371


LGBMRegressor(bagging_fraction=0.98, bagging_freq=9, boosting_type='gbdt',
              cat_smooth=85.2, class_weight=None, colsample_bytree=1.0,
              device='gpu', feature_fraction=0.7, gpu_device_id=1,
              gpu_platform_id=0, importance_type='split', is_unbalance='false',
              lambda_l1=4.8, lambda_l2=3.7, learning_rate=0.05, max_bin=74,
              max_depth=-1, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_data_in_leaf=20,
              min_data_per_group=273, min_split_gain=0.0,
              min_sum_hessian_in_leaf=0.001, n_estimators=10000, n_jobs=12,
              num_leaves=143, objective=None, random_state=None, reg_alpha=0.0, ...)