# Load Data

In [1]:
# Load Data
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
path = 'D:/DCL/Downloads/ECAA_data/'
train_csv = 'train.csv'
test_csv = 'test.csv'
id_col = 'article_id'
target = 'orders_3h_15h'

def reduce_mem_usage(df, verbose=True):
    '''自定义用来降低内存空间的函数'''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_data():
    print('reading data...')
    df_train = pd.read_csv(path+train_csv)
    df_test = pd.read_csv(path+test_csv)
    df_train = reduce_mem_usage(df_train)
    df_test = reduce_mem_usage(df_test)
    print('train data shape: ', df_train.shape)
    print('test  data shape: ', df_test.shape)
    return df_train, df_test

train, test = read_data()
# filter level
test_level2_lst = test['level2'].unique()
test_level3_lst = test['level3'].unique()
test_level4_lst = test['level4'].unique()
train = train[train['level2'].isin(test_level2_lst)]
train = train[train['level3'].isin(test_level3_lst)]
train = train[train['level4'].isin(test_level4_lst)]
# # filter baike id
# test_baike_lst = test['baike_id_2h'].unique()
# train = train[train['baike_id_2h'].isin(test_baike_lst)] #1348748
# filter price
# train = train[(train['price']<1e6) | (train['price'].isnull())]
# test.loc[test['price'] > 1e6, 'price'] = 1e6

df_feature = pd.concat([train, test], axis=0)
print('after filter, the train data shape is :', train.shape)

reading data...
Mem. usage decreased to 103.44 Mb (70.0% reduction)
Mem. usage decreased to  8.42 Mb (69.3% reduction)
train data shape:  (1807657, 25)
test  data shape:  (149589, 24)
after filter, the train data shape is : (1804712, 25)


# Feature Engineer

## Make New Features

In [2]:
# Feature Engineer
## Make New Features

# diffirent type feature cols
category_cols = ['date','author','level1','level2','level3','level4','brand','mall','url','baike_id_2h']
numeric_cols = [col for col in train.columns if col not in category_cols+[id_col, target]]
# target encode cols
target_encode_cols = ['author','level1','level2','level3','level4','brand','mall','url','baike_id_2h'] + ['price_20_bin']

def make_action_feature(df):
    #print('create new features...')
    ## action sum
    #df['action_sum_1h'] = df['comments_1h'] + df['zhi_1h'] + df['favorite_1h']
    df['action_sum_2h'] = df['comments_2h'] + df['zhi_2h'] + df['favorite_2h']
    #df['action_sum'] = df['action_sum_1h'] + df['action_sum_2h']
    # zhi buzhi
    df['total_zhi'] = df['zhi_1h'] + df['zhi_2h']
    df['total_buzhi'] = df['buzhi_1h'] + df['buzhi_2h']
    df['zhi_1h_ratio'] = df['zhi_1h'] / (df['zhi_1h'] + df['buzhi_1h'] + 1e-5)
    df['zhi_2h_ratio'] = df['zhi_2h'] / (df['zhi_2h'] + df['buzhi_2h'] + 1e-5)
    ## comments
    df['comments_perc'] = df['comments_2h'] / (df['comments_1h'] + 1e-5)
    ## favorite
    df['favorite_perc'] = df['favorite_2h'] / (df['favorite_1h'] + 1e-5)
    ## action perc
    df['comments_perc_2h'] = df['comments_2h'] / (df['action_sum_2h'] + 1e-5)
    df['zhi_perc_2h'] = df['zhi_2h'] / (df['action_sum_2h'] + 1e-5)
    df['buzhi_perc_2h'] = df['buzhi_2h'] / (df['action_sum_2h'] + 1e-5)
    df['favorite_perc_2h'] = df['favorite_2h'] / (df['action_sum_2h'] + 1e-5)
    ## cvr feature
    df['comments_cvr_2h'] = df['orders_2h'] / (df['comments_2h'] + 1e-5)
    df['zhi_cvr_2h'] = df['orders_2h'] / (df['zhi_2h'] + 1e-5)
    df['favorite_cvr_2h'] = df['favorite_2h'] / (df['zhi_2h'] + 1e-5)
    return df

def make_price_feature(df):
    ## price bin
    df['price_20_bin'] = pd.qcut(df['price'], 20, duplicates='drop')
    df['price_20_bin'] = df['price_20_bin'].cat.codes
#     price_dummy_df = pd.get_dummies(df['price_20_bin'], prefix='price')
#     df = pd.concat([df, price_dummy_df], axis=1)
    ## price min/max/diff
    df['price_max'] = df.groupby('url')['price'].transform('min')
    df['price_min'] = df.groupby('url')['price'].transform('max') 
    df['price_increase'] = df['price'] - df['price_min']
    df['price_decrease'] = df['price_max'] - df['price']
    return df

def count_encoding(df, count_cols):
    #print('count encoding...')
    for col in tqdm(count_cols):
        df[f'{col}_count'] = df[col].map(df[col].value_counts())
    return df

def make_order_stat_feature(df, orders='orders_2h'):
    order_group_mean_cols = ['date','author','level1','level2','level3','level4','brand','url','baike_id_2h']+['price_20_bin']
    order_group_std_cols = ['date','author','level3','brand','baike_id_2h']
    for col in tqdm(order_group_mean_cols):
        df[f'{orders}_by_{col}_mean'] = df.groupby(col)[orders].transform('mean')
    for col in tqdm(order_group_std_cols):
        df[f'{orders}_by_{col}_std'] = df.groupby(col)[orders].transform('std')
    return df

def make_price_stat_feature(df, price='price'):
    price_mean_cols = ['level1','level2','level3','level4','author','brand','baike_id_2h']
    price_std_cols = ['level3','author','brand','baike_id_2h']
    for col in tqdm(price_mean_cols):
        df[f'{price}_by_{col}_mean'] = df.groupby(col)[price].transform('mean')
    for col in tqdm(price_std_cols):
        df[f'{price}_by_{col}_std'] = df.groupby(col)[price].transform('std')
#     for col in tqdm(price_min_max_cols):
#         df[f'{price}_by_{col}_min'] = df.groupby(col)[price].transform('min')
#         df[f'{price}_by_{col}_max'] = df.groupby(col)[price].transform('max')
    return df

def cross_category_feature(df, cate1, cate2):
    df[f'{cate1}_{cate2}_count'] = df.groupby(cate1)[cate2].transform('count')
    df[f'{cate1}_{cate2}_nunique'] = df.groupby(cate1)[cate2].transform('nunique')
    df[f'{cate1}_{cate2}_count_nunique'] = df[f'{cate1}_{cate2}_count'] / df[f'{cate1}_{cate2}_nunique']
    return df

def cross_stat_feature(df, label):
    cross_group_cols = ['level2','level3','author','brand','baike_id_2h']
    for col in tqdm(cross_group_cols):
        df[f'{label}_{col}_mean_per_date'] = df.groupby(['date',col])[label].transform('mean')
    return df

# 每个商品销量(orders 2h)的滞后特征
# 每个date平均销量的滞后特征
# 每个level1/2/3/4平均销量的滞后特征
# 每个brand平均销量/总销量 的滞后特征
# 每个mall平均销量/总销量 的滞后特征
# 每个商品价格(price)的滞后特征
def make_lag_feature(df, lags, label, col):
    for i in lags:
        temp = df[['date',col,label]].copy()
        shifted = temp.groupby(['date',col])[label].mean().reset_index()
        shifted.columns = ['date',col,f'{label}_{col}_lag_{i}_adv']
        shifted['date'] += 1
        df = pd.merge(df, shifted, on=['date',col], how='left')
    print(f'The {label}_{col} lag feature ok!')
    del temp, shifted
    return df

def all_lag_features(df):
    orders_lag_cols = ['level3','level2','author','brand','mall','url']
    #price_lag_cols = ['level4','mall','url']
    df = make_lag_feature(df, lags=[1,2,3,6,12], label='orders_2h', col='baike_id_2h')
    df = make_lag_feature(df, lags=[1,2,3,6,12], label='price', col='baike_id_2h')
    for lag_col in orders_lag_cols:
        df = make_lag_feature(df, lags=[1], label='orders_2h', col=lag_col)
#     for lag_col in price_lag_cols:
#         df = make_lag_feature(df, lags=[1], label='price', col=lag_col)
    return df

def process_data(df):
    df = make_action_feature(df)
    df = make_price_feature(df)
    df = count_encoding(df, count_cols=category_cols)
    df = make_order_stat_feature(df, orders='orders_2h')
    df = make_price_stat_feature(df, price='price')
    df = cross_category_feature(df, cate1='baike_id_2h', cate2='author')
    df = cross_category_feature(df, cate1='level3', cate2='baike_id_2h')
    df = cross_category_feature(df, cate1='brand', cate2='baike_id_2h')
    df = cross_stat_feature(df, label='orders_2h')
    df = all_lag_features(df)
    return df

df_feature = process_data(df_feature)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 12.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.28it/s]


The orders_2h_baike_id_2h lag feature ok!
The price_baike_id_2h lag feature ok!
The orders_2h_level3 lag feature ok!
The orders_2h_level2 lag feature ok!
The orders_2h_author lag feature ok!
The orders_2h_brand lag feature ok!
The orders_2h_mall lag feature ok!
The orders_2h_url lag feature ok!


## Target Encoding

In [3]:
## Target Encoding

### Groupby statistic feature (mean\std\min\max\median)
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()

    return df_merge

def statis_feat(df_know, df_unknow):
    '''只需要修改target_encode_cols'''
    for f in tqdm(target_encode_cols):
        df_unknow = stat(df_know, df_unknow, [f], {target: ['mean','std']})
    return df_unknow

## 5折交叉 target encoding
train = df_feature[~df_feature[target].isnull()]
train = train.reset_index(drop=True)
test = df_feature[df_feature[target].isnull()]

df_stas_feat = None
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
print('target encoding...')
for tra_index, val_index in kfold.split(train, train[target]):
    df_fold_train = train.iloc[tra_index]
    df_fold_val = train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

test = statis_feat(train, test)
df_feature = pd.concat([df_stas_feat, test], axis=0)

del(df_stas_feat)
del(train)
del(test)
gc.collect()

target encoding...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.51it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.86it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.46it/s]


23

# Modeling

## Split data for train & test

In [4]:
## Modeling
# Split data for train & test
train = df_feature[df_feature[target].notnull()]
test = df_feature[df_feature[target].isnull()]

useless_cols = [id_col, target, 'baike_id_1h']
all_cols = [col for col in train.columns if col not in useless_cols]
x_train = train[all_cols]
x_test = test[all_cols]
y_train = train[target]

print(x_train.shape)
print(x_test.shape)
x_train.head()

(1804712, 127)
(149589, 127)


Unnamed: 0,date,price,price_diff,author,level1,level2,level3,level4,brand,mall,url,comments_1h,zhi_1h,buzhi_1h,favorite_1h,orders_1h,baike_id_2h,comments_2h,zhi_2h,buzhi_2h,favorite_2h,orders_2h,action_sum_2h,total_zhi,total_buzhi,zhi_1h_ratio,zhi_2h_ratio,comments_perc,favorite_perc,comments_perc_2h,zhi_perc_2h,buzhi_perc_2h,favorite_perc_2h,comments_cvr_2h,zhi_cvr_2h,favorite_cvr_2h,price_20_bin,price_max,price_min,price_increase,price_decrease,date_count,author_count,level1_count,level2_count,level3_count,level4_count,brand_count,mall_count,url_count,baike_id_2h_count,orders_2h_by_date_mean,orders_2h_by_author_mean,orders_2h_by_level1_mean,orders_2h_by_level2_mean,orders_2h_by_level3_mean,orders_2h_by_level4_mean,orders_2h_by_brand_mean,orders_2h_by_url_mean,orders_2h_by_baike_id_2h_mean,orders_2h_by_price_20_bin_mean,orders_2h_by_date_std,orders_2h_by_author_std,orders_2h_by_level3_std,orders_2h_by_brand_std,orders_2h_by_baike_id_2h_std,price_by_level1_mean,price_by_level2_mean,price_by_level3_mean,price_by_level4_mean,price_by_author_mean,price_by_brand_mean,price_by_baike_id_2h_mean,price_by_level3_std,price_by_author_std,price_by_brand_std,price_by_baike_id_2h_std,baike_id_2h_author_count,baike_id_2h_author_nunique,baike_id_2h_author_count_nunique,level3_baike_id_2h_count,level3_baike_id_2h_nunique,level3_baike_id_2h_count_nunique,brand_baike_id_2h_count,brand_baike_id_2h_nunique,brand_baike_id_2h_count_nunique,orders_2h_level2_mean_per_date,orders_2h_level3_mean_per_date,orders_2h_author_mean_per_date,orders_2h_brand_mean_per_date,orders_2h_baike_id_2h_mean_per_date,orders_2h_baike_id_2h_lag_1_adv,orders_2h_baike_id_2h_lag_2_adv,orders_2h_baike_id_2h_lag_3_adv,orders_2h_baike_id_2h_lag_6_adv,orders_2h_baike_id_2h_lag_12_adv,price_baike_id_2h_lag_1_adv,price_baike_id_2h_lag_2_adv,price_baike_id_2h_lag_3_adv,price_baike_id_2h_lag_6_adv,price_baike_id_2h_lag_12_adv,orders_2h_level3_lag_1_adv,orders_2h_level2_lag_1_adv,orders_2h_author_lag_1_adv,orders_2h_brand_lag_1_adv,orders_2h_mall_lag_1_adv,orders_2h_url_lag_1_adv,author_orders_3h_15h_mean,author_orders_3h_15h_std,level1_orders_3h_15h_mean,level1_orders_3h_15h_std,level2_orders_3h_15h_mean,level2_orders_3h_15h_std,level3_orders_3h_15h_mean,level3_orders_3h_15h_std,level4_orders_3h_15h_mean,level4_orders_3h_15h_std,brand_orders_3h_15h_mean,brand_orders_3h_15h_std,mall_orders_3h_15h_mean,mall_orders_3h_15h_std,url_orders_3h_15h_mean,url_orders_3h_15h_std,baike_id_2h_orders_3h_15h_mean,baike_id_2h_orders_3h_15h_std,price_20_bin_orders_3h_15h_mean,price_20_bin_orders_3h_15h_std
0,1,5969.0,-597.799988,21562,8,53,63,212,15037,113,465683,0,0,0,0,0,104,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,5969.0,5969.0,0.0,0.0,14295,33,137973,41612,1343,126,5765,924039,1,1,0.190906,0.0,0.094207,0.064477,0.03723,0.063492,0.063313,0.0,0.0,0.055872,0.765768,0.0,0.259167,0.417747,,2535.034912,6155.075684,42675.386719,5831.062012,22651.587891,12580.171875,5969.0,297329.379029,104066.924776,56031.563634,,1,1,1.0,1343,204,6.583333,5765,493,11.693712,0.009554,0.0,0.0,0.025641,0.0,,,,,,,,,,,,,,,,,0.0625,0.25,0.481633,1.141878,0.398832,1.014817,0.422805,1.041485,0.673077,1.35419,0.324394,0.93873,1.002347,1.678817,,,,,0.354899,0.987867
1,1,139.0,20.0,69683,7,22,0,0,31110,113,328023,0,0,0,0,0,92240,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,121.5,165.0,-26.0,-17.5,14295,78935,127689,3554,104270,1432595,2315,924039,3,1,0.190906,0.107126,0.171941,0.138998,0.243445,0.231487,0.159827,0.0,0.0,0.14577,0.765768,0.533046,0.886777,0.623254,,2283.230713,300.38974,652.110779,1614.579834,379.607086,164.286148,139.0,24122.457132,1271.23709,141.639853,,1,1,1.0,104270,16735,6.230654,2315,401,5.773067,0.131579,0.188131,0.088061,0.111111,0.0,,,,,,,,,,,,,,,,,0.603717,1.261556,0.689226,1.400561,0.584057,1.306569,0.773442,1.530235,0.796477,1.514542,0.741123,1.411981,1.002347,1.678817,0.0,0.0,,,0.671951,1.359837
2,1,99.0,,63556,19,2,753,0,4034,113,382808,0,0,0,0,0,88894,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,99.0,99.0,0.0,0.0,14295,379,402440,79530,6592,1432595,113,924039,4,4,0.190906,0.664908,0.462869,0.419615,0.508192,0.231487,0.079646,0.0,0.0,0.174199,0.765768,1.442407,1.25213,0.331167,0.0,174.190063,29.184385,61.035213,1614.579834,86.468071,92.205002,115.0,50.9196,266.539127,62.672107,23.10844,4,4,1.0,6592,813,8.108241,113,13,8.692308,0.2128,0.163265,0.0,0.0,0.0,,,,,,,,,,,,,,,,,1.622378,2.030657,1.303086,1.886818,1.191277,1.827642,1.45705,1.961478,0.796477,1.514542,0.550562,1.234092,1.002347,1.678817,0.0,0.0,,,0.745457,1.435962
3,1,56.700001,0.0,69683,17,100,0,0,25204,55,4708,0,0,0,0,0,219783,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,56.700001,56.700001,0.0,0.0,14295,78935,162347,48145,104270,1432595,467,7356,5,1213182,0.190906,0.107126,0.317825,0.28491,0.243445,0.231487,0.130621,0.0,0.2223,0.252552,0.765768,0.533046,0.886777,0.587846,0.808585,100.380104,71.149666,652.110779,1614.579834,379.607086,98.53653,1659.035767,24122.457132,1271.23709,71.974835,626247.622926,1213182,58527,20.728587,104270,16735,6.230654,467,79,5.911392,0.154128,0.188131,0.088061,0.0,0.154231,,,,,,,,,,,,,,,,,0.603717,1.261556,1.066544,1.738804,0.939755,1.662546,0.773442,1.530235,0.796477,1.514542,0.373219,1.055744,0.211178,0.779264,0.0,0.0,0.747024,1.453944,0.929376,1.615963
4,1,65.0,-0.2,19867,19,1,3,0,16881,113,598685,2,14,7,14,0,90182,14,8,1,74,0,96,22,8,0.666666,0.888888,6.999965,5.285711,0.145833,0.083333,0.010417,0.770833,0.0,0.0,9.249988,10,61.599998,68.0,-3.0,-3.400002,14295,9,402440,51798,15286,1432595,7,924039,3,2,0.190906,0.555556,0.462869,0.687671,0.263247,0.231487,0.571429,0.0,0.0,0.243296,0.765768,1.333333,0.849202,1.511858,0.0,174.190063,57.873562,85.545372,1614.579834,107.535553,73.304283,66.5,355.914089,98.211605,34.6789,2.12132,2,1,2.0,15286,1327,11.519216,7,3,2.333333,0.259375,0.157025,0.0,0.0,0.0,,,,,,,,,,,,,,,,,1.2,1.30384,1.303086,1.886818,1.777265,2.12753,0.971165,1.602336,0.796477,1.514542,0.0,0.0,1.002347,1.678817,0.0,0.0,0.0,,0.89631,1.5821


## LightGBM

In [5]:
### LightGBM
## 作为baseline部分仅使用经典的LightGBM作为训练模型，我们还能尝试XGBoost、CatBoost和NN（神经网络）
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
    folds = 5
    seed = 2021
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    #kfold = KFold(n_splits=folds, shuffle=False)
    #kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    categorical_feature = ['author','level1','level2','level3','level4','brand','mall','url','baike_id_2h']

    train_pred_lst = np.zeros(train_x.shape[0])
    test_pred_lst = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kfold.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y.iloc[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y.iloc[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'mse',
            'min_child_weight': 5,
            'num_leaves': 2 ** 7,
            'lambda_l2': 10,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'bagging_freq': 4,
            'learning_rate': 0.1,
            'seed': 2021,
            'n_jobs': -1,
            'silent': True,
            'verbose': -1,
        }

        model = clf.train(params, train_matrix, 
                          num_boost_round=10000, 
                          valid_sets=[train_matrix, valid_matrix], 
                          verbose_eval=500,
                          early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        train_pred_lst[valid_index] = val_pred
        test_pred_lst += test_pred / kfold.n_splits
        cv_scores.append(round(mean_squared_error(val_y, val_pred), 5))
        
        print(cv_scores)
    
    mean_mse = round(np.mean(cv_scores), 5)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train_pred_lst, test_pred_lst, mean_mse

lgb_train, lgb_test, lgb_score = cv_model(lgb, x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[500]	training's l2: 0.984211	valid_1's l2: 1.15545
[1000]	training's l2: 0.864085	valid_1's l2: 1.15126
[1500]	training's l2: 0.771181	valid_1's l2: 1.15086
Early stopping, best iteration is:
[1366]	training's l2: 0.793786	valid_1's l2: 1.15054
[1.15054]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[500]	training's l2: 0.98524	valid_1's l2: 1.15073
[1000]	training's l2: 0.863748	valid_1's l2: 1.14656
[1500]	training's l2: 0.769763	valid_1's l2: 1.14483
Early stopping, best iteration is:
[1779]	training's l2: 0.725211	valid_1's l2: 1.14438
[1.15054, 1.14438]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[500]	training's l2: 0.986051	valid_1's l2: 1.15186
[1000]	trainin

# Submit

In [6]:
### Submit
submit_path = 'C:/Users/Administrator/Python_Learning/Competition/ECAA/submit/'
submit = test[[id_col]]
submit[target] = lgb_test
submit[target] = submit[target].apply(lambda x: x if x >= 0 else 0)
submit.to_csv(submit_path+f'submission_{lgb_score}.csv', index=False)