In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import re

warnings.filterwarnings('ignore')

In [3]:
# Date feature engineering
def get_inner_date_feature_eng(data):
    """Add calendar features derived from the ``date`` column.

    The ``date`` column is parsed with ``pd.to_datetime`` and the columns
    ``month``, ``year``, ``day``, ``dayofw``, ``dayofy``, ``week`` and
    ``quarter`` are written onto ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the extra columns added.
    """
    data['date'] = pd.to_datetime(data['date'])
    dates = data['date'].dt

    data["month"] = dates.month
    data["year"] = dates.year

    data["day"] = dates.day
    data["dayofw"] = dates.dayofweek
    data["dayofy"] = dates.dayofyear

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the documented replacement. It returns a nullable
    # UInt32 column, so cast back to the plain dtype `.dt.week` used to give
    # (int64, or float64 when missing dates are present).
    iso_week = dates.isocalendar().week
    data["week"] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    data["quarter"] = dates.quarter

    return data

# Load the calendar table and the raw training records.
wkd = pd.read_csv('data/wkd_v1.csv')
train = pd.read_csv('data/train_v2.csv')

# Left-join the calendar attributes onto every training row.
train = train.merge(wkd, how='left', left_on='date', right_on='ORIG_DT')

# Sum `amount` per (date, post_id) and broadcast the total back onto each row,
# then keep a single row per (date, post_id).
daily_keys = ['date', 'post_id']
train['amount_sum'] = train.groupby(daily_keys)['amount'].transform('sum')
k = train.drop_duplicates(daily_keys)

# One frame per post_id; each is modelled separately below.
k1 = k.loc[k['post_id'] == 'A']
k2 = k.loc[k['post_id'] == 'B']

In [4]:
# Post A: add the date features, then hold out the last rows for validation.
feature_cols = ['WKD_TYP_CD', 'month', 'year', 'day',
                'dayofw', 'dayofy', 'week', 'quarter']
n_valid = 30  # number of trailing rows used as the validation window

k = get_inner_date_feature_eng(k1.copy())
# The split below is positional, so order the rows chronologically first;
# this is a no-op when the CSV is already sorted by date.
k = k.sort_values('date', kind='mergesort')

# .copy() makes each feature frame independent of `k`, so the category casts
# applied in the training cell do not write into a slice.
train1 = k[feature_cols].iloc[:-n_valid].copy()
train1_y = k['amount_sum'].iloc[:-n_valid]

valid1 = k[feature_cols].iloc[-n_valid:].copy()
valid1_y = k['amount_sum'].iloc[-n_valid:]

In [6]:
# Load the test days and attach the calendar attributes (left join keeps every test row).
test_v1 = pd.read_csv('data/test_v2_day.csv')
test_v1 = pd.merge(test_v1, wkd, left_on='date', right_on='ORIG_DT', how='left')

# Take an explicit copy of the post-A rows: the feature function writes new
# columns onto the frame it receives, and that should not target a slice of test_v1.
test_v1_A = test_v1[test_v1['post_id'] == 'A'].copy()
test_v1_A = get_inner_date_feature_eng(test_v1_A)

# Model inputs for post A, in the same column order as the training features.
test = test_v1_A[['WKD_TYP_CD', 'month',
                  'year', 'day', 'dayofw', 'dayofy', 'week', 'quarter']].copy()

In [7]:
# Treat the calendar day-type code as a categorical feature in all three frames.
# astype() with a mapping returns new frames instead of assigning into slices.
cat_cols = ['WKD_TYP_CD', ]
cat_dtypes = {col: 'category' for col in cat_cols}
train1 = train1.astype(cat_dtypes)
valid1 = valid1.astype(cat_dtypes)
test = test.astype(cat_dtypes)

# NOTE(review): the objective is L1 while early stopping monitors the poisson
# metric, and the score reported afterwards is a MAPE variant -- confirm that
# this pairing is intentional.
params = {'learning_rate': 0.1,
          'boosting_type': 'gbdt',
          'objective': 'regression_l1',
          'metric': 'poisson',
          'seed': 2019,
          'verbosity': -1,
          }

# Declare the categorical columns on the Dataset; the validation set uses the
# training set as its reference, as the LightGBM Dataset docs recommend.
train_set = lgb.Dataset(train1, train1_y, categorical_feature=cat_cols)
val_set = lgb.Dataset(valid1, valid1_y, reference=train_set)

# `early_stopping_rounds=` / `verbose_eval=` were removed from lgb.train() in
# LightGBM 4.0; the callbacks below are the equivalent. `log_evaluation` was
# called `print_evaluation` before LightGBM 3.3, hence the fallback.
log_evaluation_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(params, train_set, num_boost_round=2000,
                  valid_sets=(train_set, val_set),
                  callbacks=[lgb.early_stopping(stopping_rounds=50),
                             log_evaluation_cb(period=50)],
                  )

# Predictions for the validation window and for the post-A test days.
oof_train = model.predict(valid1)
test_predict = model.predict(test)

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -154774	valid_1's poisson: -129425
[100]	training's poisson: -154836	valid_1's poisson: -129427
[150]	training's poisson: -154855	valid_1's poisson: -129428
[200]	training's poisson: -154860	valid_1's poisson: -129428
Early stopping, best iteration is:
[158]	training's poisson: -154856	valid_1's poisson: -129428


In [8]:
def mape(y_true, y_pred):
    """Return the mean of ``|y_true - y_pred| / (y_true + 1)``.

    Both inputs are converted to NumPy arrays first. The ``+ 1`` in the
    denominator keeps the ratio finite where ``y_true`` is zero.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = (actual - predicted) / (actual + 1)
    return np.abs(relative_error).mean()
# Attach the post-A test predictions as the `amount` column.
test_v1_A = test_v1_A.assign(amount=test_predict)
# Validation error for post A (displayed as the cell output).
mape(valid1_y, oof_train)

0.036296142571395004

In [9]:
# Post B: add the date features to the training rows.
k = get_inner_date_feature_eng(k2.copy())

# Take an explicit copy of the post-B test rows: the feature function writes new
# columns onto the frame it receives, and that should not target a slice of test_v1.
test_v1_B = test_v1[test_v1['post_id'] == 'B'].copy()
test_v1_B = get_inner_date_feature_eng(test_v1_B)

# Model inputs for post B, in the same column order as the training features.
test = test_v1_B[['WKD_TYP_CD', 'month',
                  'year', 'day', 'dayofw', 'dayofy', 'week', 'quarter']].copy()

In [10]:
# Post B: hold out the last rows for validation.
feature_cols = ['WKD_TYP_CD', 'month', 'year', 'day',
                'dayofw', 'dayofy', 'week', 'quarter']
n_valid = 30  # number of trailing rows used as the validation window

# The split below is positional, so order the rows chronologically first;
# this is a no-op when the data is already sorted by date.
k = k.sort_values('date', kind='mergesort')

# .copy() makes each feature frame independent of `k`, so the category casts
# applied in the training cell do not write into a slice.
train1 = k[feature_cols].iloc[:-n_valid].copy()
train1_y = k['amount_sum'].iloc[:-n_valid]

valid1 = k[feature_cols].iloc[-n_valid:].copy()
valid1_y = k['amount_sum'].iloc[-n_valid:]

In [11]:
# Treat the calendar day-type code as a categorical feature in all three frames.
# astype() with a mapping returns new frames instead of assigning into slices.
cat_cols = ['WKD_TYP_CD', ]
cat_dtypes = {col: 'category' for col in cat_cols}
train1 = train1.astype(cat_dtypes)
valid1 = valid1.astype(cat_dtypes)
test = test.astype(cat_dtypes)

# NOTE(review): the objective is L1 while early stopping monitors the poisson
# metric, and the score reported afterwards is a MAPE variant -- confirm that
# this pairing is intentional.
params = {'learning_rate': 0.1,
          'boosting_type': 'gbdt',
          'objective': 'regression_l1',
          'metric': 'poisson',
          'seed': 2019,
          'verbosity': -1,
          }

# Declare the categorical columns on the Dataset; the validation set uses the
# training set as its reference, as the LightGBM Dataset docs recommend.
train_set = lgb.Dataset(train1, train1_y, categorical_feature=cat_cols)
val_set = lgb.Dataset(valid1, valid1_y, reference=train_set)

# `early_stopping_rounds=` / `verbose_eval=` were removed from lgb.train() in
# LightGBM 4.0; the callbacks below are the equivalent. `log_evaluation` was
# called `print_evaluation` before LightGBM 3.3, hence the fallback.
log_evaluation_cb = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(params, train_set, num_boost_round=5000,
                  valid_sets=(train_set, val_set),
                  callbacks=[lgb.early_stopping(stopping_rounds=50),
                             log_evaluation_cb(period=50)],
                  )

# Predictions for the validation window and for the post-B test days.
oof_train = model.predict(valid1)
test_predict = model.predict(test)

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -20750.7	valid_1's poisson: -14507.9
[100]	training's poisson: -20757.6	valid_1's poisson: -14507.1
Early stopping, best iteration is:
[61]	training's poisson: -20754.5	valid_1's poisson: -14508.9


In [12]:
# Hand-tuned post-processing of the post-B test predictions: add a constant
# offset, then set anything still below the floor to zero.
# The result goes into a new variable so re-running this cell does not add the
# offset a second time to `test_predict`.
B_TEST_OFFSET = 666
B_MIN_AMOUNT = 1000
shifted_test_B = test_predict + B_TEST_OFFSET
test_predict_B = np.where(shifted_test_B < B_MIN_AMOUNT, 0, shifted_test_B)
test_v1_B = test_v1_B.assign(amount=test_predict_B)

In [13]:
# Same post-processing on the post-B validation predictions, written to a new
# variable so re-running this cell does not add the offset to `oof_train` again.
# NOTE(review): the offset here (250) differs from the one applied to the test
# predictions (666) -- confirm this is intentional.
B_VALID_OFFSET = 250
B_MIN_AMOUNT = 1000
shifted_valid_B = oof_train + B_VALID_OFFSET
oof_train_B = np.where(shifted_valid_B < B_MIN_AMOUNT, 0, shifted_valid_B)
# The mean over the 30 validation rows is rescaled by 30 / 21; presumably 21 is
# the number of days with non-zero amounts in the window -- TODO confirm.
mape(valid1_y, oof_train_B) * 30 / 21

0.06229676183366535

In [14]:
# Stack the per-post predictions and keep only the join keys plus the forecast.
stacked_preds = pd.concat([test_v1_A, test_v1_B], axis=0)[['date', 'post_id', 'amount']]

# The stacked frames carry parsed dates, so parse the original test dates too
# before joining on them.
test_v1['date'] = pd.to_datetime(test_v1['date'])

# Left-join the forecasts back onto the original test row order.
test_ = test_v1[['date', 'post_id']].merge(stacked_preds, how='left', on=['date', 'post_id'])

# Cast to integers (drops the fractional part of each prediction).
test_['amount'] = test_['amount'].astype('int')
# test_[['date', 'post_id', 'amount']].to_csv('sub_day_lgb_all_666.txt', index=False)
test_

Unnamed: 0,date,post_id,amount
0,2020-12-01,A,18553
1,2020-12-01,B,2839
2,2020-12-02,A,17331
3,2020-12-02,B,2842
4,2020-12-03,A,17168
...,...,...,...
57,2020-12-29,B,3637
58,2020-12-30,A,20914
59,2020-12-30,B,3633
60,2020-12-31,A,20606
