In [1]:
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.ensemble import ExtraTreesRegressor
import catboost
import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
import pickle
from tqdm import tqdm

In [2]:
def sMAPE(y_true, y_predict, shift=0):
    return 2 * np.mean(
        np.abs(y_true - y_predict) /
        (np.abs(y_true) + np.abs(y_predict) + shift))

In [3]:
def sMAPE_log(y_predict, y_true):
    if isinstance(y_true, lgb.Dataset):
        y_true = y_true.label
    y_true = np.expm1(y_true)
    y_predict = np.expm1(y_predict)
    return ('sMAPE', 2 * np.mean(
        np.abs(y_true - y_predict) /
        (np.abs(y_true) + np.abs(y_predict))),False)

In [4]:
pd.options.display.max_columns=40

In [5]:
set_name = 'set1'
path_train_set = 'taxi-idao/data/train/{}.csv'.format(set_name)

data = pd.read_csv(path_train_set)
data.datetime = data.datetime.apply(
    lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
data = data.sort_values('datetime')
data.head()

Unnamed: 0,datetime,num_orders
0,2018-03-01 00:00:00,0
1,2018-03-01 00:01:00,0
2,2018-03-01 00:02:00,0
3,2018-03-01 00:03:00,0
4,2018-03-01 00:04:00,1


In [6]:
target_positions = {
    'set1': [10, 30, 45, 60, 75],
    'set2': [5, 10, 15, 20, 25],
    'set3': [5, 7, 9, 11, 13]
}[set_name]
target_positions

[10, 30, 45, 60, 75]

In [7]:
HOUR_IN_MINUTES = 60
DAY_IN_MINUTES = 24 * HOUR_IN_MINUTES
WEEK_IN_MINUTES = 7 * DAY_IN_MINUTES

MAX_TIME = DAY_IN_MINUTES

In [8]:
samples = {
    'datetime': [],
    'history': []}

for position in target_positions:
    samples['target_{}'.format(position)] = []
    
num_orders = data.num_orders.values

In [9]:
# start after 2 weeks because of history
# finish earlier because of target calculation
for i in range(2 * WEEK_IN_MINUTES,
               len(num_orders) - 2 * DAY_IN_MINUTES):
    
    samples['datetime'].append(data.datetime[i])
    samples['history'].append(num_orders[i-2*WEEK_IN_MINUTES:i])
    
    # cumsum not for all array because of time economy
    cumsum_num_orders = num_orders[i+1:i+1+2*DAY_IN_MINUTES].cumsum()
    for position in target_positions:
        orders_by_positions = np.where(cumsum_num_orders >= position)[0]
        if len(orders_by_positions):
            time = orders_by_positions[0] + 1
        else:
            # if no orders in last days
            time = MAX_TIME
        samples['target_{}'.format(position)].append(time)

In [10]:
for k in samples.keys():
    print(k, len(samples[k]))

datetime 241920
target_60 241920
target_75 241920
target_30 241920
target_45 241920
target_10 241920
history 241920


In [11]:
df = pd.DataFrame.from_dict(samples)

In [12]:
df['weekday'] = df.datetime.apply(lambda x: x.weekday())
df['hour'] = df.datetime.apply(lambda x: x.hour)
df['minute'] = df.datetime.apply(lambda x: x.minute)
df['daymin_sin'] = np.sin(2 * np.pi * (df['hour']*60 + df.minute)/1440.0)
df['daymin_cos'] = np.cos(2 * np.pi * (df['hour']*60 + df.minute)/1440.0)
df['weekday'] = df['weekday'].astype('category')
df = df.drop(['hour','minute'], axis=1)

df['dch'] = df.datetime.apply(lambda x: str(x.hour) + ' ' + str(x.weekday()))



In [13]:
def numpy_ewm_alpha_v1(a,wghts):
    out = np.dot(a,wghts)
    return out

wsize = 10000

In [14]:
wghts01 = (1-0.01)**np.arange(wsize)
wghts01 /= wghts01.sum()
wghts01 = wghts01[::-1]

wghts05 = (1-0.05)**np.arange(wsize)
wghts05 /= wghts05.sum()
wghts05 = wghts05[::-1]


wghts1 = (1-0.1)**np.arange(wsize)
wghts1 /= wghts1.sum()
wghts1 = wghts1[::-1]


wghts2 = (1-0.2)**np.arange(wsize)
wghts2 /= wghts2.sum()
wghts2 = wghts2[::-1]

wghts001 = (1-0.001)**np.arange(wsize)
wghts001 /= wghts001.sum()
wghts001 = wghts001[::-1]

wghts005 = (1-0.005)**np.arange(wsize)
wghts005 /= wghts005.sum()
wghts005 = wghts005[::-1]

df['exp_{}'.format(0.01)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts01))
df['exp_{}'.format(0.05)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts05))
df['exp_{}'.format(0.1)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts1))
df['exp_{}'.format(0.2)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts2))
df['exp_{}'.format(0.001)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts001))
df['exp_{}'.format(0.005)] = df.history.apply(lambda x: numpy_ewm_alpha_v1(x[-wsize:],wghts005))

In [15]:
# dch_enc_dict = {k: df.groupby('dch')['target_{}'.format(k)].mean() for k in target_positions}

# for position in target_positions:
#     df['dch_enc_{}'.format(position)] = df['dch'].map(model_to_save['encs'][position])

In [16]:
SHIFTS = [
    HOUR_IN_MINUTES // 4,
    HOUR_IN_MINUTES // 2,
    HOUR_IN_MINUTES,
    DAY_IN_MINUTES,
    DAY_IN_MINUTES * 2,
    WEEK_IN_MINUTES,
    WEEK_IN_MINUTES * 2]
WINDOWS = [
    HOUR_IN_MINUTES // 4,
    HOUR_IN_MINUTES // 2,
    HOUR_IN_MINUTES,
    DAY_IN_MINUTES,
    DAY_IN_MINUTES * 2,
    WEEK_IN_MINUTES,
    WEEK_IN_MINUTES * 2]

In [17]:
# block = []
# block.append((60,30))
# block.append((60,15))
# block.append((30,30))
# block.append((30,15))
# block.append((15,15))
# block.append((2880,60))
# block.append((2880,30))
# block.append((2880,15))
# block.append((20160,30))
# block.append((20160,15))
# block.append((10080,30))
# block.append((10080,15))
# block.append((1440,30))
# block.append((1440,15))
# block = set(block)

# for shift in tqdm(SHIFTS):
#     for window in WINDOWS:
#         if window > shift or (shift,window) in block:
#             continue
#         right = -shift + window
#         if right == 0:
#             right = None
        
#         df['num_orders_{}_{}'.format(shift, window)] = \
#             df.history.apply(lambda x: x[-shift : right].sum())
#         df['num_orders_{}_{}_mean'.format(shift, window)] = \
#             df.history.apply(lambda x: x[-shift : right].mean())
#         df['num_orders_{}_{}_std'.format(shift, window)] = \
#             df.history.apply(lambda x: x[-shift : right].std())
# #         df['num_orders_{}_{}_skew'.format(shift, window)] = \
# #             df.history.apply(lambda x: x[-shift : -shift + window].skew())
#         df['num_orders_{}_{}_min'.format(shift, window)] = \
#             df.history.apply(lambda x: x[-shift : right].min())
#         df['num_orders_{}_{}_max'.format(shift, window)] = \
#             df.history.apply(lambda x: x[-shift : right].max())

In [18]:
arr = []
arr.append((10080,60))
arr.append((20160,60))
arr.append((2880,60))
arr.append((1440,60))
arr.append((1440,1440))
arr.append((2880,2880))
arr.append((10080,10080))
arr.append((20160,20160))

# for shift in SHIFTS:
#     for window in WINDOWS:
for shift,window in tqdm(arr):
        if window > shift:
            continue
        right = -shift + window
        right = -1 if right == 0 else right
        df['num_orders_{}_{}'.format(shift, window)] = \
            df.history.apply(lambda x: x[-shift : right].sum())
        df['num_orders_{}_{}_std'.format(shift, window)] = \
            df.history.apply(lambda x: x[-shift : right].std())

100%|██████████| 8/8 [01:13<00:00, 13.17s/it]


In [19]:
kf = KFold(n_splits=5, random_state=442)
# ts = TimeSeriesSplit(n_splits=5)

In [20]:
target_cols = ['target_{}'.format(position) for position in target_positions]

In [21]:
parameters = {
        'objective': 'mae',
        'metric': 'mae',
        'learning_rate': 0.05,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
#         "min_sum_hessian_in_leaf": 10,
        "num_threads": 15,
        "lambda_l1": 10.,
        "lambda_l2": 10.,
#         "min_split_gain": 1.,
        "min_data_in_leaf": 1000,
        "num_leaves" : 150,
#         'max_depth':5,
        "use_two_round_loading": False,
#         "histogram_pool_size": 1024*100,
#         'bin_construct_sample_cnt':10000000,
        "reg_sqrt": False
   }
parameters = {'feature_fraction': 0.6532271360402252,
              'learning_rate': 0.020208933466333873,
              'lambda_l1': 2.3258779081081675,
              'lambda_l2': 2.3258779081081675,
              'min_data_in_leaf': 750,
              'bagging_fraction': 0.5931454235914756,
              'num_leaves': 20,
              'objective': 'mae',
              'metric': 'mae',
              'bagging_freq': 2,
              'num_threads': 15,
              'reg_sqrt': True
             }


In [22]:
# df_use = df.loc[df.datetime.dt.month >= 5].copy()
df_use = df.copy()

In [23]:
model_to_save = {
    'models': {},
    'encs': {k: {} for k in target_positions}
}

In [24]:
# (df.groupby('dch')['target_{}'.format(k)].mean()).to_dict()

In [25]:
scores = {k:[] for k in target_positions}
fold_no = 1
for tr_ix, val_ix in kf.split(df_use):
    
    for position in target_positions:
#         df_use['dch_enc'] = df_use['dch'].map(model_to_save['encs'][position])
        df_train = df_use.iloc[tr_ix]
        df_test = df_use.iloc[val_ix]

        y_train = df_train[target_cols]
    #     y_train_log = np.log1p(y_train)
        y_test = df_test[target_cols]
    #     y_test_log = np.log1p(y_test)
        df_train_pos = df_train.drop(['datetime', 'history', 'dch'] + target_cols, axis=1)
        df_test_pos = df_test.drop(['datetime', 'history', 'dch'] + target_cols, axis=1)
        
        train_dataset = lgb.Dataset(df_train_pos, y_train['target_{}'.format(position)])
        test_dataset = lgb.Dataset(df_test_pos, y_test['target_{}'.format(position)])
        model = lgb.train(parameters,
                          train_dataset,
                          valid_sets=(train_dataset, test_dataset),
                          early_stopping_rounds=20,
                          verbose_eval=0,
                          num_boost_round=500,
#                           feval=sMAPE_log
                         )
        y_predict = model.predict(df_test_pos)
#         y_predict = np.expm1(y_predict)
        
        score = sMAPE(y_test['target_{}'.format(position)], y_predict)
        scores[position].append(score)
        
    print('fold {} complete'.format(fold_no))
    fold_no += 1
for position in target_positions:
    print('target_{}'.format(position))
    print('stupid:\t{}'.format(sMAPE(
        y_test['target_{}'.format(position)],
        y_train['target_{}'.format(position)].median())))
    print('model:\t{}'.format(np.mean(scores[position])))
    print()
    
#     df_use['dch_enc'] = df_use['dch'].map(model_to_save['encs'][position])
    df_train, df_test = train_test_split(df_use, test_size=.2, random_state=442, shuffle=True)
    y_train = df_train[target_cols]
    y_test = df_test[target_cols]
    df_train_pos = df_train.drop(['datetime', 'history', 'dch'] + target_cols, axis=1)
    df_test_pos = df_test.drop(['datetime', 'history', 'dch'] + target_cols, axis=1)
    
    train_dataset = lgb.Dataset(df_train_pos, y_train['target_{}'.format(position)])
    test_dataset = lgb.Dataset(df_test_pos, y_test['target_{}'.format(position)])
    model = lgb.train(parameters,
                          train_dataset,
                          valid_sets=(train_dataset, test_dataset),
                          early_stopping_rounds=20,
                          verbose_eval=0,
                          num_boost_round=500)    
    model_to_save['models'][position] = model
pickle.dump(model_to_save, open('model_kos/models_lgb.pkl', 'wb'))



fold 1 complete
fold 2 complete
fold 3 complete
fold 4 complete
fold 5 complete
target_10
stupid:	0.5717447627854316
model:	0.32095100002378435

target_30
stupid:	0.4319931479991387
model:	0.25436133819973905

target_45
stupid:	0.40888033307558536
model:	0.23138480874785441

target_60
stupid:	0.39144963971394
model:	0.214915389288443

target_75
stupid:	0.3781095560663338
model:	0.20135148354337412



In [28]:
df_train_pos.columns

Index(['weekday', 'daymin_sin', 'daymin_cos', 'exp_0.01', 'exp_0.05',
       'exp_0.1', 'exp_0.2', 'exp_0.001', 'exp_0.005', 'num_orders_10080_60',
       'num_orders_10080_60_std', 'num_orders_20160_60',
       'num_orders_20160_60_std', 'num_orders_2880_60',
       'num_orders_2880_60_std', 'num_orders_1440_60',
       'num_orders_1440_60_std', 'num_orders_1440_1440',
       'num_orders_1440_1440_std', 'num_orders_2880_2880',
       'num_orders_2880_2880_std', 'num_orders_10080_10080',
       'num_orders_10080_10080_std', 'num_orders_20160_20160',
       'num_orders_20160_20160_std'],
      dtype='object')

In [None]:
target_10
stupid:	0.5717447627854316
model:	0.318513210266847

target_30
stupid:	0.4319931479991387
model:	0.252706549564262

target_45
stupid:	0.40888033307558536
model:	0.23051693567073003

target_60
stupid:	0.39144963971394
model:	0.21356155804815974

target_75
stupid:	0.3781095560663338
model:	0.20052793366470678

In [None]:
target_5
stupid:	0.7257808547638245
model:	0.4998490851153359

target_10
stupid:	0.6608281056458376
model:	0.4019303165728233

target_15
stupid:	0.5927615840829038
model:	0.33823548020795857

target_20
stupid:	0.5421014104769756
model:	0.301166912871263

target_25
stupid:	0.502216363688881
model:	0.27612079609550644