In [1]:
%matplotlib inline
from matplotlib import pyplot as plt 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Load the training data and parse the DATE column into datetimes.
train = pd.read_csv('train.csv.zip')
train['DATE'] = pd.to_datetime(train['DATE'])

# Every distinct ATM identifier present in the training data.
ATM_IDs = train['ATM_ID'].unique()

# The dates to predict: each calendar day from 2017-08-16 through 2017-09-17
# inclusive (33 days), kept as 'YYYY-MM-DD' strings.
pred_dates = (
    pd.date_range('2017-08-16', '2017-09-17', freq='D')
    .strftime('%Y-%m-%d')
    .tolist()
)

In [2]:
# Later split: rows dated in [2016-03-14, 2017-07-14) versus rows dated
# on or after 2017-07-14.
valid_train_mask = (train['DATE'] >= pd.Timestamp('2016-03-14')) & (train['DATE'] < pd.Timestamp('2017-07-14'))
valid_mask = train['DATE'] >= pd.Timestamp('2017-07-14')

# Earlier split: rows dated in [2016-04-11, 2017-06-11) versus rows dated
# in [2017-06-11, 2017-07-14).
pre_valid_train_mask = (train['DATE'] >= pd.Timestamp('2016-04-11')) & (train['DATE'] < pd.Timestamp('2017-06-11'))
pre_valid_mask = (train['DATE'] >= pd.Timestamp('2017-06-11')) & (train['DATE'] < pd.Timestamp('2017-07-14'))

In [3]:
def replace_week_ago(trainnine, lower_q=0.1, upper_q=0.9, lag=7):
    """Replace extreme CLIENT_OUT values with the value observed `lag` rows earlier.

    A row is treated as extreme when its CLIENT_OUT is strictly above the
    `upper_q` quantile or strictly below the `lower_q` quantile of the
    CLIENT_OUT column of `trainnine`. Such rows take the CLIENT_OUT of the
    row `lag` positions earlier. The first `lag` rows have no earlier
    counterpart and are dropped from the result.

    The input frame is not modified.

    Parameters
    ----------
    trainnine : pandas.DataFrame
        Must contain 'DATE', 'ATM_ID' and 'CLIENT_OUT' columns. Rows are
        shifted positionally, so they are expected to be in time order
        for a single ATM -- this is what the caller in this notebook
        passes; the function itself does not sort or check.
    lower_q, upper_q : float
        Quantile levels bounding the accepted range (defaults 0.1 / 0.9).
    lag : int
        Number of rows to look back for the replacement value (default 7).

    Returns
    -------
    pandas.DataFrame
        Columns ['DATE', 'ATM_ID', 'CLIENT_OUT'], without the first `lag`
        rows, keeping the original index labels.
    """
    result = trainnine[['DATE', 'ATM_ID', 'CLIENT_OUT']].copy()
    client_out = result['CLIENT_OUT']

    upper_bound = client_out.quantile(upper_q)
    lower_bound = client_out.quantile(lower_q)
    is_extreme = (client_out > upper_bound) | (client_out < lower_bound)

    # Keep ordinary values; extreme ones take the value from `lag` rows back.
    # Within the first `lag` rows that lookup is NaN, but those rows are
    # trimmed below, so no NaN introduced here survives.
    result['CLIENT_OUT'] = client_out.where(~is_extreme, client_out.shift(lag))
    return result.iloc[lag:]


def apply_replace(train):
    """Run `replace_week_ago` separately on every ATM and stack the results.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain an 'ATM_ID' column plus whatever `replace_week_ago`
        requires.

    Returns
    -------
    pandas.DataFrame
        The per-ATM results concatenated in order of each ATM's first
        appearance in `train`. `train` itself is left untouched because
        each ATM's rows are copied before being processed.
    """
    # Seed with an empty frame that has the input's columns, so the result
    # keeps that schema even when `train` contains no ATMs at all.
    pieces = [train.iloc[:0]]
    for atm_id in train['ATM_ID'].unique():
        per_atm = train[train['ATM_ID'] == atm_id].copy()
        pieces.append(replace_week_ago(per_atm))
    # Concatenate once: growing a frame inside the loop re-copies all
    # previously accumulated rows on every iteration.
    return pd.concat(pieces)

# Two consecutive outlier-replacement passes over the whole training set.
# Each pass recomputes the per-ATM quantiles and drops the first 7 rows of
# every ATM, so this cell is not idempotent: re-running it shortens `train`
# again.
train = apply_replace(train)
train = apply_replace(train)


In [4]:
def time_preprocess(X):
    """Add calendar features derived from the DATE column.

    The frame is modified in place and also returned. DATE is converted to
    datetimes, then 'month', 'week' (ISO week number), 'day' (day of month)
    and 'week_day' (Monday=0 ... Sunday=6) columns are added.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a 'DATE' column that `pd.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object `X`, with the extra columns.
    """
    X['DATE'] = pd.to_datetime(X['DATE'])
    dates = X['DATE'].dt
#     X['year'] = dates.year
    X['month'] = dates.month
    if hasattr(dates, 'isocalendar'):
        # `.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
        # `isocalendar().week` is the replacement. It returns a nullable
        # UInt32 column, so cast it back to a plain numeric dtype (float
        # only when missing dates force NaN).
        iso_week = dates.isocalendar().week
        X['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # Older pandas without `isocalendar`.
        X['week'] = dates.week
    X['day'] = dates.day
    X['week_day'] = dates.weekday
    return X
# Add the month / week / day / week_day columns to the training frame
# (time_preprocess modifies the frame in place and returns it).
train = time_preprocess(train)

In [5]:
def score_series(y_true, y_pred):
    """
        Scaled mean absolute error between two series.

        Expects y_true and y_pred to be 1d arrays of the same length.
        Returns mean(|y_true - y_pred|) / mean(|y_true|) * 10000.
    """
    mean_abs_error = np.mean(np.abs(y_true - y_pred))
    mean_abs_true = np.mean(np.abs(y_true))
    return mean_abs_error / mean_abs_true * 10000


In [6]:
def add_feats(train):
    """Add lagged CLIENT_OUT features and day-over-day differences.

    Added columns, in this order:
      * day_ago, two_days_ago, week_ago, month_ago -- CLIENT_OUT shifted by
        1, 2, 7 and 30 rows respectively;
      * day_diff -- CLIENT_OUT minus day_ago;
      * day_diff_day_ago -- day_ago minus two_days_ago.

    Shifts are positional, so the rows are expected to be in time order for
    a single ATM (the function does not sort or check). The leading rows for
    which the longest lag is undefined (the first 30) are dropped.

    The input frame is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'CLIENT_OUT' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns and without the first 30 rows.
    """
    lags = (('day_ago', 1), ('two_days_ago', 2), ('week_ago', 7), ('month_ago', 30))

    # Work on a copy so the caller's frame does not silently gain columns.
    feats = train.copy()
    client_out = feats['CLIENT_OUT']
    for column, lag in lags:
        feats[column] = client_out.shift(lag)
    feats['day_diff'] = client_out - feats['day_ago']
    feats['day_diff_day_ago'] = feats['day_ago'] - feats['two_days_ago']

    # Trim exactly as many rows as the longest lag leaves undefined.
    longest_lag = max(lag for _, lag in lags)
    return feats.iloc[longest_lag:].copy()

In [7]:
import lightgbm as lgb


def validate_lgb(X_train, y_train, X_valid, y_valid,
                 num_boost_round=50000, early_stopping_rounds=100):
    """Fit a LightGBM regressor and return its predictions on the validation set.

    Training rows get linearly increasing sample weights, from 0.5 for the
    first row to 1.0 for the last. The validation set is used both for the
    L1 evaluation metric during training and as the prediction input.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_valid, y_valid : validation features and target.
    num_boost_round : int
        Upper bound on boosting rounds (default 50000).
    early_stopping_rounds : int
        Early-stopping patience passed to `lgb.train` (default 100).

    Returns
    -------
    Predictions of the fitted booster for `X_valid`.

    NOTE(review): recent LightGBM releases warn that early stopping is not
    available with boosting='dart'; if that applies to the installed
    version, all `num_boost_round` rounds are run -- confirm, and lower
    `num_boost_round` or switch boosting type if training is too slow.
    NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
    arguments appear to have been removed from `lgb.train` in LightGBM 4
    in favour of callbacks -- confirm against the installed version.
    """
    sample_weight = np.linspace(0.5, 1, X_train.shape[0])
    train_set = lgb.Dataset(X_train, y_train, weight=sample_weight)
    valid_set = lgb.Dataset(X_valid, y_valid)

    params = {
        'objective':'regression',
        'metric': 'l1',
        'learning_rate': 0.5,
        'random_state':42,
        'verbose':-1,
        'boosting':'dart',
        #'min_data':1, 'min_data_in_bin':1
    }

    gbm = lgb.train(params, train_set, verbose_eval=-1, valid_sets=valid_set,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=early_stopping_rounds)

    return gbm.predict(X_valid)


In [11]:
# Split boundaries are the same for every ATM, so parse them once.
pre_train_start = pd.to_datetime('2016-04-11')
pre_valid_start = pd.to_datetime('2017-06-11')
pre_valid_end = pd.to_datetime('2017-07-14')

for atm in ATM_IDs[:50]:
    train1 = add_feats(train[train['ATM_ID'] == atm].copy())

    # Fit on [2016-04-11, 2017-06-11) and validate on the hold-out window
    # [2017-06-11, 2017-07-14) that follows it. Validating on a window that
    # overlaps the training rows would make both early stopping and the
    # reported score meaningless.
    pre_valid_train_mask = (train1.DATE >= pre_train_start) & (train1.DATE < pre_valid_start)
    pre_valid_mask = (train1.DATE >= pre_valid_start) & (train1.DATE < pre_valid_end)

    # DATE is only needed for the masks; it is not a model feature.
    train1 = train1.drop('DATE', axis=1)

    fit_rows = train1[pre_valid_train_mask]
    holdout_rows = train1[pre_valid_mask]
    if fit_rows.empty or holdout_rows.empty:
        # Nothing to fit or to score for this ATM in the chosen windows.
        print(atm, 'skipped: no rows in the training or validation window')
        continue

    # The model predicts the day-over-day change, not the CLIENT_OUT level.
    y_train = fit_rows.day_diff
    x_train = fit_rows.drop(['CLIENT_OUT', 'day_diff'], axis=1)

    y_valid = holdout_rows.day_diff
    x_valid = holdout_rows.drop(['CLIENT_OUT', 'day_diff'], axis=1)

    y_pred = validate_lgb(x_train, y_train, x_valid, y_valid)

    print(atm, score_series(y_valid, y_pred))

     ATM_ID  CLIENT_OUT  month  week  day  week_day   day_ago  two_days_ago  \
891       0    424100.0      6    23   10         5  380100.0      656600.0   

     week_ago  month_ago  day_diff  day_diff_day_ago  
891  368300.0   490900.0   44000.0         -276500.0  
Training until validation scores don't improve for 100 rounds.


KeyboardInterrupt: 