# Import libraries 

In [None]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
from scipy.optimize import minimize, fmin_slsqp
from datetime import date, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pylab
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold, KFold
import lightgbm as lgb 

# Import data 

In [None]:
%%time
# train and sub format already merged with weather and holidays (plus the day before holiday)
train = pd.read_csv('train.csv')
hol = pd.read_csv('hol.csv')
hol['Timestamp'] = pd.to_datetime(hol['Timestamp'])

In [3]:
gold = pd.read_csv('gold_usd/gold.csv')
usd = pd.read_csv('gold_usd/usd.csv')

In [4]:
gold = 

Unnamed: 0,TICKER,DATE,OPEN,HIGH,LOW,CLOSE,VOL,WAPRICE
0,1,2012-04-03,,,,1570.08,,
1,2,2012-04-03,,,,30.60,,
2,3,2012-04-03,,,,1543.66,,
3,4,2012-04-03,,,,618.97,,
4,1,2012-04-04,,,,1577.34,,
5,2,2012-04-04,,,,30.53,,
6,3,2012-04-04,,,,1561.56,,
7,4,2012-04-04,,,,620.67,,
8,1,2012-04-05,,,,1543.88,,
9,2,2012-04-05,,,,31.19,,


In [None]:
train.columns = ['Timestamp','ForecastId','Value']
def time_preprocess(X):
    X['Timestamp'] = pd.to_datetime(X['Timestamp'])
    X['year'] = X['Timestamp'].dt.year
    X['month'] = X['Timestamp'].dt.month 
    X['day'] = X['Timestamp'].dt.day
    X['week_day'] = X['Timestamp'].dt.weekday
    X['hour'] = X['Timestamp'].dt.hour
    X['minute'] = X['Timestamp'].dt.minute
    X['minute'] = X['minute'] // 15 * 15
    
    return X
train = time_preprocess(train)

## validate 

In [None]:
def MeanEncodingTransforming(X, y, X_test, how_to_fill):
    
    # mean encoding for lgb
    
    X_train = pd.concat([X, y], axis=1)
    mean_values = X_train.groupby(X_train.columns[0]).agg(how_to_fill).to_dict()['Value']
    X_train = X_train.drop(y.columns[0], axis=1)
    X_train = X_train.replace(mean_values)
    X_test = X_test.replace(mean_values)
    
    return X_train, X_test

def feature_preprocessing(train, test, cat_cols, cat_type='ohe'):
    
    # ohe or mean encoding preprocessing for the lgb 
    
    X_train, X_test = train[cat_cols].copy(), test[cat_cols].copy()

    if (cat_type=='mean_enc'):
        for j in ['mean', 'max', 'min', 'median']:
            for i in cat_cols:
                X_train[i], X_test[i] = MeanEncodingTransforming(X_train[i], train[['Value']], X_test[i], j)
                X_train[i].columns = [i+'_'+j]
                X_test[i].columns = [i+'_'+j]
                
    if (cat_type=='ohe'):
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        X_train = ohe.fit_transform(train[cat_cols])
        X_test = ohe.transform(test[cat_cols])
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        
    X_train = pd.concat([X_train, train[['days_to_holiday', 'days_after_holiday']]], axis=1).fillna(-999)
    X_test = pd.concat([X_test, test[['days_to_holiday', 'days_after_holiday']]], axis=1).fillna(-999)
    
    return X_train, X_test 

In [None]:
def train_groupby(train, test, window, how):
    
    # simple groupby prediction 
    
    # time_delta = list((test['Timestamp'].iloc[-1:]  - train['Timestamp'].iloc[1] ).dt.days)[0]

    mean_values = train[['Value', 'week_day']][-window:].groupby(['week_day']).agg(how).reset_index()
    mean_values.columns = ['week_day', 'pred']
    test = pd.merge(test, mean_values, how='left', on = ['week_day'])  
    
    return test['pred'].fillna(np.mean(train['Value']))

def train_mean(train, window):
    
    # return mean value from train for the window 
    
    mean_value = np.mean(train['Value'].reset_index(drop=True)[-window:])
   
    return mean_value


In [None]:
def validate_lgb(X_train, y_train, X_valid, y_valid):
    
    
    d1 = lgb.Dataset(X_train, y_train, weight=np.linspace(0.5, 1, X_train.shape[0]))
    d2 = lgb.Dataset(X_valid, y_valid)
    
    params = {
        'objective':'regression',    
        'metric': 'l1', 
        'learning_rate': 0.160042,
        'random_state':42,
        'min_data':1,
        'min_data_in_bin':1
    }
    
    gbm = lgb.train(params, d1, verbose_eval=None, valid_sets=d2, 
                    num_boost_round=50000, early_stopping_rounds=100)
    
    y_hat = gbm.predict(X_valid)
    opt_boost_rounds = gbm.best_iteration
    
    return y_hat, opt_boost_rounds 



def train_lgb(X_train, y_train, X_test, opt_boost_rounds):
    
    d1 = lgb.Dataset(X_train, y_train, weight=np.linspace(0.5, 1, X_train.shape[0]))
    
    params = {
        'objective':'regression',    
        'metric': 'l1', 
        'learning_rate': 0.160042,
        'random_state':42,
        'min_data':1,
        'min_data_in_bin':1
    }
    
    gbm = lgb.train(params, d1, verbose_eval=None, num_boost_round=opt_boost_rounds)
    
    y_hat = gbm.predict(X_test)

    return y_hat

In [None]:
def calc_score(pred, fact, index_mult):
    return np.sum(abs(pred-fact)) / np.sum(fact) * 10000

def train_rf(X_train, y_train, X_valid):

    rf = RandomForestRegressor(max_features='sqrt', n_estimators=142, n_jobs=-1, random_state=4224)
    rf.fit(X_train, y_train, sample_weight=np.linspace(0.5, 1, X_train.shape[0]) )
    y_hat = rf.predict(X_valid)
    
    return y_hat

In [None]:
def combine_y_hats(y_hats):
    X_stack = pd.DataFrame({})
    for i in range(0, len(y_hats)):
        X_stack['stack'+str(i)] = y_hats[i]
    return X_stack

In [None]:
def train_stack(X_stack, y, model):
    model.fit(X_stack, y)
    return model

In [None]:
def make_harmonic_features(x, col, period=24):
    x['sin_'+col] = np.sin(x[col] * 2 * np.pi / period)
    x['cos_'+col] = np.cos(x[col] * 2 * np.pi / period)
    x = x.drop(col, axis=1)
    return x

## predict

In [None]:
train['minutes_in_day'] = train['hour']*60 + train['minute']

In [None]:
train['holidays'] = 0

In [None]:
train.head()

In [None]:
train.head()

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False, random_state=None)

def validate_stack(X_stack, y, model):
    y_cros_val_pred = cross_val_predict(model, X_stack, y=y, cv=5, n_jobs=-1)
    return y_cros_val_pred

def cv_lr(X_stack, y, model):
    
    # cros val lasso on X_stack 
    
    y_cros_val_pred = pd.DataFrame({})
    coefs = []
    
    for train_index, test_index in kf.split(X_stack):
        X_train, X_test = X_stack.iloc[train_index], X_stack.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model_temp = model 
        model_temp.fit(X_train,y_train)
        y_hat = model_temp.predict(X_test)
        coefs.append(model_temp.coef_)
        
        temp_df = pd.DataFrame({'id': test_index, 'Value': y_hat})
        y_cros_val_pred = pd.concat([y_cros_val_pred, temp_df], axis=0)
      
    y_cros_val_pred = y_cros_val_pred.sort_values(by='id')
    
    #divide by sum - due to overfit of model, with dividing - coefs ~ weights of models
    return list(y_cros_val_pred['Value']) / (np.sum(np.mean(coefs,axis=0)) + 1)

In [None]:
# train.columns = ['Timestamp','ForecastId','Value']
def time_preprocess(X):
    X['Timestamp'] = pd.to_datetime(X['Timestamp'])
    X['year'] = X['Timestamp'].dt.year
    X['month'] = X['Timestamp'].dt.month 
    X['week'] = X['Timestamp'].dt.week
    X['day'] = X['Timestamp'].dt.day
    X['week_day'] = X['Timestamp'].dt.weekday
    X['hour'] = X['Timestamp'].dt.hour
    X['minute'] = X['Timestamp'].dt.minute
    X['minute'] = X['minute'] // 15 * 15
    
    return X

val = train[['ForecastId','Value']].groupby('ForecastId').diff()
val['Value'] = val['Value'].fillna(0)
val['Value'] = (val['Value']==0) * 1
val.columns = ['diff']
train = pd.concat([train, val], axis=1)

a = train[['ForecastId','diff']].groupby('ForecastId').apply(pd.rolling_sum, 7, min_periods=1)
a.columns = ['ForecastId', 'to_drop']
a = a['to_drop']
train = pd.concat([train, a], axis=1)

print(train.shape)
train = train[train['to_drop']<=3].reset_index(drop=True)
train = train[['Timestamp','ForecastId','Value']]
print(train.shape)



train = time_preprocess(train)
train = train[train.year>=2016].reset_index(drop=True)

#max_values = train[['Value', 'ForecastId']].groupby('ForecastId').agg('max').reset_index()
#max_values.columns = ['ForecastId', 'max_value']
#train = pd.merge(train, max_values, on = 'ForecastId', how='left')
#train['Value'] = train['Value'] / train['max_value']
#train = train.drop('max_value',axis=1)

train['year_month_weekday'] = train['year'].astype('str') + '_' + train['month'].astype('str') + '_' + train['week_day'].astype('str') 
features = train[['ForecastId', 'year_month_weekday', 'Value']].groupby(['ForecastId', 'year_month_weekday']).agg('mean').reset_index()
features = features.pivot(index='ForecastId', columns='year_month_weekday', values='Value')

features = features.fillna(0)
train = pd.merge(train, hol, on='Timestamp', how='left')

In [None]:
%%time

losses = []
iids = []

sub = pd.DataFrame({})
cat_cols = ['week_day','month','year', 'week']

pred_dates  = ['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
               '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
               '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
               '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31',
               '2017-09-01', '2017-09-02', '2017-09-03', '2017-09-04',
               '2017-09-05', '2017-09-06', '2017-09-07', '2017-09-08',
               '2017-09-09', '2017-09-10', '2017-09-11', '2017-09-12',
               '2017-09-13', '2017-09-14', '2017-09-15', '2017-09-16',
               '2017-09-17']

for i in pd.unique(train['ForecastId'])[0:50]:
 
    rf_stack = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1)
    lr_stack = Lasso(alpha=1, fit_intercept=False, max_iter=3000, tol=0.0001, positive=True, random_state=424142)

    # prepare train and test Dfs 
    days_in_train = 99999
    X_train = train[train['ForecastId']==i].reset_index(drop=True)[-days_in_train:].reset_index(drop=True)
    X_train['Value'] = abs(X_train['Value'])
    X_train['Value_log'] = np.log1p(X_train['Value'])
    
    # drop outliers in train data set 
    up_border = X_train['Value'].quantile(0.985)
    low_border = X_train['Value'].quantile(0.015)
    X_train = X_train[(X_train['Value']<=up_border) & (X_train['Value']>=low_border)].reset_index(drop=True)
    
    # prepare 'train_v' and 'valid_v' data frames - for validation 
    X_test = pd.DataFrame({'Timestamp':pred_dates, 'ForecastId':i})
    X_test = time_preprocess(X_test)
    X_test = pd.merge(X_test, hol, on='Timestamp', how='left')
    
    obs_in_test = 35
    X_train_v = X_train[:-obs_in_test].reset_index(drop=True)
    X_valid_v = X_train[-obs_in_test:].reset_index(drop=True)
    
    # prepare features 
        # for train
    X_train_ohe, X_test_ohe = feature_preprocessing(X_train, X_test, cat_cols, cat_type='ohe')
    X_train_meanenc, X_test_meanenc = feature_preprocessing(X_train, X_test, cat_cols, cat_type='mean_enc')
    
        # for validation 
    X_train_v_ohe, X_valid_v_ohe = feature_preprocessing(X_train_v, X_valid_v, cat_cols, cat_type='ohe')
    X_train_v_meanenc, X_valid_v_meanenc = feature_preprocessing(X_train_v, X_valid_v, cat_cols, cat_type='mean_enc')
    
        # group by mean
    y_hat_grby_mean = train_groupby(X_train_v, X_valid_v, window=999999, how='mean')
    y_hat_grby3_mean = train_groupby(X_train_v, X_valid_v, window=obs_in_test*3, how='mean')
    
        # group by median
    y_hat_grby_median = train_groupby(X_train_v, X_valid_v, window=999999, how='median')
    y_hat_grby3_median = train_groupby(X_train_v, X_valid_v, window=obs_in_test*3, how='median')
    
        # RandomForest 
    y_rf = train_rf(X_train_v_ohe, X_train_v['Value'], X_valid_v_ohe)
    y_rf_mean = train_rf(X_train_v_meanenc, X_train_v['Value'], X_valid_v_meanenc)
    y_rf_log = np.exp(train_rf(X_train_v_ohe, X_train_v['Value_log'], X_valid_v_ohe)) - 1
    y_rf_mean_log = np.exp(train_rf(X_train_v_meanenc, X_train_v['Value_log'], X_valid_v_meanenc)) - 1
    
        # LightGBM
    #y_lgb, lgb_opt = validate_lgb(X_train_v_ohe, X_train_v['Value'], X_valid_v_ohe, X_valid_v['Value'])
    #y_lgb_mean, lgb_mean_opt = validate_lgb(X_train_v_meanenc, X_train_v['Value'], X_valid_v_meanenc, X_valid_v['Value'])
    #y_lgb_log, lgb_opt_log = validate_lgb(X_train_v_ohe, X_train_v['Value_log'], X_valid_v_ohe, X_valid_v['Value_log'])
    #y_lgb_log = np.exp(y_lgb_log) -1
    #y_lgb_mean_log, lgb_mean_opt_log = validate_lgb(X_train_v_meanenc, X_train_v['Value_log'], X_valid_v_meanenc, X_valid_v['Value_log'])
    #y_lgb_mean_log = np.exp(y_lgb_mean_log) -1
    
    # stack predictions and make predictions 
    X_valid_stack = combine_y_hats([y_hat_grby_mean, y_hat_grby3_mean, 
                                    y_hat_grby_median, y_hat_grby3_median, 
                                    y_rf, y_rf_mean, 
                                    y_rf_log, y_rf_mean_log, 
                                    #y_lgb, y_lgb_mean, 
                                   #y_lgb_log, y_lgb_mean_log
                                   ])
    #y_rf_hat = validate_stack(X_valid_stack, X_valid_v['Value'], rf_stack)
    #y_lr_hat = cv_lr(X_valid_stack, X_valid_v['Value'], lr_stack)
 
    # calculate scores and pick top model 
    iid = X_valid_v.reset_index()['index'] 
    T = np.max(iid)
    index_mult = (3*T -2*iid +1) / 2 / T**2
    
    score_grby_mean = calc_score(y_hat_grby_mean, X_valid_v['Value'], index_mult)
    score_grby3_mean = calc_score(y_hat_grby3_mean, X_valid_v['Value'], index_mult)
    score_grby_median = calc_score(y_hat_grby_median, X_valid_v['Value'], index_mult)
    score_grby3_median = calc_score(y_hat_grby3_median, X_valid_v['Value'], index_mult)
    
    score_rf = calc_score(y_rf, X_valid_v['Value'], index_mult)
    score_rf_mean = calc_score(y_rf_mean, X_valid_v['Value'], index_mult)
    score_rf_log = calc_score(y_rf_log, X_valid_v['Value'], index_mult)
    score_rf_mean_log = calc_score(y_rf_mean_log, X_valid_v['Value'], index_mult)
    
    #score_lgb = calc_score(y_lgb, X_valid_v['Value'], index_mult)
    #score_lgb_mean = calc_score(y_lgb_mean, X_valid_v['Value'], index_mult)
    #score_lgb_log = calc_score(y_lgb_log, X_valid_v['Value'], index_mult)
    #score_lgb_mean_log = calc_score(y_lgb_mean_log, X_valid_v['Value'], index_mult)

    #score_lr_stack = calc_score(y_lr_hat, X_valid_v['Value'], index_mult)
    #score_rf_stack = calc_score(y_rf_hat, X_valid_v['Value'], index_mult)
    
    
    y_hats     = [y_hat_grby_mean, y_hat_grby3_mean, 
                  y_hat_grby_median, y_hat_grby3_median, 
                  y_rf, 
                  y_rf_mean, 
                  y_rf_log, 
                  y_rf_mean_log, 
                  #y_lgb, y_lgb_mean, 
                  #y_lgb_log, y_lgb_mean_log, 
                  #y_lr_hat, y_rf_hat
                 ]
    
    all_scores = [score_grby_mean, score_grby3_mean, 
                  score_grby_median, score_grby3_median, 
                  score_rf, 
                  score_rf_mean, 
                  score_rf_log, 
                  score_rf_mean_log, 
                  #score_lgb, score_lgb_mean, 
                  #score_lgb_log, score_lgb_mean_log, 
                  #score_lr_stack
                 ]
    
    best_score = np.min(all_scores)
    
    models_names = ['mean_all', 'mean_3', 
                    'median_all', 'median_3', 
                    'rf', 'rf_mean', 
                    'rf_log', 'rf_mean_log', 
                    #'lgb', 'lgb_mean', 
                    #'lgb_log', 'lgb_mean_log', 
                    #'lr_stack', 'rf_stack'
                   ]
    # plot figures and seve to folder 
    fig, ax = plt.subplots( nrows=1, ncols=1 )  # create figure & 1 axis
    ax.plot(y_hats[np.argmin(all_scores)])
    ax.plot(X_valid_v['Value'].reset_index(drop=True))
    fig.savefig('C:/Users/denis/Machine_Learning_Competitions/idao/lr_stack/'+str(i)+'.jpg')   # save the figure to file
    # plt.show()
    plt.close(fig)    # close the figure
    
    #calc R2 and save later 
    #r2 = r2_score(y_hats[np.argmin(all_scores)], X_valid_v['Value'] ) 
    #losses.append( r2 )
    
    losses.append(best_score)
    
    temp_df = pd.DataFrame({'Timestamp':pred_dates, 'ForecastId':i})
    temp_df = time_preprocess(temp_df)
    
    if (score_grby_mean==best_score):
        y_hat = train_groupby(X_train, X_test, window=999999, how='mean')
    
    if (score_grby3_mean==best_score):
        y_hat = train_groupby(X_train, X_test, window=obs_in_test*3, how='mean')
        
    if (score_grby_median==best_score):
        y_hat = train_groupby(X_train, X_test, window=999999, how='median')
    
    if (score_grby3_median==best_score):
        y_hat = train_groupby(X_train, X_test, window=obs_in_test*3, how='median')
    y_hat = 0 
    X_test['Value'] = y_hat
    sub = pd.concat([sub,X_test],axis=0)
    
  
    print(i, 'Best model is', models_names[np.argmin(all_scores)])
    print('loss:', np.min(all_scores) )
    print('val score:', np.mean(losses),'+',np.std(losses) )
    print(dict(zip(models_names, all_scores)))
    print('-------------------------------')


In [None]:
60 Best model is mean_all
loss: 4305.94860943
val score: 4158.48998272 + 2005.10425588
{'rf_log': 4946.362227144722, 'median_3': 6169.4187582562745, 'rf_mean_log': 5304.017089875222, 'mean_3': 5520.169214317167, 'median_all': 4584.742404227213, 'mean_all': 4305.948609431632, 'rf_mean': 4971.130659230724, 'rf': 4742.6985121311545}

## sub 

In [2]:
%%time

import pandas as pd 
import numpy as np 
from datetime import date, timedelta
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict

train = pd.read_csv('train.csv')
hol = pd.read_csv('hol.csv')
hol['Timestamp'] = pd.to_datetime(hol['Timestamp'])

train.columns = ['Timestamp','ForecastId','Value']
def time_preprocess(X):
    X['Timestamp'] = pd.to_datetime(X['Timestamp'])
    X['year'] = X['Timestamp'].dt.year
    X['month'] = X['Timestamp'].dt.month 
    X['day'] = X['Timestamp'].dt.day
    X['week_day'] = X['Timestamp'].dt.weekday
    X['hour'] = X['Timestamp'].dt.hour
    X['minute'] = X['Timestamp'].dt.minute
    X['minute'] = X['minute'] // 15 * 15
    
    return X
train = time_preprocess(train)

def feature_preprocessing(train, test, cat_cols, cat_type='ohe'):
    
    # ohe or mean encoding preprocessing for the lgb 
    
    X_train, X_test = train[cat_cols].copy(), test[cat_cols].copy()

    if (cat_type=='mean_enc'):
        for j in ['mean', 'max', 'min', 'median']:
            for i in cat_cols:
                X_train[i], X_test[i] = MeanEncodingTransforming(X_train[i], train[['Value']], X_test[i], j)
                X_train[i].columns = [i+'_'+j]
                X_test[i].columns = [i+'_'+j]
                
    if (cat_type=='ohe'):
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        X_train = ohe.fit_transform(train[cat_cols])
        X_test = ohe.transform(test[cat_cols])
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        
    X_train = pd.concat([X_train, train[['days_to_holiday', 'days_after_holiday']]], axis=1).fillna(-999)
    X_test = pd.concat([X_test, test[['days_to_holiday', 'days_after_holiday']]], axis=1).fillna(-999)
    
    return X_train, X_test 

def train_groupby(train, test, window, how):
    mean_values = train[['Value', 'week_day']][-window:].groupby(['week_day']).agg(how).reset_index()
    mean_values.columns = ['week_day', 'pred']
    test = pd.merge(test, mean_values, how='left', on = ['week_day'])  
    
    return test['pred'].fillna(np.mean(train['Value']))

def train_mean(train, window):
    mean_value = np.mean(train['Value'].reset_index(drop=True)[-window:])
    return mean_value

def calc_score(pred, fact, index_mult):
    return np.sum(abs(pred-fact)) / np.sum(fact) * 10000

def train_rf(X_train, y_train, X_valid):

    rf = RandomForestRegressor(max_features='sqrt', n_estimators=240, n_jobs=-1, random_state=4224)
    rf.fit(X_train, y_train, sample_weight=np.linspace(0.5, 1, X_train.shape[0]) )
    y_hat = rf.predict(X_valid)
    
    return y_hat


# train.columns = ['Timestamp','ForecastId','Value']
def time_preprocess(X):
    X['Timestamp'] = pd.to_datetime(X['Timestamp'])
    X['year'] = X['Timestamp'].dt.year
    X['month'] = X['Timestamp'].dt.month 
    X['week'] = X['Timestamp'].dt.week
    X['day'] = X['Timestamp'].dt.day
    X['week_day'] = X['Timestamp'].dt.weekday
    
    return X

val = train[['ForecastId','Value']].groupby('ForecastId').diff()
val['Value'] = val['Value'].fillna(0)
val['Value'] = (val['Value']==0) * 1
val.columns = ['diff']
train = pd.concat([train, val], axis=1)

a = train[['ForecastId','diff']].groupby('ForecastId').apply(pd.rolling_sum, 7, min_periods=1)
a.columns = ['ForecastId', 'to_drop']
a = a['to_drop']
train = pd.concat([train, a], axis=1)

print(train.shape)
train = train[train['to_drop']<=3].reset_index(drop=True)
train = train[['Timestamp','ForecastId','Value']]
train = time_preprocess(train)
print(train.shape)

train = train[train.year>=2016].reset_index(drop=True)
train['year_month_weekday'] = train['year'].astype('str') + '_' + train['month'].astype('str') + '_' + train['week_day'].astype('str') 
features = train[['ForecastId', 'year_month_weekday', 'Value']].groupby(['ForecastId', 'year_month_weekday']).agg('mean').reset_index()
features = features.pivot(index='ForecastId', columns='year_month_weekday', values='Value')
features = features.fillna(0)
train = pd.merge(train, hol, on='Timestamp', how='left')


losses = []
iids = []

sub = pd.DataFrame({})
cat_cols = ['week_day','month','year','week']

pred_dates  = ['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
               '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
               '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
               '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31',
               '2017-09-01', '2017-09-02', '2017-09-03', '2017-09-04',
               '2017-09-05', '2017-09-06', '2017-09-07', '2017-09-08',
               '2017-09-09', '2017-09-10', '2017-09-11', '2017-09-12',
               '2017-09-13', '2017-09-14', '2017-09-15', '2017-09-16',
               '2017-09-17']

index_mult = 0

for i in pd.unique(train['ForecastId'])[0:50]:
    
    # prepare train and test Dfs 
    days_in_train = 99999
    X_train = train[train['ForecastId']==i].reset_index(drop=True)[-days_in_train:].reset_index(drop=True)
    X_train['Value'] = abs(X_train['Value'])
       
    X_train['Value_log'] = np.log1p(X_train['Value'])
     
    # prepare 'train_v' and 'valid_v' data frames - for validation 
    X_test = pd.DataFrame({'Timestamp':pred_dates, 'ForecastId':i})
    X_test = time_preprocess(X_test)
    X_test = pd.merge(X_test, hol, on='Timestamp', how='left')
    
    obs_in_test = 35
    X_train_v = X_train[:-obs_in_test].reset_index(drop=True)
    X_valid_v = X_train[-obs_in_test:].reset_index(drop=True)
    
    # prepare features 
        # for train
    X_train_ohe, X_test_ohe = feature_preprocessing(X_train, X_test, cat_cols, cat_type='ohe')
  
        # for validation 
    X_train_v_ohe, X_valid_v_ohe = feature_preprocessing(X_train_v, X_valid_v, cat_cols, cat_type='ohe')
    
        # group by mean
    y_hat_grby_mean = train_groupby(X_train_v, X_valid_v, window=999999, how='mean')
    y_hat_grby3_mean = train_groupby(X_train_v, X_valid_v, window=obs_in_test*3, how='mean')
    
        # group by median
    y_hat_grby_median = train_groupby(X_train_v, X_valid_v, window=999999, how='median')
    y_hat_grby3_median = train_groupby(X_train_v, X_valid_v, window=obs_in_test*3, how='median')
    
        # RandomForest 
    y_rf = train_rf(X_train_v_ohe, X_train_v['Value'], X_valid_v_ohe)
    y_rf_log = np.exp(train_rf(X_train_v_ohe, X_train_v['Value_log'], X_valid_v_ohe)) - 1
    
    score_grby_mean = calc_score(y_hat_grby_mean, X_valid_v['Value'], 0)
    score_grby3_mean = calc_score(y_hat_grby3_mean, X_valid_v['Value'], 0)
    score_grby_median = calc_score(y_hat_grby_median, X_valid_v['Value'], 0)
    score_grby3_median = calc_score(y_hat_grby3_median, X_valid_v['Value'], 0)
    
    score_rf = calc_score(y_rf, X_valid_v['Value'], index_mult)
    score_rf_log = calc_score(y_rf_log, X_valid_v['Value'], index_mult)
    
    
    y_hats     = [y_hat_grby_mean, y_hat_grby3_mean, 
                  y_hat_grby_median, y_hat_grby3_median, 
                  y_rf, y_rf_log, 
                 ]
    
    all_scores = [score_grby_mean, score_grby3_mean, 
                  score_grby_median, score_grby3_median, 
                  score_rf, 
                  score_rf_log, 
                 ]
    
    best_score = np.min(all_scores)
    
    models_names = ['mean_all', 'mean_3', 
                    'median_all', 'median_3', 
                    'rf', 'rf_mean', 
                   ]
    
    losses.append(best_score)
    
    temp_df = pd.DataFrame({'Timestamp':pred_dates, 'ForecastId':i})
    temp_df = time_preprocess(temp_df)
    
    if (score_grby_mean==best_score):
        y_hat = train_groupby(X_train, X_test, window=999999, how='mean')
    
    if (score_grby3_mean==best_score):
        y_hat = train_groupby(X_train, X_test, window=obs_in_test*3, how='mean')
        
    if (score_grby_median==best_score):
        y_hat = train_groupby(X_train, X_test, window=999999, how='median')
    
    if (score_grby3_median==best_score):
        y_hat = train_groupby(X_train, X_test, window=obs_in_test*3, how='median')

    if (score_rf==best_score):
        y_hat = train_rf(X_train_ohe, X_train['Value'], X_test_ohe)
        
    if (score_rf_log==best_score):
        y_hat = np.exp(train_rf(X_train_ohe, X_train['Value_log'], X_test_ohe)) -1    
        
    X_test['Value'] = y_hat
    sub = pd.concat([sub,X_test],axis=0)
    
  
    print(i, 'Best model is', models_names[np.argmin(all_scores)])
    print('loss:', np.min(all_scores) )
    print('val score:', np.mean(losses),'+',np.std(losses) )
    print(dict(zip(models_names, all_scores)))
    print('-------------------------------')

sub = sub[['Timestamp', 'ForecastId', 'Value']]
sub.columns = ['DATE', 'ATM_ID', 'CLIENT_OUT']
sub.to_csv('submission.csv', index=False)

	DataFrame.rolling(window=7,min_periods=1,center=False).sum()
  return func(g, *args, **kwargs)


(287400, 11)
(268676, 8)
0 Best model is rf
loss: 2394.6184637
val score: 2394.6184637 + 0.0
{'median_all': 2558.0772601443937, 'rf': 2394.6184636999733, 'rf_mean': 5110.334740075047, 'median_3': 3055.14209801494, 'mean_all': 2733.9940799384794, 'mean_3': 2970.008485539806}
-------------------------------
2 Best model is median_3
loss: 7794.28453523
val score: 5094.45149947 + 2699.83303577
{'median_all': 10317.755080986279, 'rf': 10884.641837752331, 'rf_mean': 7923.29590646612, 'median_3': 7794.284535231611, 'mean_all': 12335.282579856455, 'mean_3': 8764.27760828875}
-------------------------------
3 Best model is median_all
loss: 4216.94064831
val score: 4801.94788241 + 2242.88111128
{'median_all': 4216.940648313197, 'rf': 4552.428423670253, 'rf_mean': 6455.929795892504, 'median_3': 4555.88565392487, 'mean_all': 4628.6242052392, 'mean_3': 4523.078277326643}
-------------------------------
4 Best model is median_all
loss: 3078.59898978
val score: 4371.11065926 + 2080.80486785
{'median_

34 Best model is median_all
loss: 5930.76670677
val score: 4486.55326485 + 2629.29365715
{'median_all': 5930.766706770206, 'rf': 7093.167815086222, 'rf_mean': 6730.38648826187, 'median_3': 6035.218199715629, 'mean_all': 6356.711292877614, 'mean_3': 5965.729702140071}
-------------------------------
35 Best model is median_3
loss: 2638.58258633
val score: 4424.95424223 + 2606.2970627
{'median_all': 3143.241024109875, 'rf': 2845.879674554005, 'rf_mean': 2696.4648502223613, 'median_3': 2638.582586332083, 'mean_all': 2829.108147568826, 'mean_3': 2835.0399569329184}
-------------------------------
36 Best model is mean_3
loss: 3911.23339246
val score: 4408.38260192 + 2565.52156104
{'median_all': 4311.322690028841, 'rf': 4749.010989232002, 'rf_mean': 4545.617784135978, 'median_3': 4053.868153350469, 'mean_all': 5033.180911301648, 'mean_3': 3911.233392462802}
-------------------------------
38 Best model is rf_mean
loss: 2803.84402728
val score: 4358.24077146 + 2540.50325244
{'median_all': 29

In [None]:
60 Best model is mean_all
loss: 4305.94860943
val score: 4253.92961618 + 2483.72012984
{'rf_mean': 4946.362227144718, 'mean_3': 5520.169214317167, 'mean_all': 4305.948609431632, 'median_all': 4584.742404227213, 'rf': 4742.6985121311545, 'median_3': 6169.4187582562745}