In [401]:
import os 
import pandas as pd
import numpy as np

path = os.getcwd()

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder()

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

import seaborn as sns
import matplotlib.pyplot as plt

In [402]:
# Root folder of the challenge data, kept as a plain string because the loading
# and export cells build file names with `path + "\\..."`.
# The SEGMIND_DATA_DIR environment variable, when set, overrides the
# machine-specific default so the notebook can run on another machine without
# editing this cell.
path = os.environ.get(
    'SEGMIND_DATA_DIR',
    'C:\\Users\\sunil\\Projects\\Dockship\\segmind_grand_ai_challenge_2021-dataset',
)

In [403]:
# Read the four input tables, in this order: training rows, rows to predict,
# the sample submission and the per-store table (`shop`). Every file name is
# appended to `path` exactly as given here.
train, test, ss, shop = (
    pd.read_csv(path + relative_name)
    for relative_name in (
        "\\dataset\\TRAIN.csv",
        "\\dataset\\TEST.csv",
        "\\dataset\\sample_submission.csv",
        "\\dataset\\store.csv",
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [404]:
# Integer codes for the raw StateHoliday values.
# NOTE(review): integer 0 is mapped to 4 while the string '0' is mapped to 0.
# If both spellings mean "no holiday" this splits one category in two; the
# original mapping is kept unchanged here - confirm the intent.
state_holiday_codes = {0: 4, '0': 0, 'a': 1, 'b': 2, 'c': 3}

# The same preparation is applied to both frames, so loop instead of repeating it.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

    # Calendar parts derived from the parsed date.
    frame['year'] = frame['Date'].dt.year
    frame['month'] = frame['Date'].dt.month
    frame['day'] = frame['Date'].dt.day
    # `Series.dt.week` is deprecated (and absent from recent pandas releases);
    # `isocalendar().week` is its replacement. Cast back to a plain integer
    # dtype so the column looks the same as before to the later cells.
    frame['week'] = frame['Date'].dt.isocalendar().week.astype('int64')

    # Assign the result instead of calling `.replace(..., inplace=True)` on a
    # column selection, which is chained assignment and is not guaranteed to
    # write back into the frame.
    frame['StateHoliday'] = frame['StateHoliday'].replace(state_holiday_codes)

In [405]:
# Merging

# Attach the per-store attributes from `shop` to every row. A left join keeps
# all train/test rows even when a store has no match in `shop`.
train = pd.merge(train, shop, how='left', on='Store')
test = pd.merge(test, shop, how='left', on='Store')

# Integer codes for the string-valued store attributes.
assortment_codes = {'a': 1, 'b': 2, 'c': 3}
promo_interval_codes = {
    0: 0,
    'Jan,Apr,Jul,Oct': 1,
    'Feb,May,Aug,Nov': 2,
    'Mar,Jun,Sept,Dec': 3,
}
store_type_codes = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

# Missing competition distances are filled with the TRAIN mean in both frames.
# The original filled the test frame with the test mean, so one store with a
# missing distance received two different imputed values in train and test.
competition_distance_mean = train['CompetitionDistance'].mean()

for frame in (train, test):
    # Plain assignment replaces the original `.replace(..., inplace=True)` /
    # `.fillna(..., inplace=True)` calls on column selections, which are
    # chained assignments and not guaranteed to write back into the frame.
    frame['Assortment'] = frame['Assortment'].replace(assortment_codes)
    frame['PromoInterval'] = frame['PromoInterval'].replace(promo_interval_codes)
    frame['StoreType'] = frame['StoreType'].replace(store_type_codes)
    frame['CompetitionDistance'] = frame['CompetitionDistance'].fillna(competition_distance_mean)

    # Every remaining missing value (in any column) becomes 0.
    frame.fillna(0, inplace=True)

# Changing Datatype

# Columns grouped by the integer width they are down-cast to.
#
# The lists previously used 'Store_*' names ('Store_Type', 'Store_Assortment',
# 'Store_distance', ...). The merge/encoding cell of this notebook addresses
# the merged columns as 'StoreType', 'Assortment', 'PromoInterval' and
# 'CompetitionDistance', so the prefixed names raise KeyError on a fresh run.
# NOTE(review): the remaining un-prefixed names (CompetitionOpenSince*,
# Promo2*) are not referenced anywhere else in this notebook; they assume
# store.csv uses those headers - verify against the file.
int8_cols = ['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
             'month', 'day', 'week', 'StoreType', 'Assortment',
             'CompetitionOpenSinceMonth', 'Promo2',
             'Promo2SinceWeek', 'PromoInterval']

int16_cols = ['Customers', 'year', 'Store', 'CompetitionOpenSinceYear', 'Promo2SinceYear']

int32_cols = ['Sales', 'CompetitionDistance']

def int8(x):
    """Cast column ``x`` to int8 in the module-level ``train`` and ``test`` frames."""
    for frame in (train, test):
        frame[x] = frame[x].astype('int8')

def int16(x):
    """Cast column ``x`` to int16 in ``train`` and, unless it is 'Customers', in ``test``.

    Both frames are the module-level ``train`` / ``test`` objects. The test
    frame is skipped for 'Customers'.
    """
    frames = [train] if x == 'Customers' else [train, test]
    for frame in frames:
        frame[x] = frame[x].astype('int16')

def int32(x):
    """Cast column ``x`` to int32 in ``train`` and, unless it is 'Sales', in ``test``.

    Both frames are the module-level ``train`` / ``test`` objects. The test
    frame is skipped for 'Sales'.
    """
    frames = [train] if x == 'Sales' else [train, test]
    for frame in frames:
        frame[x] = frame[x].astype('int32')

# Run each down-casting helper over its own column list, narrowest dtype first.
for caster, cols in ((int8, int8_cols), (int16, int16_cols), (int32, int32_cols)):
    for col in cols:
        caster(col)

In [406]:
# Names of the three columns that are kept out of the model features:
# the prediction target, the raw date column and the customer count.
target, date, customer = 'Sales', 'Date', 'Customers'

# Columns treated as categorical.
#
# Two fixes relative to the previous list:
#   * the 'Store_*' names are replaced by the names the merge/encoding cell of
#     this notebook actually uses ('StoreType', 'Assortment', 'PromoInterval');
#   * 'dayofweek' is dropped - no cell creates such a column, and 'DayOfWeek'
#     is already listed.
# NOTE(review): the CompetitionOpenSince* / Promo2* names assume store.csv uses
# those headers - verify against the file.
cat_cols = ['Store', 'DayOfWeek', 'Open', 'Promo',
            'StateHoliday', 'SchoolHoliday', 'year', 'month', 'day',
            'week', 'StoreType', 'Assortment',
            'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
            'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
            'PromoInterval']

# Model inputs: every column of `train` except the target, the raw date and
# the customer count.
excluded_columns = (target, date, customer)
features = [column for column in train.columns if column not in excluded_columns]

In [407]:
# Random 80/20 hold-out split of the training frame (fixed seed).
trn, val = train_test_split(train, random_state=1, test_size=0.2)

# Model inputs for the two parts of the split
X_trn = trn[features]
X_val = val[features]

# Matching target values
y_trn = trn[target]
y_val = val[target]

# Inputs of the rows to be predicted
X_test = test[features]

In [408]:
# Cross Validation for Boosting
def cross_val(regressor, train, test, features, name, n_splits=5):
    """Run stratified K-fold cross-validation and average the test predictions.

    The target is binned into (up to) 10 quantile buckets, and the folds are
    stratified on those buckets. For every fold the regressor is fitted on the
    training part with the validation part as its early-stopping evaluation
    set, then used to predict both the validation part and the full test frame.

    Parameters
    ----------
    regressor : estimator
        Must provide ``fit(X, y, eval_set=..., early_stopping_rounds=...,
        verbose=..., eval_metric=...)`` and ``predict(X)``.
        NOTE(review): newer XGBoost releases expect ``early_stopping_rounds``
        and ``eval_metric`` on the estimator constructor, not on ``fit`` -
        verify against the installed version.
    train : pandas.DataFrame
        Training rows. Must contain ``features`` and the column named by the
        module-level ``target`` variable.
    test : pandas.DataFrame
        Rows to predict. Must contain ``features``.
    features : list of str
        Column names used as model inputs.
    name : str
        Accepted for call-site compatibility; not used by the function.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    oofs : numpy.ndarray
        Out-of-fold predictions, one per row of ``train``.
    preds : numpy.ndarray
        Test predictions averaged over the folds.

    Notes
    -----
    Predictions are passed through ``np.abs``, so a negative prediction is
    reported as its magnitude.
    """
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))

    # `target` is the module-level column name, not a parameter.
    target_col = train[target]

    # Column selection does not depend on the fold, so do it once.
    X_all = train[features]
    X_test = test[features]

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1999)
    stratified_target = pd.qcut(target_col, 10, labels=False, duplicates='drop')

    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')

        X_trn, y_trn = X_all.iloc[trn_idx], target_col.iloc[trn_idx]
        X_val, y_val = X_all.iloc[val_idx], target_col.iloc[val_idx]

        regressor.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=50,
            verbose=100,
            eval_metric='rmse',
        )

        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))

        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Mean Squared Error for Validation set is : {error}')

        oofs[val_idx] = val_preds
        preds += test_preds / n_splits

    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    # The message previously contained the invalid escape sequence "\R", which
    # printed a stray backslash and triggers an invalid-escape warning.
    print(f'\n\nRoot Mean Squared Error for oofs is {total_error}')

    return oofs, preds

---
# Feature Engineering

In [409]:
def join(train_, test_):
    """Stack ``train_`` on top of ``test_`` and renumber the rows from 0."""
    stacked = pd.concat([train_, test_], axis=0, ignore_index=True)
    return stacked

def split(df, n_train=None):
    """Undo ``join``: cut ``df`` back into its train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first ``n_train`` rows are the training rows.
    n_train : int, optional
        Number of leading training rows. Defaults to the row count of the
        module-level ``train`` frame, which is what the function always used
        before this parameter existed.

    Returns
    -------
    (train_, test_) : tuple of pandas.DataFrame
        ``train_`` keeps its row labels; ``test_`` is renumbered from 0.
    """
    if n_train is None:
        n_train = train.shape[0]

    # Copy the head so that adding columns to it later does not act on a slice
    # of `df` (which triggers SettingWithCopy warnings).
    train_ = df.iloc[:n_train].copy()
    test_ = df.iloc[n_train:].reset_index(drop=True)
    return train_, test_

In [384]:
# Combined frame (train rows first, then test rows) used for feature engineering.
df = join(train_=train, test_=test)

In [385]:
# df = join(train, test)
# df['Store_Count'] = df.groupby('Store')['Date'].transform('count')
# train_feat, test_feat = split(df)

In [386]:
# df = join(train_feat, test_feat)
# df['Store_Cus_Count'] = df.groupby('Store')['Customers'].transform('sum')/df['Store_Count']
# train_feat, test_feat = split(df)

In [387]:
# df = join(train_feat, test_feat)
# df['Store_Mean_Sales'] = df.groupby('Store')['Sales'].transform('mean')
# train_feat, test_feat = split(df)

In [395]:
# df = join(train_feat, test_feat)
# df['Open_Closed_Sales'] = df.groupby(['Store', 'Open'])['Sales'].transform('mean')
# train_feat, test_feat = split(df)
# test_feat['Open_Closed_Sales'].fillna(0, inplace=True)

columns = ['SchoolHoliday', 'StateHoliday', 'Promo']

# Per-store rolling sum over the current row and up to six preceding rows.
# NOTE(review): "preceding" means preceding in row-index order, as in the
# original code; the frame is not sorted by Date here - confirm that this is
# the intended window.
#
# The rolling result is indexed by (Store, original row label). Dropping the
# Store level leaves the original row labels, so the new columns can be
# attached by index. The previous version merged on "Store" alone, which pairs
# every row of a store with every rolled row of that store.
bwd = (
    df.sort_index()
      .groupby('Store')[columns]
      .rolling(7, min_periods=1)
      .sum()
      .droplevel(0)
      .add_suffix('_bw')
)

# Drop any '_bw' columns left from an earlier run so the cell can be re-run.
df = df.drop(columns=list(bwd.columns), errors='ignore').join(bwd)

# The cells below read `train_feat` / `test_feat`; all other assignments of
# those names in this notebook are commented out, so create them here.
train_feat, test_feat = split(df)

In [410]:
# Model inputs after feature engineering: every column of `train_feat` except
# the target, the raw date and the customer count.
excluded_columns = (target, date, customer)
features = [column for column in train_feat.columns if column not in excluded_columns]

In [411]:
# xgb = XGBRegressor(random_state=1, tree_method='gpu_hist', n_estimators = 500)
# lgb = LGBMRegressor(random_state=1, n_estimators = 400)

In [414]:
# Hyper-parameter set for the XGBoost regressor, one entry per line.
xgb_params_2 = {
    'n_estimators': 1496,
    'learning_rate': 0.15096576157577118,
    'max_depth': 8,
    'colsample_bytree': 0.6155314602051218,
    'subsample': 0.7039538177172424,
    'min_child_weight': 17,
}

# Fixed seed; 'gpu_hist' is the tree construction method requested from XGBoost.
xgb = XGBRegressor(**xgb_params_2, tree_method='gpu_hist', random_state=5)

In [415]:
%%time
# Out-of-fold and fold-averaged test predictions for the tuned XGBoost model.
# Numbers noted next to earlier runs: 749, 637
# (presumably scores of previous runs - TODO confirm).
xgb_oofs, xgb_preds = cross_val(
    regressor=xgb,
    train=train_feat,
    test=test_feat,
    features=features,
    name='xgb',
)


[0]	validation_0-rmse:6044.18408
[100]	validation_0-rmse:769.80560
[200]	validation_0-rmse:697.17108
[300]	validation_0-rmse:660.17297
[400]	validation_0-rmse:640.26642
[500]	validation_0-rmse:624.68304
[600]	validation_0-rmse:614.33551
[700]	validation_0-rmse:606.61017
[800]	validation_0-rmse:600.52978
[900]	validation_0-rmse:596.56799
[1000]	validation_0-rmse:592.79828
[1100]	validation_0-rmse:589.76440
[1200]	validation_0-rmse:586.91412
[1300]	validation_0-rmse:585.05695
[1400]	validation_0-rmse:583.55255
[1495]	validation_0-rmse:582.24493

 Root Mean Squared Error for Validation set is : 582.2370893568828

[0]	validation_0-rmse:6055.73828
[100]	validation_0-rmse:761.54211
[200]	validation_0-rmse:690.72699
[300]	validation_0-rmse:655.50061
[400]	validation_0-rmse:635.77881
[500]	validation_0-rmse:622.07031
[600]	validation_0-rmse:612.38684
[700]	validation_0-rmse:605.03052
[800]	validation_0-rmse:599.32190
[900]	validation_0-rmse:594.72644
[1000]	validation_0-rmse:590.94427
[1100]	

In [416]:
# Row numbers 0..n-1 of the test frame. This is defined first: the cell
# previously used `index` one statement before creating it, which fails on a
# fresh kernel.
index = list(range(test.shape[0]))

# Store the averaged predictions positionally, then force rows with Open == 0
# to a prediction of 0.
test['Predictions'] = xgb_preds
test.loc[test['Open'] == 0, 'Predictions'] = 0
preds = test['Predictions']

# Submission table: one (index, Sales) row per test row. `to_numpy()` pairs the
# values with `index` by position, independent of the row labels of `test`.
ss = pd.DataFrame({'index': index, 'Sales': preds.to_numpy()})

In [417]:
# Write the submission next to the dataset folder, without the frame's row labels.
submission_file = path + "\\xgb_base_tuned.csv"
ss.to_csv(submission_file, index=False)

In [228]:
def objective(trial, cv=KFold(10, shuffle = True, random_state = 1999)):
    """Optuna objective: validation RMSE of an XGBoost regressor.

    Hyper-parameters are sampled from ``trial``, the model is fitted on an
    80/20 hold-out split with early stopping on the validation part, and the
    RMSE on that validation part is returned (lower is better).

    Parameters
    ----------
    trial : optuna trial object
        Provides ``suggest_int`` / ``suggest_float`` for the search space.
    cv : cross-validator, optional
        Kept for signature compatibility; the function evaluates on a single
        hold-out split and does not use it.

    Returns
    -------
    float
        Root mean squared error on the validation split.
    """
    param_xgb = {
        "random_state": 1,
        'objective': 'reg:squarederror',
        # XGBoost's name for the evaluation metric is `eval_metric`; the
        # previous key "metric" is not an XGBoost parameter.
        "eval_metric": "rmse",
        "verbosity": 0,
        "n_estimators": trial.suggest_int('n_estimators', 1, 1600),
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 1),
        'max_depth': trial.suggest_int('max_depth', 1, 8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.001, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
    }

    # NOTE(review): `xgb_features` and `targets` are not defined in the visible
    # part of this notebook, and `features` is a list of column names in the
    # cells above, so `features[xgb_features]` will not select columns from a
    # DataFrame. Left as written - supply the intended frame/target before use.
    X_trn, X_valid, y_trn, y_valid = train_test_split(
        features[xgb_features], targets,
        test_size=0.20, random_state=42, shuffle=True,
    )

    # Use the imported class directly: in this notebook the name `xgb` is bound
    # to an XGBRegressor *instance*, so `xgb.XGBRegressor` does not resolve.
    model = XGBRegressor(**param_xgb, tree_method='gpu_hist')

    model.fit(X_trn, y_trn, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=0)

    preds = model.predict(X_valid)

    rmse = np.sqrt(mean_squared_error(y_valid, preds))

    return rmse

In [None]:
# Switch for the hyper-parameter search: set to False to skip it.
OPTUNA_OPTIMIZATION = True
# Number of trials the study runs.
N_TRIALS = 20

# NOTE(review): `optuna` is not imported in the import cell visible at the top
# of this notebook - confirm it is imported before this cell runs.
if OPTUNA_OPTIMIZATION:
    # The objective returns an RMSE, so the study minimises it.
    study = optuna.create_study(direction="minimize", study_name='XGB')
    study.optimize(objective, n_trials=N_TRIALS)