In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.pylab as pylab

%matplotlib inline
matplotlib.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8,6

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load train.csv into rossman_df; later cells read its Open, Sales, Date and
# Store columns. low_memory=False makes pandas parse the file in a single pass
# instead of in chunks, which avoids mixed dtype inference within a column.
rossman_df = pd.read_csv("train.csv", low_memory=False)

In [None]:
# Load store.csv into store_df; it is later joined onto rossman_df via the
# shared 'Store' column. low_memory=False parses the file in a single pass.
store_df = pd.read_csv("store.csv", low_memory=False)

In [None]:
# Remove rows recorded while the store was closed (Open == 0) and rows with
# zero Sales, in one in-place label drop.
closed_or_no_sales = (rossman_df['Open'] == 0) | (rossman_df['Sales'] == 0)
rossman_df.drop(index=rossman_df.index[closed_or_no_sales], inplace=True)

In [None]:
# Parse the Date column once, then derive the calendar features with the
# vectorized .dt accessor instead of calling a Python lambda for every row.
rossman_df['Date'] = pd.to_datetime(rossman_df['Date'])
date_parts = rossman_df['Date'].dt
rossman_df['Year'] = date_parts.year
rossman_df['Month'] = date_parts.month
rossman_df['Day'] = date_parts.day
# ISO-8601 week number (the same value Timestamp.weekofyear reports); pandas
# returns it as an unsigned integer column.
rossman_df['WeekOfYear'] = date_parts.isocalendar().week

In [None]:
# StateHoliday and Open are not used as features; remove them in place.
rossman_df.drop(['StateHoliday', 'Open'], axis=1, inplace=True)


In [None]:
# Store attributes that are discarded before the merge.
unused_store_columns = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
]
store_df = store_df.drop(columns=unused_store_columns)

In [None]:
# Impute missing CompetitionDistance values with the most frequent distance.
# Series.mode() returns a Series (ties are possible); handing that Series to
# fillna aligns on index labels and would leave almost every NaN untouched, so
# take the first modal value as a scalar. Assigning the result back to the
# column also avoids relying on a chained in-place update.
competition_distance_modes = store_df['CompetitionDistance'].mode()
if not competition_distance_modes.empty:
    store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(
        competition_distance_modes.iloc[0]
    )

In [None]:
# Attach the store attributes to every sales row; the left join keeps all rows
# of rossman_df.
df = rossman_df.merge(store_df, on='Store', how='left')

In [None]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values lying
    exactly on a fence are kept, so a column whose IQR is zero (for example a
    constant column) is returned intact instead of being emptied.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_in`` inside the fences, with their original index
        labels. Rows where ``col_name`` is NaN are dropped because NaN fails
        both comparisons.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Inclusive bounds: only values strictly beyond a fence count as outliers.
    within_fences = (values >= fence_low) & (values <= fence_high)
    return df_in.loc[within_fences]

In [None]:
# Filter outliers column by column; each pass recomputes its fences on the rows
# that survived the previous pass, so the order matters.
for outlier_column in ('Sales', 'Customers', 'CompetitionDistance'):
    df = remove_outlier(df, outlier_column)

In [None]:
# One-hot encode the two categorical store attributes as integer 0/1 columns.
categorical_columns = ['StoreType', 'Assortment']
df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

In [None]:
# Columns that are not passed to the model as features.
df = df.drop(columns=['Store', 'Date', 'Year', 'WeekOfYear'])

In [None]:
# Target: Sales, re-indexed from 0. Features: every remaining column (X keeps
# the index labels of df).
y = df['Sales'].reset_index(drop=True)
X = df.drop(columns=['Sales'])

In [None]:
# Hold out 10% of the rows as X_new / y_new. Shuffling is train_test_split's
# default; the fixed random_state makes the split repeatable.
X_train, X_new, y_train, y_new = train_test_split(
    X, y, test_size=0.1, random_state=0
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature of the held-out split to the [0, 1] range; X_new
# becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on X_new itself rather than on X_train --
# confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(X_new)
X_new = scaler.transform(X_new)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost
# XGBoost regressor with library-default hyperparameters; it is the base
# estimator handed to the randomized search below.
regressor = xgboost.XGBRegressor()

In [None]:
# Candidate values for the randomized hyperparameter search. Each list is
# sampled uniformly, so every candidate must appear exactly once.
hyperparameter_grid = {
    # 15 evenly spaced tree counts from 50 to 200, truncated to integers.
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 15, endpoint = True)],
    # np.linspace defaults to 50 points; truncating those to int produced many
    # duplicates and left the top value with a single entry, skewing the
    # sampling. Enumerate the integer range explicitly instead.
    'max_depth': list(range(3, 11)),
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': list(range(1, 11)),
    }

In [None]:
# Randomized search: 100 sampled parameter combinations, each scored with
# 3-fold cross-validation. No scoring argument is given, so candidates are
# ranked by the estimator's own score method. random_state pins the sampled
# combinations so a fresh run reproduces the same search.
rs_model = RandomizedSearchCV(
    regressor,
    param_distributions=hyperparameter_grid,
    n_iter=100,
    cv=3,
    verbose=3,
    random_state=0,
)

In [None]:
# Tune on the training split only. Fitting on the full X, y would let the
# search see the rows held out as X_new / y_new, which are the rows the final
# evaluation is run on.
rs_model.fit(X_train, y_train)

In [None]:
# Display the parameter combination that scored best in the randomized search.
rs_model.best_params_

In [None]:
# Keep a reference to the best parameters found by the search.
# NOTE(review): no visible cell reads rf_params, and the 'rf_' prefix does not
# match the XGBoost estimator that was searched -- confirm intent.
rf_params = rs_model.best_params_

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost

# Per-fold metrics, accumulated over 10 differently seeded K-fold splits of the
# held-out data (KFold's default number of folds per seed).
r2_scores_xgboost = []
mae_scores_xgboost = []
rmse_scores_xgboost = []

# KFold yields positional row numbers. y_new still carries the shuffled index
# labels assigned by train_test_split, so y_new[positions] would be a label
# lookup (KeyError or wrong rows). Convert both to arrays and index by position.
X_eval = np.asarray(X_new)
y_eval = np.asarray(y_new)

for random_seed in range(10):
  kf = KFold(shuffle=True, random_state=random_seed)
  for train_index, test_index in kf.split(X_eval):
    # NOTE: as in the original cell, these names overwrite the X_train /
    # y_train produced by train_test_split.
    X_train, X_test = X_eval[train_index], X_eval[test_index]
    y_train, y_test = y_eval[train_index], y_eval[test_index]

    # NOTE(review): hyperparameters are hard-coded rather than read from
    # rs_model.best_params_ -- presumably copied from an earlier search run.
    xg = xgboost.XGBRegressor(
        subsample = 0.7,
        n_estimators = 189,
        min_child_weight = 3,
        max_depth = 9,
        learning_rate = 0.2,
        gamma = 0.1,
        colsample_bytree = 1,
        verbosity = 2,
    )
    xg.fit(X_train, y_train)

    y_pred = xg.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores_xgboost.append(r2)
    print(f'R2-score of this fold is {r2}')

    mae = mean_absolute_error(y_test, y_pred)
    mae_scores_xgboost.append(mae)
    print(f'MAE of this fold is {mae}')

    # Take the square root explicitly: the squared=False flag of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores_xgboost.append(rmse)
    print(f'RMSE of this fold is {rmse}')
