In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor, cv, Pool

**Preaparing data sets**# 

In [4]:
train_data=pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test_data=pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
holidays_events=pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")
oil=pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv") 
train_data["date"]=pd.to_datetime(train_data["date"])
test_data["date"]=pd.to_datetime(test_data["date"])
holidays_events["date"]=pd.to_datetime(holidays_events["date"])
oil["date"]=pd.to_datetime(oil["date"])

In [5]:
train_q1=train_data["sales"].quantile(0.25)
train_q3=train_data["sales"].quantile(0.75)
IQL=train_q3-train_q1
down_limit=train_q1-1.5*IQL
up_limit=train_q3+1.5*IQL

#remove outlier observations from the data set
train_data["outliers"]=(train_data["sales"]<down_limit)|(train_data["sales"]>up_limit)
train_data.drop((train_data.loc[train_data["outliers"]==True]).index, inplace=True)
train_data=train_data.drop(["outliers"], axis=1)

In [6]:
#remove transferred=True from holiday_events
holidays_events.drop((holidays_events.loc[holidays_events["transferred"]==True]).index, inplace=True)
holidays_events.drop((holidays_events.loc[holidays_events["type"]=="Transfer"]).index, inplace=True)

In [7]:
# Adding the holidays corresponding to the dates in the train data set in the type column in the holidays_events data set to the train data set as holiday
holidays_events_train=holidays_events.iloc[:328,:]
train_data = train_data.merge(holidays_events_train[["date", "type"]], 
                              on="date", 
                              how="left")
train_data["type"]=train_data["type"].fillna("Not_Holiday")
# Adding the holidays corresponding to the datasets in the test data set in the type column in the holidays_events data set to the train data set as holiday
holidays_events_test=holidays_events.iloc[328:,:]
test_data = test_data.merge(holidays_events_test[["date", "type"]], 
                              on="date", 
                              how="left")
test_data["type"]=test_data["type"].fillna("Not_Holiday")
# Adding the fuel prices corresponding to the dates in the train data set in the dcoilwtico column in the oildata set to the train data set as a price
oil_train=oil.iloc[:1207,:]
oil_train["dcoilwtico"]=oil_train["dcoilwtico"].fillna(oil["dcoilwtico"].mean())
train_data = train_data.merge(oil_train[["date", "dcoilwtico"]], 
                              on="date", 
                              how="left")
train_data["dcoilwtico"]=train_data["dcoilwtico"].fillna(oil["dcoilwtico"].mean())
# Adding the fuel prices corresponding to the datasets in the test data set in the dcoilwtico column in the oildata set to the train data set as a price
oil_test=oil.iloc[1207:,:]
test_data = test_data.merge(oil_test[["date", "dcoilwtico"]], 
                              on="date", 
                              how="left")
test_data["dcoilwtico"]=test_data["dcoilwtico"].fillna(oil["dcoilwtico"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oil_train["dcoilwtico"]=oil_train["dcoilwtico"].fillna(oil["dcoilwtico"].mean())


In [8]:
# assign date column in train data set as index
train_data.index=pd.to_datetime(train_data["date"])
train_data=train_data.drop(["id", "date"], axis=1)

# assign date column in test data set as index
test_data.index=pd.to_datetime(test_data["date"])
test_data=test_data.drop(["date"], axis=1)


train_data.rename(columns={'type': 'Holiday_type'}, inplace=True)
test_data.rename(columns={'type': 'Holiday_type'}, inplace=True)

In [9]:
#Add to tran and test data month, dayofweek and day columns to enhance model
train_data["month"]=train_data.index.month
test_data["month"]=test_data.index.month
train_data["dayofweek"]=train_data.index.dayofweek
test_data["dayofweek"]=test_data.index.dayofweek
train_data["dayofmonth"]=train_data.index.day
test_data["dayofmonth"]=test_data.index.day

train_data["payday"] = train_data["dayofmonth"].apply(lambda x: 1 if x in [15, 30] else 0)
test_data["payday"] = test_data["dayofmonth"].apply(lambda x: 1 if x in [15, 30] else 0)

In [10]:
col=train_data.select_dtypes(include="object").columns
le=LabelEncoder()
for col in col:
    train_data[col]=le.fit_transform(train_data[col])
    test_data[col]=le.fit_transform(test_data[col])

In [11]:
train_data["sales"] = np.log1p(train_data["sales"])
X=train_data.drop(["sales"], axis=1)
y=pd.DataFrame(train_data["sales"])
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
cat = CatBoostRegressor()
model=cat.fit(X_train, y_train)

Learning rate set to 0.136884
0:	learn: 2.0086973	total: 206ms	remaining: 3m 26s
1:	learn: 1.9151226	total: 327ms	remaining: 2m 43s
2:	learn: 1.8451212	total: 442ms	remaining: 2m 27s
3:	learn: 1.7814560	total: 559ms	remaining: 2m 19s
4:	learn: 1.7325070	total: 682ms	remaining: 2m 15s
5:	learn: 1.6942782	total: 797ms	remaining: 2m 12s
6:	learn: 1.6517813	total: 916ms	remaining: 2m 9s
7:	learn: 1.6264442	total: 1.04s	remaining: 2m 8s
8:	learn: 1.5955076	total: 1.15s	remaining: 2m 7s
9:	learn: 1.5731137	total: 1.26s	remaining: 2m 5s
10:	learn: 1.5553955	total: 1.38s	remaining: 2m 4s
11:	learn: 1.5343323	total: 1.49s	remaining: 2m 2s
12:	learn: 1.5167954	total: 1.6s	remaining: 2m 1s
13:	learn: 1.5018147	total: 1.73s	remaining: 2m 1s
14:	learn: 1.4924830	total: 1.84s	remaining: 2m
15:	learn: 1.4811628	total: 1.99s	remaining: 2m 2s
16:	learn: 1.4657049	total: 2.12s	remaining: 2m 2s
17:	learn: 1.4570819	total: 2.23s	remaining: 2m 1s
18:	learn: 1.4483218	total: 2.35s	remaining: 2m 1s
19:	learn

In [None]:
params={ "learning_rate":[0.01, 0.1, 0.03],
    "iterations":[300, 500, 1000],
    "depth":[6,7, 8]}
cat_tune=GridSearchCV(cat, params, cv=10, n_jobs=-1)
cat_tune_model=cat_tune.fit(X_train, y_train)


In [None]:
cat_tuned_model=CatBoostRegressor(
    learning_rate=cat_tune.best_params_("learning_rate"),
    iterations=cat_tune.best_params_("iterations"),
    depth=cat_tune.best_params_("depth").fit(X_train, y_train)



In [None]:
y_pred=cat_tuned_model.predict(X_test)
y_pred = np.expm1(y_pred)
y_pred = np.maximum(y_pred, 0)
y_pred = y_pred.reshape(-1, 1)
y_test= np.expm1(y_test)


In [None]:
print(f"RMSLE Error: {np.sqrt(mean_squared_log_error(y_test, y_pred))}")

In [None]:
test_data_new=test_data.drop("id", axis=1)
y_pred_real=model.predict(test_data_new)
y_pred_real = y_pred_real.reshape(-1, 1)
y_pred_real = np.expm1(y_pred_real)
y_pred_real = pd.DataFrame(np.maximum(y_pred_real, 0))
y_pred_real.reset_index(drop=True, inplace=True)
test_data["id"].reset_index(drop=True, inplace=True)
y_pred_real = pd.concat([test_data["id"], y_pred_real], axis=1)
y_pred_real.rename(columns={0: 'sales'}, inplace=True)
y_pred_real.index=y_pred_real["id"]
y_pred_real=y_pred_real.drop("id", axis=1)
pd.DataFrame(y_pred_real).to_csv("/kaggle/working/submission.csv")