In [None]:
# import the basic libraries we will use in this kernel
import os
import numpy as np
import pandas as pd
import pickle

import time
import datetime
from datetime import datetime
import calendar

from sklearn import metrics
from math import sqrt

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from string import punctuation

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.preprocessing import LabelEncoder

import itertools
import warnings

# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings, so API changes only surface once they become hard errors.
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [None]:
# Mount Google Drive so the dataset under /content/drive/MyDrive is reachable.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the prepared sales table; the first CSV column becomes the row index.
sales_df = pd.read_csv("/content/drive/MyDrive/MASTER EN DATA SCIENCE/Capstone/data_dsmarket/GrupoRetail_1/Proceso Total/BBDD output/DF_FINAL.csv", index_col = 0)

In [None]:
# Columns removed from the raw table before any further processing.
cols_to_drop = ["day", "sales_2011", "sales_2012", "sales_2013"]

sales_df.drop(columns = cols_to_drop, inplace = True)

In [None]:
# Column dtypes and non-null counts of the loaded table.
sales_df.info()

In [None]:
# Number of missing values per column.
sales_df.isnull().sum()

In [None]:
# Switch to the column names the rest of the notebook relies on
# (unique_id, item_cnt_day, item_price, revenue).
sales_df.rename(columns={'id': 'unique_id', 'units': 'item_cnt_day', 'sell_price': 'item_price', "sales" : "revenue"}, inplace=True)

In [None]:
# Keep only the rows whose revenue is strictly above the current mean revenue.
# A boolean mask is used rather than dropping by index label: a label-based
# drop() also removes unrelated rows whenever index labels repeat.
# Rows with missing revenue are kept (the comparison is False for NaN).
revenue_mean = sales_df["revenue"].mean()
print(revenue_mean)
print(len(sales_df))
at_or_below_mean = sales_df["revenue"] <= revenue_mean
sales_df = sales_df.loc[~at_or_below_mean].copy()
print(len(sales_df))

In [None]:
# Second pass of the mean-revenue filter: the mean is recomputed on the rows
# that survived the previous cell, so the threshold is higher this time.
# NOTE(review): running this cell again tightens the filter further each time.
# A boolean mask is used rather than dropping by index label: a label-based
# drop() also removes unrelated rows whenever index labels repeat.
revenue_mean = sales_df["revenue"].mean()
print(revenue_mean)
print(len(sales_df))
at_or_below_mean = sales_df["revenue"] <= revenue_mean
sales_df = sales_df.loc[~at_or_below_mean].copy()
print(len(sales_df))

In [None]:
# Inspect the Python type of the first date value before converting the column.
type(sales_df["date"].iloc[0])

In [None]:
# convert the date column to datetime
# the format is given explicitly (year-month-day) so parsing is unambiguous
sales_df["date"] = pd.to_datetime(sales_df["date"], format = "%Y-%m-%d")

In [None]:
# First and last dates present in the (filtered) sales data.
# NOTE(review): an earlier comment here described a 30.09.2015 cutoff and a
# Kaggle "predict October" task; the horizon actually used below is the
# hard-coded max_date_test = 2016-04-18.
min_date = sales_df["date"].min()
max_date_sales = sales_df["date"].max()

In [None]:
max_date_sales

In [None]:
# End of the date grid (hard-coded).
max_date_test = datetime(2016, 4, 18)

In [None]:
# Weekly date grid from the first sale date up to max_date_test.
date_range = pd.date_range(min_date, max_date_test, freq = "W")
print("We have a total of {} weeks".format(len(date_range)))
date_range

In [None]:
# Distinct series identifiers that remain after the revenue filter.
unique_id = sales_df["unique_id"].unique()

In [None]:
# Every (date, unique_id) combination; its length is dates x identifiers.
cartesian_product = pd.MultiIndex.from_product([date_range, unique_id], names = ["date", "unique_id"])
len(cartesian_product)

In [None]:
# Recompute revenue as units sold times unit price; this overwrites the
# revenue column that came from the renamed `sales` column.
sales_df["revenue"] = sales_df["item_cnt_day"] * sales_df["item_price"]

In [None]:
sales_df.head(5)

In [None]:
# Distribution of unit prices.
sales_df.item_price.describe()

In [None]:
print(len(sales_df))

In [None]:
st = time.time()

# Index by date so each series can be binned on it below.
gb_df = sales_df.set_index("date")

# One group per unique_id.
gb_df = gb_df.groupby(["unique_id"])

# Resample every series to weekly bins ("W"): units and revenue are summed,
# the unit price is averaged. Aggregations are given by name, which is the
# form pandas recommends over passing the numpy callables.
gb_df = gb_df.resample("W").agg({"item_cnt_day": "sum", "item_price": "mean", "revenue": "sum"})

# Turn the (unique_id, date) index back into ordinary columns.
gb_df.reset_index(inplace = True)

et = time.time()

print("Total time in minutes to preprocess took {}".format((et - st)/60))

In [None]:
gb_df.head(2)

In [None]:
print(len(gb_df))

In [None]:
# Distinct combinations of the calendar/flag columns and unique_id, saved for a later merge.
# NOTE(review): year/month/weekday_int look like per-day attributes; if so this
# table holds several rows per unique_id — confirm before merging on unique_id alone.
additional_info_2 = sales_df[["year", "month", "weekday_int", "weekend_sales", "event_dummy", "summer_sales", "christmas_sales","sales_2014", "sales_2015", "sales_2016", "unique_id"]].drop_duplicates()
additional_info_2.to_pickle("./additional_info_2")

In [None]:
# Checkpoint both frames to the working directory.
gb_df.to_pickle("gb_df.pkl")
sales_df.to_pickle("sales_df.pkl")

In [None]:
# Reload the checkpoints written in the previous cell.
gb_df = pd.read_pickle("./gb_df.pkl")
sales_df = pd.read_pickle("./sales_df.pkl")

In [None]:
# Rebuild the date x unique_id grid after reloading the checkpoints.
# The grid must use the same weekly frequency ("W") as the resampled gb_df:
# it is joined to gb_df on `date`, and a month-end grid would only line up
# with the weekly labels on the rare Sundays that are also month ends.
min_date = sales_df["date"].min()
max_date_sales = sales_df["date"].max()
# End of the date grid (hard-coded).
max_date_test = datetime(2016, 4, 18)
date_range = pd.date_range(min_date, max_date_test, freq = "W")
print("We have a total of {} weeks".format(len(date_range)))
unique_id = sales_df["unique_id"].unique()
cartesian_product = pd.MultiIndex.from_product([date_range, unique_id], names = ["date", "unique_id"])
len(cartesian_product)

In [None]:
sales_df.head(2)

In [None]:
full_df = pd.DataFrame(index = cartesian_product).reset_index()

full_df = pd.merge(full_df, gb_df, on = ['date','unique_id'], how = 'left')

full_df["item_cnt_day"] = np.clip(full_df["item_cnt_day"], 0, 30)

In [None]:
print(len(sales_df))
print(len(gb_df))

In [None]:
gb_df.revenue.describe()

In [None]:
# Drop aggregated rows whose revenue is at or below the mean aggregated revenue.
# NOTE(review): full_df was already built from gb_df above, so this filter does
# not change full_df; gb_df itself is only displayed afterwards.
y = gb_df["revenue"].mean()
print(y)
print(len(gb_df))
gb_df.drop(gb_df[gb_df['revenue'] <= y].index, inplace = True)
print(len(gb_df))

In [None]:
gb_df.head()

In [None]:
print(full_df.shape)

In [None]:
full_df.info()

In [None]:
sales_df.info()

In [None]:
# Distinct descriptive attributes per unique_id (region, store, category, ...).
additional_info = sales_df[["region", "store_code", "store", "category", "item", "department", "unique_id"]].drop_duplicates()

In [None]:
# Attach the descriptive attributes to every grid row; the two printed shapes
# show whether the merge changed the row count.
print(full_df.shape)

full_df = pd.merge(
    left = full_df,
    right = additional_info,
    how = "left",
    on = "unique_id"
)
print(full_df.shape)

In [None]:
full_df.info()

In [None]:
full_df["item_cnt_day"].fillna(0, inplace = True)
full_df["item_price"].fillna(0, inplace = True)
full_df["revenue"].fillna(0, inplace = True)

In [None]:
full_df["item_price"] = full_df.groupby("unique_id")["item_price"].apply(lambda series: series.backfill().ffill())

In [None]:
# Remaining missing values per column after the fills.
full_df.isnull().sum()

In [None]:
# Checkpoint the grid to the working directory.
full_df.to_pickle("full_df.pkl")

In [None]:
# NOTE(review): PATH_DATA is assigned but not used; the pickle is read from
# the working directory.
PATH_DATA = "../input/df-time-series"
full_df = pd.read_pickle("./full_df.pkl")

In [None]:
# Integer-encode the categorical descriptors.
# One fit_transform per column is sufficient: encoding the resulting integer
# codes a second time maps every code to itself. The fitted encoders are kept
# so the codes can be translated back to the original labels with
# label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in ["region", "store_code", "category", "department", "item"]:
    encoder = LabelEncoder()
    full_df[column] = encoder.fit_transform(full_df[column])
    label_encoders[column] = encoder

In [None]:
def build_ts_features(full_df, gb_list, agg_func, agg_func_name, target_column = "item_cnt_day", verbose = True) -> pd.DataFrame:
    '''
    Aggregate ``target_column`` per ``gb_list`` group and add a one-step lag of it.

    Parameters
    ----------
    full_df : DataFrame holding every column named in ``gb_list`` plus ``target_column``.
    gb_list : list of column names to group by. The first entry is the axis the
        lag runs along; the remaining entries identify the series within which
        the lag is taken. A single-entry list lags over the whole aggregate.
    agg_func : callable applied to the target values of each group.
    agg_func_name : suffix used when naming the generated columns.
    target_column : column to aggregate.
    verbose : print the name of the feature being built.

    Returns
    -------
    DataFrame with the ``gb_list`` columns, ``<keys>_<agg_func_name>`` (the
    aggregate) and ``<keys>_<agg_func_name>_shift_1`` (the aggregate of the
    previous row of the same series, NaN for the first row of each series),
    where ``<keys>`` is ``gb_list`` joined with underscores.
    '''

    # create dynamic names
    feature_name = "_".join(gb_list)
    agg_column = feature_name + "_" + agg_func_name
    lag_column = agg_column + "_" + "shift_1"

    if verbose: print(f"Working with {feature_name}")

    # aggregate the target per group; groupby sorts by the keys, so rows of a
    # series come out ordered by the first key
    features_df = (
        full_df.groupby(gb_list)[target_column]
        .apply(agg_func)
        .reset_index()
        .rename(columns = {target_column : agg_column})
    )

    # lag feature: previous row of the same series. "Previous" means the previous
    # row that exists, so gaps along the first key are not padded.
    series_keys = gb_list[1:]
    if series_keys:
        features_df[lag_column] = features_df.groupby(series_keys)[agg_column].shift(1)
    else:
        # only one grouping key: there is a single series, lag it directly
        features_df[lag_column] = features_df[agg_column].shift(1)

    return features_df

In [None]:
additional_info_2 = pd.read_pickle("./additional_info_2")

In [None]:
additional_info_2.unique_id.value_counts

In [None]:
full_df.unique_id.value_counts

In [None]:
# Attach the calendar/flag columns; the two printed shapes show how the merge
# changes the row count.
# NOTE(review): additional_info_2 was de-duplicated on the calendar columns
# together with unique_id, so it can contain several rows per unique_id.
# Joining on unique_id alone then repeats each grid row once per such row —
# confirm the second shape has the expected number of rows.
print(full_df.shape)

full_df = pd.merge(
    left = full_df,
    right = additional_info_2,
    how = "left",
    on = "unique_id"
)
print(full_df.shape)

In [None]:
# Remove every row with zero revenue.
# NOTE(review): missing revenue was filled with 0 earlier, so this also removes
# all grid rows that had no sales, including any dates beyond the last sale.
print(len(full_df))
full_df.drop(full_df[full_df['revenue'] == 0].index, inplace = True)
print(len(full_df))

In [None]:
full_df.isnull().sum()

In [None]:
# Checkpoint (written with pandas pickle despite the .pk2 extension).
full_df.to_pickle("full_df.pk2")

In [None]:
full_df = pd.read_pickle("./full_df.pk2")

In [None]:
st = time.time()

gb_list = ["date", "store_code"]

shop_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
shop_sales_features.shape

In [None]:
st = time.time()

gb_list = ["date", "region"]

region_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
st = time.time()

gb_list = ["date", "department"]

department_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
department_sales_features

In [None]:
st = time.time()

gb_list = ["date", "month"]

month_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
month_sales_features

In [None]:
st = time.time()

gb_list = ["date", "weekend_sales"]

weekend_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
weekend_sales_features

In [None]:
st = time.time()

gb_list = ["date", "summer_sales"]

summer_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
summer_sales_features

In [None]:
st = time.time()

gb_list = ["date", "christmas_sales"]

christmas_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
christmas_sales_features

In [None]:
st = time.time()

gb_list = ["date", "item"]

item_sales_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
item_sales_features

In [None]:
st = time.time()

gb_list = ["date", "category"]

month_item_category_features = build_ts_features(
    full_df = full_df,
    gb_list = gb_list,
    agg_func = np.sum,
    agg_func_name = "sales_sum",
    target_column = "item_cnt_day",
    verbose = True
)

et = time.time()

(et - st)/60

In [None]:
month_item_category_features.shape

In [None]:
print("Shape before merge is {}".format(full_df.shape))

# (feature table, key it was grouped on), listed in the order in which the
# new columns are appended to full_df.
feature_tables = [
    (shop_sales_features, "store_code"),
    (month_item_category_features, "category"),
    (item_sales_features, "item"),
    (christmas_sales_features, "christmas_sales"),
    (summer_sales_features, "summer_sales"),
    (weekend_sales_features, "weekend_sales"),
    (month_sales_features, "month"),
    (department_sales_features, "department"),
    (region_sales_features, "region"),
]

# Left-join every table on the date and its own grouping key.
for features_df, group_key in feature_tables:
    full_df = full_df.merge(features_df, how = "left", on = ["date", group_key])

# From here on the target column is called "sales".
full_df.rename(columns = {"item_cnt_day":"sales"}, inplace = True)

print("Shape after merge is {}".format(full_df.shape))

In [None]:
# Checkpoint and immediately reload (pandas pickle despite the .pk3 extension).
full_df.to_pickle("full_df.pk3")
full_df = pd.read_pickle("./full_df.pk3")

In [None]:
# save the file with all features and time the write

st = time.time()

full_df.to_pickle("FULL_DF_ALL_FEATURES.pkl")

et = time.time()
# elapsed minutes
(et - st)/60

In [None]:
# load the preprocessed data and cap the target to the [0, 30] range
full_df = pd.read_pickle("./FULL_DF_ALL_FEATURES.pkl")
full_df["sales"] = np.clip(full_df["sales"], 0, 30)

# keep only the rows dated after 2011-01-31
# NOTE(review): the original comment described this as removing the rows whose
# shift(1) lag is undefined — presumably the first period; confirm the cutoff
# date still fits the frequency of the date grid.
full_df = full_df[full_df["date"] > np.datetime64("2011-01-31")]

In [None]:
full_df.info()

In [None]:
# Missing values per column; the lag columns are filled in the next cell.
full_df.isnull().sum()

In [None]:
# shift(1) leaves NaN on the first row of every series; use 0 there.
# The columns are filled through plain assignment: an in-place fillna on a
# selected column is not guaranteed to write back to full_df.
lag_columns = [
    "date_store_code_sales_sum_shift_1",
    "date_category_sales_sum_shift_1",
    "date_item_sales_sum_shift_1",
    "date_summer_sales_sales_sum_shift_1",
    "date_weekend_sales_sales_sum_shift_1",
    "date_month_sales_sum_shift_1",
    "date_department_sales_sum_shift_1",
    "date_region_sales_sum_shift_1",
    "date_christmas_sales_sales_sum_shift_1",
]
full_df[lag_columns] = full_df[lag_columns].fillna(0)

In [None]:
# Columns that must not reach the model: revenue, identifiers, and the
# same-period aggregates (only their shift_1 lags stay in the frame).
identifier_columns = ['revenue', 'store', "unique_id"]
same_period_aggregates = [
    "date_store_code_sales_sum", "date_category_sales_sum", "date_region_sales_sum",
    "date_department_sales_sum", "date_month_sales_sum", "date_weekend_sales_sales_sum",
    "date_summer_sales_sales_sum", "date_christmas_sales_sales_sum", "date_item_sales_sum",
]
cols_to_drop = identifier_columns + same_period_aggregates

full_df.drop(columns = cols_to_drop, inplace = True)

In [None]:
full_df.head()

In [None]:
# ------------------------------------------------------
# separate the dates for train, validation and test

train_index = sorted(list(full_df["date"].unique()))[:-2]

valida_index = [sorted(list(full_df["date"].unique()))[-2]]

test_index = [sorted(list(full_df["date"].unique()))[-1]]

In [None]:
# ------------------------------------------------------
# split the data into train, validation and test dataset
# each X_* frame holds every column except the target and the date,
# each Y_* series holds the target for the same rows

train_rows = full_df["date"].isin(train_index)
valida_rows = full_df["date"].isin(valida_index)
test_rows = full_df["date"].isin(test_index)

non_feature_columns = ['sales', "date"]

X_train = full_df.loc[train_rows].drop(columns = non_feature_columns)
Y_train = full_df.loc[train_rows, 'sales']

X_valida = full_df.loc[valida_rows].drop(columns = non_feature_columns)
Y_valida = full_df.loc[valida_rows, 'sales']

X_test = full_df.loc[test_rows].drop(columns = non_feature_columns)
Y_test = full_df.loc[test_rows, 'sales']

In [None]:
st = time.time()

model = XGBRegressor(seed = 175)

# class name of the estimator, used as the file-name prefix when saving
model_name = str(model).split("(")[0]

# Build the YYYY_MM_DD_HH_MM_SS tag from a single clock reading: reading the
# clock separately for the date and the time can pair values from either side
# of midnight.
now = datetime.now()
day = now.strftime("%Y_%m_%d")
hour = now.strftime("%H_%M_%S")
t = str(day) + "_" + str(hour)

# Train with RMSE evaluated on both the training and the validation set;
# stop when the validation score has not improved for 10 rounds.
# NOTE(review): recent XGBoost releases expect eval_metric and
# early_stopping_rounds on the estimator constructor rather than on fit() —
# confirm against the installed version.
model.fit(X_train, Y_train, eval_metric = "rmse",
    eval_set = [(X_train, Y_train), (X_valida, Y_valida)],
    verbose = True,
    early_stopping_rounds = 10)

et = time.time()

print("Training took {} minutes!".format((et - st)/60))

In [None]:
pickle.dump(model, open("{}_{}.dat".format(model_name, t), "wb"))

In [None]:
print("{}_{}.dat".format(model_name, t))

In [None]:
model = pickle.load(open("{}_{}.dat".format(model_name, t), "rb"))

In [None]:
# Gain-based feature importance, re-ordered by ascending gain.
# NOTE(review): `importance` is not used afterwards; the plot below queries
# the model directly.
importance = model.get_booster().get_score(importance_type = "gain")
importance = {k: v for k, v in sorted(importance.items(), key = lambda item: item[1])}

fig, ax = plt.subplots(figsize = (10, 15))
plot_importance(model, importance_type = "gain", ax = ax);

In [None]:
# Root mean squared error on the validation date.
Y_valida_pred = model.predict(X_valida)

rmse_valida = sqrt(metrics.mean_squared_error(Y_valida, Y_valida_pred))
rmse_valida

In [None]:
# Re-running this cell must not feed an earlier prediction back into the
# model, so a leftover prediction column is discarded first (no-op if absent).
X_test.drop(columns = ["sales_predicted"], errors = "ignore", inplace = True)

# Predict the test rows and cap the result to the same [0, 30] range as the target.
Y_test_predict = model.predict(X_test)
X_test["sales_predicted"] = np.clip(Y_test_predict, 0, 30)

In [None]:
X_test.head()

In [None]:
X_test.info()

In [None]:
# Export the test features together with the predictions.
# The unique_id column is only added in the next cell, so it is not in this file.
X_test.to_csv("/content/drive/MyDrive/MASTER EN DATA SCIENCE/Capstone/data_dsmarket/GrupoRetail_1/Proceso Total/BBDD output/STOCK_PREDICTION_DF_10.csv", index = False)

In [None]:
# Compose an identifier as "<store_code>-<item>".
# NOTE(review): store_code and item were label-encoded to integers earlier, so
# this yields ids such as "3-152". Verify that they match the `id` values of
# DF_FINAL.csv; otherwise the left merge below attaches no predictions.
X_test["unique_id"] = X_test["store_code"].map(str) + "-" + X_test["item"].map(str)

In [None]:
# Identifier and prediction only, ready to be joined.
X_test_short = X_test[["unique_id", "sales_predicted"]]

In [None]:
# NOTE(review): PATH_DATA is assigned but not used in the remaining cells.
PATH_DATA = "../input/df-time-series"

In [None]:
# Reload the full input table to attach the predictions to it.
# Unlike the first load, no index column is specified here.
# NOTE(review): this rebinds test_index, which previously held the list of
# test dates used by the train/validation/test split cell.
test_index = pd.read_csv("/content/drive/MyDrive/MASTER EN DATA SCIENCE/Capstone/data_dsmarket/GrupoRetail_1/Proceso Total/BBDD output/DF_FINAL.csv")

In [None]:
test_index.info()

In [None]:
# Make the id a string and give it the same column name as in X_test_short.
test_index["id"] = test_index["id"].map(str)
test_index.rename(columns = {"id":"unique_id"}, inplace = True)

In [None]:
# Left join: every input row is kept; rows whose unique_id has no prediction
# get NaN in sales_predicted.
FINAL_STOCK_PRED_DF = pd.merge(test_index, X_test_short, left_on = "unique_id", right_on = "unique_id", how = "left")

In [None]:
FINAL_STOCK_PRED_DF.to_csv("/content/drive/MyDrive/MASTER EN DATA SCIENCE/Capstone/data_dsmarket/GrupoRetail_1/Proceso Total/BBDD output/FINAL_STOCK_PRED_DF_10.csv", index = False)

In [None]:
FINAL_STOCK_PRED_DF.head(2)