# Library imports

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib

# Data pre-processing

## Data reading

In [2]:
# Read the dataset CSV from the Kaggle input directory into a DataFrame.
INPUT_CSV_PATH = "/kaggle/input/depi-dataset/data.csv"
final_data = pd.read_csv(INPUT_CSV_PATH)

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
final_data.head(n=5)

Unnamed: 0,store_id,sales,wm_yr_wk,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap,year,month,day,lag_1,item_category,item_subcategory,item_number,state_id,sell_price,price_flag,is_weekend
0,CA_1,3,1,1,No event,No event,No event,No event,0,2011,1,29,1,FOODS,1,1,CA,2.0,0,1
1,CA_1,0,1,2,No event,No event,No event,No event,0,2011,1,30,3,FOODS,1,1,CA,2.0,0,1
2,CA_1,0,1,3,No event,No event,No event,No event,0,2011,1,31,0,FOODS,1,1,CA,2.0,0,0
3,CA_1,1,1,4,No event,No event,No event,No event,1,2011,2,1,0,FOODS,1,1,CA,2.0,0,0
4,CA_1,4,1,5,No event,No event,No event,No event,1,2011,2,2,1,FOODS,1,1,CA,2.0,0,0


In [4]:
# Calendar interactions: the SNAP flag multiplied by the weekend flag and by
# the weekday number.
snap_flag = final_data["snap"]
final_data["snap_weekend"] = snap_flag * final_data["is_weekend"]
final_data["wday_x_snap"] = final_data["wday"] * snap_flag

# Lag feature: within each (category, subcategory, item number, store) group,
# take the sales value from 7 rows earlier. The first 7 rows of every group
# have no such value and are filled with 0.
# NOTE(review): the shift is positional, so this presumably relies on rows
# being in chronological order inside each group - confirm against the data.
lag_group_keys = ["item_category", "item_subcategory", "item_number", "store_id"]
sales_per_series = final_data.groupby(lag_group_keys)["sales"]
final_data["lag_7"] = sales_per_series.shift(7).fillna(0)

In [5]:
# Columns treated as categorical / numeric for the dtype clean-up below.
categorical_cols = [
    "store_id",
    "event_name_1",
    "event_name_2",
    "event_type_1",
    "event_type_2",
    "state_id",
    "item_category",
    "item_subcategory",
]
numeric_cols = [
    "wm_yr_wk",
    "wday",
    "snap",
    "year",
    "month",
    "day",
    "sell_price",
    "lag_1",
    "price_flag",
    "lag_7",
    "snap_weekend",
    "wday_x_snap",
    "is_weekend",
]

# Only object-typed categorical columns are converted to the `category` dtype;
# columns that were parsed as another dtype are left as they are.
object_backed_cols = [
    name for name in categorical_cols if final_data[name].dtype == "object"
]
for name in object_backed_cols:
    final_data[name] = final_data[name].astype("category")

# Ask pandas to shrink each numeric column to the smallest integer dtype that
# can hold its values.
for name in numeric_cols:
    final_data[name] = pd.to_numeric(final_data[name], downcast="integer")

# sell_price gets an additional float downcast after the integer pass above.
final_data["sell_price"] = pd.to_numeric(final_data["sell_price"], downcast="float")

## Data splitting

In [6]:
# Write the current frame to the Kaggle working directory without the index.
OUTPUT_CSV_PATH = "/kaggle/working/data.csv"
final_data.to_csv(OUTPUT_CSV_PATH, index=False)

In [7]:
# Columns sent through the categorical branch (ordinal-encoded, then min-max
# scaled). The list is defined once, in its final order; previously it was
# created through a chained `preprocessor = categorical_cols = [...]`
# assignment and then extended in place, which briefly bound `preprocessor`
# to a plain list.
categorical_cols = [
    "store_id",
    "event_name_1",
    "event_name_2",
    "event_type_1",
    "event_type_2",
    "state_id",
    "item_category",
    "item_subcategory",
    "item_number",
]

# Columns forwarded to the output unchanged.
numeric_cols = [
    "wm_yr_wk",
    "wday",
    "snap",
    "year",
    "month",
    "day",
    "sell_price",
    "lag_1",
    "price_flag",
    "lag_7",
    "snap_weekend",
    "wday_x_snap",
    "is_weekend",
]

# Categories not seen during fitting are encoded as -1 instead of raising.
categorical_transformer = Pipeline(steps=[
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ("scaler", MinMaxScaler()),
])
numeric_transformer = "passthrough"

preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols),
])

# No fit is performed in this cell, so the pickle holds an unfitted
# transformer; whoever loads it has to fit it before calling transform.
joblib.dump(preprocessor, "/kaggle/working/pre-proccess.pkl")

['/kaggle/working/pre-proccess.pkl']