# Library imports

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

# Data pre-processing

## data reading

In [2]:
# Load the input dataset from the Kaggle input directory into a DataFrame.
input_csv_path = "/kaggle/input/depi-dataset/data.csv"
final_data = pd.read_csv(input_csv_path)

In [3]:
# Keys identifying one item in one store during one `wm_yr_wk` week.
week_item_keys = [
    'store_id',
    'item_category',
    'item_subcategory',
    'item_number',
    'wm_yr_wk',
]


def _mean_of_earlier_rows(sales):
    """Return, for each row, the mean of all values that precede it in the group.

    The shift by one excludes the row's own value, so the first row of every
    group gets NaN.
    """
    return sales.shift(1).expanding().mean()


# Running average of the earlier sales rows inside each (store, item, week) group.
# NOTE(review): the result depends on row order within each group; this
# presumably expects rows sorted chronologically - confirm.
final_data['daily_week_avg'] = (
    final_data.groupby(week_item_keys)['sales'].transform(_mean_of_earlier_rows)
)

In [4]:
# Binary flag: 0 when the first event slot holds the 'No event' placeholder,
# 1 for any other value.
has_no_primary_event = final_data['event_name_1'] == 'No event'
final_data['is_event'] = np.where(has_no_primary_event, 0, 1)

In [5]:
# Number of event slots (0, 1 or 2) whose value is not the 'No event'
# placeholder. Computed column-wise instead of with a row-by-row
# `apply(axis=1)`, which runs a Python lambda per row and is very slow on
# large frames; the resulting counts are identical.
event_name_columns = ['event_name_1', 'event_name_2']
final_data['event_count'] = (
    final_data[event_name_columns].ne('No event').sum(axis=1)
)

In [6]:
# Blank out zero averages (set them to NaN) so that a later division by this
# column yields NaN rather than dividing by zero.
weekly_avg = final_data['daily_week_avg']
final_data['daily_week_avg'] = weekly_avg.mask(weekly_avg == 0)

In [7]:
# Ratio of each row's sales to the running average stored in `daily_week_avg`.
# NOTE(review): this uses the row's own `sales` value; if `sales` is the
# prediction target, this feature leaks it - confirm before training on it.
final_data['event_impact'] = final_data['sales'].div(final_data['daily_week_avg'])

In [8]:
# Rows where the ratio is missing get an impact of 0.
impact_ratio = final_data['event_impact']
final_data['event_impact'] = impact_ratio.where(impact_ratio.notna(), 0)

In [9]:
# Remove the columns that are not carried forward, in one in-place call
# rather than six separate ones. A single call also means that a missing
# column name raises before anything is dropped, instead of leaving the
# frame half-modified.
columns_to_drop = [
    'event_name_1',
    'event_type_1',
    'event_name_2',
    'event_type_2',
    'daily_week_avg',
    'state_id',
]
final_data.drop(columns=columns_to_drop, inplace=True)

In [10]:
# Column groups used for in-memory dtype compaction.
categorical_cols = ['store_id', 'item_category', 'item_subcategory']
numeric_cols = [
    "wm_yr_wk", "wday", "snap", "year", "month", "day", "sell_price", "lag_1",
    'price_flag', 'lag_7', 'snap_weekend', 'wday_x_snap', 'is_weekend',
    'event_impact', 'event_count', 'is_event',
]

# Object-typed categorical columns are converted to the pandas `category` dtype.
object_typed_cols = [
    column for column in categorical_cols
    if final_data[column].dtype == "object"
]
for column in object_typed_cols:
    final_data[column] = final_data[column].astype("category")

# Ask pandas to downcast every numeric column to a smaller integer dtype
# where the values allow it.
for column in numeric_cols:
    final_data[column] = pd.to_numeric(final_data[column], downcast="integer")

# `sell_price` additionally gets a float downcast.
final_data['sell_price'] = pd.to_numeric(
    final_data['sell_price'], downcast="float"
)

## Data splitting

In [11]:
# Persist the engineered dataset to the Kaggle working directory, without
# writing the DataFrame index as a column.
output_csv_path = "/kaggle/working/data.csv"
final_data.to_csv(output_csv_path, index=False)

In [12]:
# Column groups routed through the ColumnTransformer built below.
# (The original chained assignment also bound `preprocessor` to this list
# before overwriting it; that stray binding is removed.)
categorical_cols = ['store_id', "item_category", "item_subcategory", 'item_number']
numeric_cols = [
    "wm_yr_wk", "wday", "snap", "year", "month", "day", "sell_price", "lag_1",
    'price_flag', 'lag_7', 'snap_weekend', 'wday_x_snap', 'is_weekend',
    'event_impact', 'event_count', 'is_event',
]

# Categorical columns: ordinal-encode (categories unknown at transform time
# are encoded as -1), then min-max scale the resulting codes.
categorical_transformer = Pipeline(steps=[
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ("scaler", MinMaxScaler()),
])

# Numeric columns are forwarded unchanged.
numeric_transformer = "passthrough"

preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols),
])

# NOTE(review): no `fit` is called in this cell, so the pickled transformer is
# unfitted; whoever loads it has to fit it first - confirm that is intended.
joblib.dump(preprocessor, "/kaggle/working/pre-proccess.pkl")

['/kaggle/working/pre-proccess.pkl']