# Library importing

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib

# data pre-proccessing

## data reading

In [2]:
final_data = pd.read_csv("/kaggle/input/depi-dataset/data.csv")

In [3]:
price=pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")

## Data spliting

In [4]:
final_data.head()

Unnamed: 0,store_id,sales,wm_yr_wk,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap,year,month,day,price,lag_1,item_id_enc,item_category,item_subcategory,item_number,state_id
0,CA_1,3,1,1,No event,No event,No event,No event,0,2011,1,29,2.0,1,0.640199,FOODS,1,1,CA
1,CA_1,0,1,2,No event,No event,No event,No event,0,2011,1,30,0.0,3,0.640199,FOODS,1,1,CA
2,CA_1,0,1,3,No event,No event,No event,No event,0,2011,1,31,0.0,0,0.640199,FOODS,1,1,CA
3,CA_1,1,1,4,No event,No event,No event,No event,1,2011,2,1,2.0,0,0.640199,FOODS,1,1,CA
4,CA_1,4,1,5,No event,No event,No event,No event,1,2011,2,2,2.0,1,0.640199,FOODS,1,1,CA


In [5]:
final_data['item_id']=(final_data['item_category']+"_"
                      +final_data['item_subcategory'].astype(str)+"_"
                      +final_data['item_number'].astype(str).str.zfill(3))

In [6]:
final_data["wm_yr_wk"] = ("1"
                          +final_data["year"].astype(str).str[2:]
                          +final_data['wm_yr_wk'].astype(str).str.zfill(2))

In [7]:
final_data.drop(columns='price',inplace = True)

In [8]:
final_data["wm_yr_wk"] = final_data["wm_yr_wk"].astype(int)

In [9]:
final_data = final_data.merge(
    price,
    on=["wm_yr_wk", "item_id","store_id"],
    how="left"
)

In [10]:
final_data.isna().sum()

store_id                   0
sales                      0
wm_yr_wk                   0
wday                       0
event_name_1               0
event_type_1               0
event_name_2               0
event_type_2               0
snap                       0
year                       0
month                      0
day                        0
lag_1                      0
item_id_enc                0
item_category              0
item_subcategory           0
item_number                0
state_id                   0
item_id                    0
sell_price          13017673
dtype: int64

In [11]:
final_data.isna().sum() / len(final_data)

store_id            0.000000
sales               0.000000
wm_yr_wk            0.000000
wday                0.000000
event_name_1        0.000000
event_type_1        0.000000
event_name_2        0.000000
event_type_2        0.000000
snap                0.000000
year                0.000000
month               0.000000
day                 0.000000
lag_1               0.000000
item_id_enc         0.000000
item_category       0.000000
item_subcategory    0.000000
item_number         0.000000
state_id            0.000000
item_id             0.000000
sell_price          0.223183
dtype: float64

In [12]:
final_data['price_flag'] = final_data['sell_price'].isna().astype(int)
final_data['sell_price'] = final_data.groupby(['item_id','store_id'])['sell_price'].ffill()
final_data['sell_price'] = final_data.groupby(['item_id','store_id'])['sell_price'].bfill()


In [13]:
final_data.drop(columns='item_id',inplace = True)
final_data['wm_yr_wk']=final_data['wm_yr_wk']%100
final_data['wm_yr_wk']=final_data['wm_yr_wk'].astype('int16')

In [14]:
final_data['price_flag']=final_data['price_flag'].astype('int16')
final_data['sell_price'] = pd.to_numeric(final_data['sell_price'], downcast="float")

In [15]:
categorical_cols = ['store_id','event_name_1','event_name_2','event_type_1','event_type_2',"state_id"]
numeric_cols = ["wm_yr_wk", "wday", "snap","year", "month", "day", "sell_price","item_id","lag_1",'price_flag']
numeric_cols += ["item_id_enc"]
categorical_cols += ["item_category", "item_subcategory"]
categorical_transformer = Pipeline(steps=[
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ("scaler", MinMaxScaler())
])
numeric_transformer = "passthrough"
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols)
])

In [16]:
final_data.to_csv("/kaggle/working/data.csv",index= False)

In [17]:
preprocessor = joblib.load("/kaggle/input/depi-dataset/pre-proccess.pkl")

In [18]:
joblib.dump(preprocessor,"/kaggle/working/pre-proccess.pkl")

['/kaggle/working/pre-proccess.pkl']