# Library imports

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib

# Data preprocessing

## Data reading

In [2]:
# Load the pre-built dataset from the Kaggle input directory.
# NOTE(review): the preview output later in this notebook shows "Unnamed: 0" and
# "Unnamed: 0.1" columns, which look like row indexes written by earlier to_csv
# calls — confirm whether they should be dropped here.
source_csv = "/kaggle/input/depi-dataset/data.csv"
final_data = pd.read_csv(filepath_or_buffer=source_csv)

In [3]:
# Mean-encode each item: every row receives the average `sales` of its item_id.
# NOTE(review): the mean is taken over every row currently in final_data, so if a
# train/test split is made later, held-out sales feed into this feature — confirm
# that this is acceptable.
final_data["item_id_enc"] = (
    final_data.groupby("item_id")["sales"].transform("mean")
)

# Break item_id apart on "_" into its three components.
item_part_cols = ["item_category", "item_subcategory", "item_number"]
final_data[item_part_cols] = final_data["item_id"].str.split("_", expand=True)

# The last two components are cast to integers; the category stays a string.
for int_col in item_part_cols[1:]:
    final_data[int_col] = final_data[int_col].astype(int)

In [4]:
# The state code is the text before the first underscore of store_id
# (e.g. "CA_1" -> "CA").
final_data["state_id"] = (
    final_data["store_id"]
    .str.split("_")
    .str.get(0)
)

In [5]:
# item_id has already been decomposed into item_category / item_subcategory /
# item_number and mean-encoded as item_id_enc, so the raw identifier is removed.
final_data = final_data.drop(columns=["item_id"])

In [6]:
# Columns that are ordinal-encoded and then min-max scaled.
categorical_cols = [
    "store_id",
    "event_name_1",
    "event_name_2",
    "event_type_1",
    "event_type_2",
    "state_id",
    "item_category",
    "item_subcategory",
]

# Columns handed through to the output unchanged.
numeric_cols = [
    "wm_yr_wk",
    "wday",
    "snap",
    "year",
    "month",
    "day",
    "price",
    "lag_1",
    "item_id_enc",
]

# Categories that were not present at fit time are encoded as -1 before scaling.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", ordinal_encoder),
        ("scaler", MinMaxScaler()),
    ]
)
numeric_transformer = "passthrough"

# No `remainder` is passed, so columns in neither list are handled by
# ColumnTransformer's default (drop).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_cols),
        ("num", numeric_transformer, numeric_cols),
    ]
)

## Data splitting

In [7]:
# Downcast the integer-valued columns to int16 (presumably to shrink the frame's
# memory footprint). Each column is listed, and therefore cast, exactly once.
int16_columns = [
    "sales",
    "wm_yr_wk",
    "wday",
    "snap",
    "year",
    "month",
    "day",
    "lag_1",
    "item_subcategory",
    "item_number",
]
INT16_MIN, INT16_MAX = -(2 ** 15), 2 ** 15 - 1

for column in int16_columns:
    values = final_data[column]
    # Casting to a narrower NumPy integer dtype does not raise on out-of-range
    # values, so fail loudly here instead of storing corrupted numbers.
    if values.min() < INT16_MIN or values.max() > INT16_MAX:
        raise ValueError(f"Column {column!r} has values outside the int16 range")
    final_data[column] = values.astype("int16")

In [8]:
# Show the first rows as a sanity check of the engineered columns.
preview_rows = 10
final_data.head(n=preview_rows)

Unnamed: 0.1,Unnamed: 0,store_id,sales,wm_yr_wk,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap,year,month,day,price,lag_1,item_id_enc,item_category,item_subcategory,item_number,state_id
0,0,CA_1,3,1,1,No event,No event,No event,No event,0,2011,1,29,2.0,1,0.640199,FOODS,1,1,CA
1,1,CA_1,0,1,2,No event,No event,No event,No event,0,2011,1,30,0.0,3,0.640199,FOODS,1,1,CA
2,2,CA_1,0,1,3,No event,No event,No event,No event,0,2011,1,31,0.0,0,0.640199,FOODS,1,1,CA
3,3,CA_1,1,1,4,No event,No event,No event,No event,1,2011,2,1,2.0,0,0.640199,FOODS,1,1,CA
4,4,CA_1,4,1,5,No event,No event,No event,No event,1,2011,2,2,2.0,1,0.640199,FOODS,1,1,CA
5,5,CA_1,2,1,6,No event,No event,No event,No event,1,2011,2,3,2.0,4,0.640199,FOODS,1,1,CA
6,6,CA_1,0,1,7,No event,No event,No event,No event,1,2011,2,4,0.0,2,0.640199,FOODS,1,1,CA
7,7,CA_1,2,2,1,No event,No event,No event,No event,1,2011,2,5,2.0,0,0.640199,FOODS,1,1,CA
8,8,CA_1,0,2,2,SuperBowl,Sporting,No event,No event,1,2011,2,6,0.0,2,0.640199,FOODS,1,1,CA
9,9,CA_1,0,2,3,No event,No event,No event,No event,1,2011,2,7,0.0,0,0.640199,FOODS,1,1,CA


In [9]:
# Persist the engineered frame without writing the row index.
# NOTE(review): CSV stores plain text, so the int16 dtypes applied earlier are not
# preserved in this file; whoever reads it back must re-apply them.
processed_csv = "/kaggle/working/data.csv"
final_data.to_csv(path_or_buf=processed_csv, index=False)


In [10]:
# Serialize the preprocessing ColumnTransformer so it can be loaded elsewhere.
# NOTE(review): no .fit() call appears in this notebook, so the pickled object is
# unfitted — confirm that the consumer fits it before calling transform.
preprocessor_path = "/kaggle/working/pre-proccess.pkl"
joblib.dump(preprocessor, filename=preprocessor_path)

['/kaggle/working/pre-proccess.pkl']