# Library importing

In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import joblib

# data pre-proccessing

## data reading

In [None]:
data = pd.read_csv("/kaggle/input/depi-dataset/data.csv")

In [None]:
data.head(3)

In [7]:
final_data = pd.read_csv("/kaggle/input/data-cleaning/preprocessing")
final_data.drop(columns='Unnamed: 0',inplace =True)

In [10]:
final_data.head(3)

Unnamed: 0.1,Unnamed: 0,item_id,store_id,sales,wm_yr_wk,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap,year,month,day,price
0,0,FOODS_1_001,CA_1,3,1,1,No event,No event,No event,No event,0,2011,1,29,2.0
1,1,FOODS_1_001,CA_1,0,2,3,No event,No event,No event,No event,1,2011,2,7,0.0
2,2,FOODS_1_001,CA_1,0,15,2,Mother's day,Cultural,No event,No event,1,2011,5,8,0.0


## feature engineering

In [None]:
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

In [None]:
data['price']= data['price_in_dollars']/data['sales']
data.fillna(0,inplace=True)

In [None]:
data['wm_yr_wk']=data['wm_yr_wk']%100

In [None]:
data.drop(columns = ['d'],inplace = True)
data.drop(columns=['date'],inplace = True)
data.drop(columns = ['price_in_dollars'],inplace = True)

## Encoding

In [8]:
categorical_cols = ['item_id','store_id','event_name_1','event_name_2','event_type_1','event_type_2']
numeric_cols = ["wm_yr_wk", "wday", "snap","year", "month", "day", "price"]

In [9]:
categorical_transformer = Pipeline(steps=[
    ("encoder", OrdinalEncoder()),
    ("scaler", MinMaxScaler())
])
numeric_transformer = "passthrough"
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols)
])

In [None]:
preprocessor

In [13]:
joblib.dump(preprocessor,'preprocessing_pipeline.pkl')

['preprocessing_pipeline.pkl']

## Data spliting

In [10]:
y=final_data['sales']
x=final_data.drop('sales',axis = 1)

# Model

In [11]:
x=preprocessor.fit_transform(x)

In [23]:
model = XGBRegressor(
    bjective = 'reg:squarederror',
    booster =  "gbtree",
    n_estimator = 500 ,
    max_depth = 20 ,
    random_state = 42 ,
    learning_rate = 0.01 ,
    eval_metric = 'logloss',
    early_stopping_rounds = 10
)

In [13]:
x_temp , x_test , y_temp , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 42)
x_train , x_val , y_train , y_val = train_test_split(x_temp , y_temp , test_size = 0.15 , random_state = 42)

In [24]:
model.fit(x_train,y_train,eval_set = [(x_val , y_val)])

Parameters: { "bjective", "n_estimator" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-logloss:-4.79757
[1]	validation_0-logloss:-4.85978
[2]	validation_0-logloss:-4.91369
[3]	validation_0-logloss:-4.96180
[4]	validation_0-logloss:-5.00545
[5]	validation_0-logloss:-5.04555
[6]	validation_0-logloss:-5.08272
[7]	validation_0-logloss:-5.11742
[8]	validation_0-logloss:-5.15006
[9]	validation_0-logloss:-5.18081
[10]	validation_0-logloss:-5.20992
[11]	validation_0-logloss:-25.69261
[12]	validation_0-logloss:-27.30663
[13]	validation_0-logloss:-27.76731
[14]	validation_0-logloss:-28.05293
[15]	validation_0-logloss:-28.26274
[16]	validation_0-logloss:-28.43023
[17]	validation_0-logloss:-28.57030
[18]	validation_0-logloss:-28.69029
[19]	validation_0-logloss:-28.79655
[20]	validation_0-logloss:-28.89159
[21]	validation_0-logloss:-28.97779
[22]	validation_0-logloss:-29.05666
[23]	validation_0-logloss:-29.12944
[24]	validation_0-logloss:-29.19700
[25]	validation_0-logloss:-29.26008
[26]	validation_0-logloss:-29.31930
[27]	validation_0-logloss:-29.37504
[28]	validati

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,10
,enable_categorical,False


In [27]:
model.score(x_train,y_train) 

0.6329197883605957

In [28]:
model.score(x_test,y_test)

0.5742210745811462