In [None]:
import pandas as pd

In [None]:
# Path of the CSV that is loaded into `df`.
DATA_PATH = '/content/cut_market_info.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,month,dcoilwtico,city,state,type,cluster,family,class,perishable,holiday_type,locale,locale_name,description,transferred
0,2016-10-26,6,1489881,6.000,1.0,2016-10,48.75,Quito,Pichincha,D,13,PRODUCE,2018,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
1,2014-06-02,9,953609,7.000,0.0,2014-06,103.07,Quito,Pichincha,B,6,CLEANING,3046,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
2,2017-06-18,21,1946155,5.000,0.0,2017-06,44.73,Santo Domingo,Santo Domingo de los Tsachilas,B,6,DELI,2650,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
3,2016-07-09,46,119023,6.000,0.0,2016-07,45.37,Quito,Pichincha,A,14,CLEANING,3026,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
4,2016-08-04,43,760319,7.707,0.0,2016-08,41.92,Esmeraldas,Esmeraldas,E,10,POULTRY,2416,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191917,2015-08-25,17,220432,10.000,0.0,2015-08,39.15,Quito,Pichincha,C,12,GROCERY I,1080,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
191918,2014-07-03,30,890375,5.000,0.0,2014-07,104.76,Guayaquil,Guayas,C,3,GROCERY I,1002,0,Holiday,Local,El Carmen,Cantonizacion de El Carmen,Holiday
191919,2014-08-13,44,795610,38.000,0.0,2014-08,97.57,Quito,Pichincha,A,5,FROZEN FOODS,2220,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
191920,2013-10-01,34,213788,3.000,,2013-10,102.09,Guayaquil,Guayas,B,6,GROCERY I,1096,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay


In [None]:
# Print each column's dtype and non-null count (useful for spotting missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191922 entries, 0 to 191921
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   date          191922 non-null  object 
 1   store_nbr     191922 non-null  int64  
 2   item_nbr      191922 non-null  int64  
 3   unit_sales    191922 non-null  float64
 4   onpromotion   158794 non-null  float64
 5   month         191922 non-null  object 
 6   dcoilwtico    191922 non-null  float64
 7   city          191922 non-null  object 
 8   state         191922 non-null  object 
 9   type          191922 non-null  object 
 10  cluster       191922 non-null  int64  
 11  family        191922 non-null  object 
 12  class         191922 non-null  int64  
 13  perishable    191922 non-null  int64  
 14  holiday_type  191922 non-null  object 
 15  locale        191922 non-null  object 
 16  locale_name   191922 non-null  object 
 17  description   191922 non-null  object 
 18  tran

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,dcoilwtico,cluster,class,perishable
count,191922.0,191922.0,191922.0,158794.0,191922.0,191922.0,191922.0,191922.0
mean,27.436849,971285.6,8.602086,0.075979,61.377082,8.741098,1965.316665,0.252353
std,16.315028,519157.9,29.737848,0.264965,23.743899,4.62889,1153.139895,0.434364
min,1.0,96995.0,0.106,0.0,26.19,1.0,1002.0,0.0
25%,12.0,522383.0,2.0,0.0,45.48,4.0,1052.0,0.0
50%,28.0,957096.0,4.0,0.0,49.95,9.0,1190.0,0.0
75%,43.0,1352757.0,9.0,0.0,91.17,13.0,2712.0,1.0
max,54.0,2127114.0,10000.0,1.0,110.62,17.0,7780.0,1.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# ==============================
# Feature Engineering
# ==============================
def feature_engineering(df):
    """Return a copy of ``df`` with calendar features derived from ``date``.

    The caller's frame is left untouched, so the function can be re-run on
    the same input and never silently changes a frame shown in an earlier
    cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``date`` is converted to datetime and the columns
        ``year``, ``month``, ``day``, ``dayofweek``, ``is_weekend``,
        ``weekofyear`` and ``quarter`` are set. An existing ``month`` column
        is overwritten with the integer month number.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    """
    dates = pd.to_datetime(df['date'])
    day_of_week = dates.dt.dayofweek

    # assign() builds a new frame instead of writing into the argument.
    return df.assign(
        date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=day_of_week,
        # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
        is_weekend=day_of_week.isin([5, 6]).astype(int),
        weekofyear=dates.dt.isocalendar().week.astype(int),
        quarter=dates.dt.quarter,
    )

df = feature_engineering(df)

# ==============================
# Feature matrix, target and preprocessing
# ==============================
y = df['unit_sales']
X = df.drop(columns=['unit_sales', 'date'])

# Columns that are one-hot encoded; every other column of X is treated as numeric.
cat_features = [
    'city', 'state', 'type', 'cluster', 'family', 'class',
    'holiday_type', 'locale', 'locale_name', 'description', 'transferred',
]
num_features = [name for name in X.columns if name not in cat_features]

# Numeric columns: fill missing values with the column median.
# Categorical columns: one-hot encode; categories unseen at fit time encode as all zeros.
numeric_imputer = SimpleImputer(strategy="median")
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_imputer, num_features),
        ("cat", categorical_encoder, cat_features),
    ]
)

# ==============================
# Base Models
# ==============================
# Fixed seed so that re-running the notebook fits the same ensemble; without
# it the random forest's bootstrap sampling differs on every run.
RANDOM_STATE = 42

xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
)
lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    random_state=RANDOM_STATE,
)
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=RANDOM_STATE,
)

# The three base regressors feed their predictions into a Ridge meta-learner.
stack_model = StackingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model)
    ],
    final_estimator=Ridge()
)

# Preprocessing and the stacked model are fit together as one estimator.
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", stack_model)
])


# ==============================
# Train
# ==============================

import numpy as np

# Hold out the last 20% of rows; shuffle=False keeps the frame's row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# The model is trained on log1p(unit_sales); the test target is transformed the same way.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

pipeline.fit(X_train, y_train_log)

# Predictions come back on the log scale; expm1 is the inverse of log1p.
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 153537, number of used features: 514
[LightGBM] [Info] Start training from score 1.750314
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1671
[LightGBM] [Info] Number of data points in the train set: 122829, number of used features: 504
[LightGBM] [Info] Start training from score 1.749481




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1667
[LightGBM] [Info] Number of data points in the train set: 122829, number of used features: 502
[LightGBM] [Info] Start training from score 1.752463




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1671
[LightGBM] [Info] Number of data points in the train set: 122830, number of used features: 504
[LightGBM] [Info] Start training from score 1.749129




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1671
[LightGBM] [Info] Number of data points in the train set: 122830, number of used features: 504
[LightGBM] [Info] Start training from score 1.751412




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1665
[LightGBM] [Info] Number of data points in the train set: 122830, number of used features: 501
[LightGBM] [Info] Start training from score 1.749086




In [None]:
import numpy as np

from sklearn.model_selection import train_test_split

# Features: every column except the target and the date column.
X = df.drop(['unit_sales', 'date'], axis=1)
# Target on the log1p scale.
y = df['unit_sales'].pipe(np.log1p)

# The last 20% of rows form the test set; row order is preserved (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
def feature_engineering(df):
    """Return a copy of ``df`` with calendar features derived from ``date``.

    NOTE: this re-defines the function of the same name from an earlier cell.
    The input frame is not modified, so calling it again on an already
    engineered frame simply recomputes the same columns on a fresh copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``date`` is converted to datetime and the columns
        ``year``, ``month``, ``day``, ``dayofweek``, ``is_weekend``,
        ``weekofyear`` and ``quarter`` are set. An existing ``month`` column
        is overwritten with the integer month number.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    """
    dates = pd.to_datetime(df['date'])
    day_of_week = dates.dt.dayofweek

    # assign() builds a new frame instead of writing into the argument.
    return df.assign(
        date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        dayofweek=day_of_week,
        # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
        is_weekend=day_of_week.isin([5, 6]).astype(int),
        weekofyear=dates.dt.isocalendar().week.astype(int),
        quarter=dates.dt.quarter,
    )

df = feature_engineering(df)

# Columns that are one-hot encoded; here `month` is treated as categorical too.
cat_features = [
    'city', 'state', 'type', 'cluster', 'family', 'class',
    'holiday_type', 'locale', 'locale_name', 'description', 'transferred',
    'month',
]
# Every remaining column of X is handled as numeric.
num_features = [name for name in X.columns if name not in cat_features]

# Numeric columns: median imputation. Categorical columns: one-hot encoding,
# with categories unseen at fit time encoded as all zeros.
numeric_imputer = SimpleImputer(strategy="median")
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_imputer, num_features),
        ("cat", categorical_encoder, cat_features),
    ]
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# AdaBoost model on the log1p-transformed target, using the shared preprocessor.
ada = Pipeline([
    ("preprocess", preprocessor),
    # random_state pins the boosting randomness so reruns report the same metrics.
    ("model", AdaBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42))
])

ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)

# y_test and the predictions are on the log1p scale; invert once so that
# every metric is reported in original unit_sales units.
y_test_units = np.expm1(y_test)
y_pred_ada_units = np.expm1(y_pred_ada)

# Calculate and print metrics
rmse_ada = np.sqrt(mean_squared_error(y_test_units, y_pred_ada_units))
mae_ada = mean_absolute_error(y_test_units, y_pred_ada_units)
r2_ada = r2_score(y_test_units, y_pred_ada_units)

print("AdaBoost RMSE:", rmse_ada)
print("AdaBoost MAE:", mae_ada)
print("AdaBoost R-squared:", r2_ada)

AdaBoost RMSE: 23.547396611367418
