In [1]:
import numpy as np, pandas as pd
#import matplotlib.pyplot as plt
import xgboost as xgb
import optuna
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed NumPy's global RNG so the np.random.rand() draw that defines the
# train/validation split is the same on every Restart & Run All.
np.random.seed(42)

In [3]:
def rmse(arr, arr_pred):
    """Return the root-mean-squared error between two equally sized sequences.

    Parameters
    ----------
    arr : array-like
        Observed values.
    arr_pred : array-like
        Predicted values, compared position by position with ``arr``.

    Returns
    -------
    float
        ``sqrt(mean((arr - arr_pred) ** 2))``.
    """
    residuals = np.asarray(arr, dtype=float) - np.asarray(arr_pred, dtype=float)
    # Square the residuals (``** 2``); multiplying by 2 would let positive and
    # negative errors cancel and can even produce a negative mean.
    return np.sqrt(np.mean(residuals ** 2))

In [4]:
df = pd.read_csv('data/train.csv')

In [5]:
# Attach one uniform(0, 1) draw per row; the next cells threshold it at 0.8
# to form the train/validation split.
# NOTE(review): `rand_` is never dropped afterwards, so it is still present in
# X_train / X_val and is fed to the model as a feature — confirm that is intended.
df['rand_'] = np.random.rand(df.shape[0])

In [6]:
# Rows with rand_ < 0.8 form the training split. Take an explicit copy: later
# cells assign columns on `train`, and doing that on a boolean-mask slice of
# `df` triggers SettingWithCopyWarning and leaves ownership of the data ambiguous.
train = df[df.rand_ < .8].copy()

In [7]:
# Rows with rand_ >= 0.8 form the validation split. Copy so that downstream
# column assignments operate on an independent frame rather than a slice of `df`.
validation = df[df.rand_ >= .8].copy()

In [8]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
rand_                           0
dtype: int64

In [9]:
validation.isna().sum()

Item_Identifier                0
Item_Weight                  254
Item_Fat_Content               0
Item_Visibility                0
Item_Type                      0
Item_MRP                       0
Outlet_Identifier              0
Outlet_Establishment_Year      0
Outlet_Size                  475
Outlet_Location_Type           0
Outlet_Type                    0
Item_Outlet_Sales              0
rand_                          0
dtype: int64

### Missing Value Imputation

Impute Item Weight

In [10]:
# Lookup table of the distinct (Item_Identifier, Item_Weight) pairs among rows
# whose weight is present. Built from the full `df`, i.e. both splits.
id_weight = (
    df.loc[df['Item_Weight'].notna(), ['Item_Identifier', 'Item_Weight']]
    .drop_duplicates()
)

In [11]:
id_weight

Unnamed: 0,Item_Identifier,Item_Weight
0,FDA15,9.300
1,DRC01,5.920
2,FDN15,17.500
3,FDX07,19.200
4,NCD19,8.930
...,...,...
7298,NCW05,20.250
7373,FDS09,8.895
7421,FDU43,19.350
7944,FDO49,10.600


In [12]:
# Write the item-weight lookup to disk, without the DataFrame index column.
id_weight.to_csv('id_weight.csv', index = False)

In [13]:
# Item_Identifier -> Item_Weight dictionary; if an identifier appears more
# than once, the last row wins.
map_ = id_weight.set_index('Item_Identifier')['Item_Weight'].to_dict()

In [14]:
# Fill missing weights with the weight recorded for the same item elsewhere;
# items absent from the lookup stay NaN.
train['Item_Weight'] = train['Item_Weight'].fillna(
    train['Item_Identifier'].map(map_)
)

In [15]:
train.isna().sum()

Item_Identifier                 0
Item_Weight                     3
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1935
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
rand_                           0
dtype: int64

Impute Outlet Size

In [16]:
train[['Outlet_Identifier', 'Outlet_Size']].drop_duplicates()

Unnamed: 0,Outlet_Identifier,Outlet_Size
0,OUT049,Medium
3,OUT010,
4,OUT013,High
5,OUT018,Medium
8,OUT045,
9,OUT017,
13,OUT046,Small
18,OUT027,Medium
19,OUT035,Small
23,OUT019,Small


In [17]:
train[['Outlet_Location_Type', 'Outlet_Type', 'Outlet_Size']].drop_duplicates()

Unnamed: 0,Outlet_Location_Type,Outlet_Type,Outlet_Size
0,Tier 1,Supermarket Type1,Medium
3,Tier 3,Grocery Store,
4,Tier 3,Supermarket Type1,High
5,Tier 3,Supermarket Type2,Medium
8,Tier 2,Supermarket Type1,
13,Tier 1,Supermarket Type1,Small
18,Tier 3,Supermarket Type3,Medium
19,Tier 2,Supermarket Type1,Small
23,Tier 1,Grocery Store,Small


In [18]:
# Distinct (location tier, outlet type, size) combinations seen in training
# rows whose Outlet_Size is known; used below to impute the missing sizes.
outlet_size = (
    train.loc[
        train['Outlet_Size'].notna(),
        ['Outlet_Location_Type', 'Outlet_Type', 'Outlet_Size'],
    ]
    .drop_duplicates()
)
outlet_size

Unnamed: 0,Outlet_Location_Type,Outlet_Type,Outlet_Size
0,Tier 1,Supermarket Type1,Medium
4,Tier 3,Supermarket Type1,High
5,Tier 3,Supermarket Type2,Medium
13,Tier 1,Supermarket Type1,Small
18,Tier 3,Supermarket Type3,Medium
19,Tier 2,Supermarket Type1,Small
23,Tier 1,Grocery Store,Small


In [19]:
# Write the outlet-size lookup to disk, without the DataFrame index column.
outlet_size.to_csv('outlet_size.csv', index = False)

In [20]:
# Key = Outlet_Location_Type concatenated directly with Outlet_Type (no
# separator), value = Outlet_Size.
# NOTE(review): `outlet_size` contains the same (tier, type) pair with more than
# one size (Tier 1 / Supermarket Type1 appears as both Medium and Small); dict()
# keeps only the last occurrence, so the mapping silently picks one of them.
map_os1 = dict(zip(outlet_size.Outlet_Location_Type + outlet_size.Outlet_Type, outlet_size.Outlet_Size))
map_os1

{'Tier 1Supermarket Type1': 'Small',
 'Tier 3Supermarket Type1': 'High',
 'Tier 3Supermarket Type2': 'Medium',
 'Tier 3Supermarket Type3': 'Medium',
 'Tier 2Supermarket Type1': 'Small',
 'Tier 1Grocery Store': 'Small'}

In [21]:
# First imputation pass: look the size up by (location tier + outlet type).
outlet_key = train['Outlet_Location_Type'] + train['Outlet_Type']
train['Outlet_Size'] = train['Outlet_Size'].fillna(outlet_key.map(map_os1))

In [22]:
train.isna().sum()

Item_Identifier                0
Item_Weight                    3
Item_Fat_Content               0
Item_Visibility                0
Item_Type                      0
Item_MRP                       0
Outlet_Identifier              0
Outlet_Establishment_Year      0
Outlet_Size                  453
Outlet_Location_Type           0
Outlet_Type                    0
Item_Outlet_Sales              0
rand_                          0
dtype: int64

In [23]:
# Coarser fallback lookup keyed on Outlet_Type alone.
# NOTE(review): an outlet type that occurs with several sizes in `outlet_size`
# collapses to whichever row comes last, because dict() keeps the final value
# seen for a repeated key.
map_os2 = dict(zip(outlet_size.Outlet_Type, outlet_size.Outlet_Size))

In [24]:
# Second imputation pass: rows still missing a size after the (tier + type)
# lookup fall back to the size associated with their outlet type alone.
train['Outlet_Size'] =  train.Outlet_Size.fillna((train.Outlet_Type).map(map_os2))

In [25]:
train.isna().sum()

Item_Identifier              0
Item_Weight                  3
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
rand_                        0
dtype: int64

### Encode Categorical Variables

In [26]:
train.Item_Fat_Content.value_counts()

Item_Fat_Content
Low Fat    4050
Regular    2355
LF          248
reg          93
low fat      91
Name: count, dtype: int64

In [27]:
# Item Fat Content
# Labels are lower-cased first so the spelling variants ('Low Fat', 'LF',
# 'low fat' / 'Regular', 'reg') collapse onto two codes: 0 = low fat, 1 = regular.
# A dedicated name is used for this mapping so the global `map_` keeps holding
# the item-weight lookup; reusing `map_` here would silently break the weight
# imputation cell if it were re-run afterwards.
map_fc = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}

train['Item_Fat_Content'] = train.Item_Fat_Content.str.lower().map(map_fc)

In [28]:
train.Item_Fat_Content.value_counts()

Item_Fat_Content
0    4389
1    2448
Name: count, dtype: int64

In [29]:
# Integer-encode Item_Type. The code of each category is its position in this
# list, so the order must not be changed.
item_types = [
    'Fruits and Vegetables',
    'Snack Foods',
    'Household',
    'Frozen Foods',
    'Dairy',
    'Canned',
    'Baking Goods',
    'Health and Hygiene',
    'Soft Drinks',
    'Meat',
    'Breads',
    'Hard Drinks',
    'Others',
    'Starchy Foods',
    'Breakfast',
    'Seafood',
]
map_it = {name: code for code, name in enumerate(item_types)}

train['Item_Type'] = train['Item_Type'].map(map_it)

In [30]:
# Replace the establishment year with the outlet's age in years. The column
# keeps its original name here and is renamed to 'Years_Open' in the next cell.
# NOTE(review): 2025 is a hard-coded reference year; the same constant must be
# used wherever new data is encoded, otherwise the feature shifts.
train['Outlet_Establishment_Year'] = 2025 - train.Outlet_Establishment_Year

In [31]:
# The column now holds an age rather than a year, so give it a matching name.
# Rebind instead of using inplace=True: `train` originates from a slice of `df`,
# and in-place structural edits on such frames are a common source of
# hidden-state surprises and pandas copy warnings.
train = train.rename(columns={'Outlet_Establishment_Year': 'Years_Open'})

In [32]:
# Ordinal encoding of outlet size: Small < Medium < High.
map_os = dict(Small=0, Medium=1, High=2)

train['Outlet_Size'] = train['Outlet_Size'].map(map_os)

In [33]:
train.Outlet_Type.value_counts()

Outlet_Type
Supermarket Type1    4452
Grocery Store         884
Supermarket Type3     778
Supermarket Type2     723
Name: count, dtype: int64

In [34]:
# Integer-encode Outlet_Type; the code is the label's position in the list.
outlet_types = ['Grocery Store', 'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3']
map_ot = {label: code for code, label in enumerate(outlet_types)}

train['Outlet_Type'] = train['Outlet_Type'].map(map_ot)

In [35]:
train.Outlet_Location_Type.value_counts()

Outlet_Location_Type
Tier 3    2699
Tier 2    2225
Tier 1    1913
Name: count, dtype: int64

In [36]:
# Integer-encode the location tier: 'Tier 1' -> 0, 'Tier 2' -> 1, 'Tier 3' -> 2.
map_olt = {f'Tier {tier}': tier - 1 for tier in (1, 2, 3)}

train['Outlet_Location_Type'] = train['Outlet_Location_Type'].map(map_olt)

In [37]:
# Feature matrix: every column except the two identifier columns and the target.
X_train = train.drop(
    columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'],
    errors='ignore',
)

In [38]:
X_train.isna().sum()

Item_Weight             3
Item_Fat_Content        0
Item_Visibility         0
Item_Type               0
Item_MRP                0
Years_Open              0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
rand_                   0
dtype: int64

In [39]:
# Regression target for the training split.
y_train = train['Item_Outlet_Sales']

In [40]:
# Wrap the training features and target in XGBoost's native data container.
dtrain = xgb.DMatrix(X_train, label=y_train)

In [41]:
def process_data(df, id_weight, outlet_size, reference_year=2025):
    """Impute missing values and integer-encode the categorical columns.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing at least Item_Identifier, Item_Weight,
        Item_Fat_Content, Item_Type, Outlet_Establishment_Year, Outlet_Size,
        Outlet_Location_Type and Outlet_Type. Item_Outlet_Sales is optional.
    id_weight : pandas.DataFrame
        Lookup with columns Item_Identifier and Item_Weight, used to fill
        missing item weights.
    outlet_size : pandas.DataFrame
        Lookup with columns Outlet_Location_Type, Outlet_Type and Outlet_Size,
        used to fill missing outlet sizes.
    reference_year : int, default 2025
        Year from which Outlet_Establishment_Year is subtracted to obtain
        Years_Open.

    Returns
    -------
    X : pandas.DataFrame
        Encoded features: every column except Item_Identifier,
        Outlet_Identifier and Item_Outlet_Sales, in their original order.
    y : pandas.Series or None
        The Item_Outlet_Sales column, or None when ``df`` has no such column.
    """
    # Work on a copy so the caller's frame is not modified (the caller often
    # passes a slice of a larger frame, and the column rename below must not
    # leak back into it).
    out = df.copy()

    # Item Weight: reuse the weight recorded for the same item elsewhere.
    map_iw = dict(zip(id_weight.Item_Identifier, id_weight.Item_Weight))
    out['Item_Weight'] = out.Item_Weight.fillna(out.Item_Identifier.map(map_iw))

    # Outlet Size, pass 1: lookup keyed on (location tier + outlet type).
    map_os1 = dict(zip(outlet_size.Outlet_Location_Type + outlet_size.Outlet_Type, outlet_size.Outlet_Size))
    out['Outlet_Size'] = out.Outlet_Size.fillna((out.Outlet_Location_Type + out.Outlet_Type).map(map_os1))

    # Outlet Size, pass 2: fall back to a lookup keyed on outlet type alone.
    map_os2 = dict(zip(outlet_size.Outlet_Type, outlet_size.Outlet_Size))
    out['Outlet_Size'] = out.Outlet_Size.fillna(out.Outlet_Type.map(map_os2))

    # Item Fat Content: lower-case first so spelling variants share a code.
    map_fc = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}
    out['Item_Fat_Content'] = out.Item_Fat_Content.str.lower().map(map_fc)

    # Item Type
    map_it = {'Fruits and Vegetables':    0,
    'Snack Foods':               1,
    'Household':                 2,
    'Frozen Foods':              3,
    'Dairy':                     4,
    'Canned':                    5,
    'Baking Goods':              6,
    'Health and Hygiene':        7,
    'Soft Drinks':               8,
    'Meat':                      9,
    'Breads':                    10,
    'Hard Drinks':               11,
    'Others':                    12,
    'Starchy Foods':             13,
    'Breakfast':                 14,
    'Seafood':                   15}
    out['Item_Type'] = out.Item_Type.map(map_it)

    # Outlet_Establishment_Year -> Years_Open (age relative to reference_year).
    out['Outlet_Establishment_Year'] = reference_year - out.Outlet_Establishment_Year
    out = out.rename(columns={'Outlet_Establishment_Year': 'Years_Open'})

    # Outlet Size (ordinal)
    map_os = {'Small': 0, 'Medium': 1, 'High': 2}
    out['Outlet_Size'] = out.Outlet_Size.map(map_os)

    # Outlet Type
    map_ot = {'Grocery Store': 0, 'Supermarket Type1': 1, 'Supermarket Type2': 2, 'Supermarket Type3': 3}
    out['Outlet_Type'] = out.Outlet_Type.map(map_ot)

    # Outlet Location Type
    map_olt = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
    out['Outlet_Location_Type'] = out.Outlet_Location_Type.map(map_olt)

    # Split into features and target. The target is optional so the same
    # function can prepare unlabeled rows.
    X = out.drop(columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'], errors='ignore')
    y = out['Item_Outlet_Sales'] if 'Item_Outlet_Sales' in out.columns else None

    return X, y

In [42]:
# Prepare the validation split with process_data, reusing the lookup tables
# (id_weight, outlet_size) that were built in the cells above.
X_val, y_val = process_data(validation, id_weight, outlet_size)

In [43]:
X_val.isna().sum()

Item_Weight             1
Item_Fat_Content        0
Item_Visibility         0
Item_Type               0
Item_MRP                0
Years_Open              0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
rand_                   0
dtype: int64

In [44]:
# Wrap the validation features and target in XGBoost's native data container.
dval = xgb.DMatrix(X_val, label=y_val)

In [45]:
# NOTE(review): this rebuilds exactly the same DMatrix as the previous cell;
# one of the two statements is redundant.
dval = xgb.DMatrix(X_val, y_val)

def objective(trial):
    """Optuna objective: fit one XGBoost regressor and return its validation RMSE.

    Hyperparameters are sampled from ``trial``; the model is trained on the
    module-level ``dtrain`` and evaluated on ``dval`` with early stopping
    (patience of 20 rounds). The returned value is the booster's best
    evaluation score.
    """
    # Sampled hyperparameters. The suggest_* calls are issued in a fixed order
    # (learning_rate ... alpha, then n_estimators) so sampling stays stable.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }
    # Settings that are held constant across trials.
    fixed = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'seed': 42,
    }
    # Upper bound on boosting rounds; early stopping may end training sooner.
    max_rounds = trial.suggest_int('n_estimators', 100, 500)

    booster = xgb.train(
        {**sampled, **fixed},
        dtrain,
        num_boost_round=max_rounds,
        evals=[(dval, 'eval')],
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    return booster.best_score

    
# Minimise validation RMSE over 1000 trials. The sampler is seeded explicitly:
# without a seed Optuna proposes different trials on every run, so the selected
# hyperparameters (and everything downstream) would not be reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials = 1000)


[I 2026-02-23 02:22:18,593] A new study created in memory with name: no-name-6fcd4d43-938c-433e-a439-6f2a138c4d7f
[I 2026-02-23 02:22:18,758] Trial 0 finished with value: 1167.5665063347574 and parameters: {'learning_rate': 0.08967895939639114, 'max_depth': 9, 'subsample': 0.849562755027562, 'colsample_bytree': 0.9331540759934758, 'lambda': 1.7003769746770048e-06, 'alpha': 4.6800255766248484e-08, 'n_estimators': 455}. Best is trial 0 with value: 1167.5665063347574.
[I 2026-02-23 02:22:18,901] Trial 1 finished with value: 1119.741631788252 and parameters: {'learning_rate': 0.014599649775179711, 'max_depth': 4, 'subsample': 0.7911916980828662, 'colsample_bytree': 0.627488808429843, 'lambda': 0.5094186051576881, 'alpha': 0.037728808491652034, 'n_estimators': 241}. Best is trial 1 with value: 1119.741631788252.
[I 2026-02-23 02:22:18,972] Trial 2 finished with value: 1254.5974719328763 and parameters: {'learning_rate': 0.011310283119789365, 'max_depth': 5, 'subsample': 0.6246905780404987, 

In [46]:
best_params = study.best_params

In [47]:
best_params

{'learning_rate': 0.024867459075911542,
 'max_depth': 3,
 'subsample': 0.717226893981375,
 'colsample_bytree': 0.991920556003412,
 'lambda': 0.8115351875796194,
 'alpha': 0.000553701376915715,
 'n_estimators': 388}

In [48]:
# 'n_estimators' is the boosting-round count, which xgb.train takes as a
# separate argument rather than as a booster parameter. Read it from the study
# and then strip it from the dict, so that re-running this cell does not raise
# KeyError once the key has already been removed.
best_num_round = study.best_params['n_estimators']
best_params.pop('n_estimators', None)

In [49]:
best_num_round

388

In [50]:
print("Best validation RMSE:", study.best_value)

Best validation RMSE: 1085.2595396786378


### Retrain Final Model on Full Data

In [51]:
# Stack the two splits with pd.concat so the result stays a DataFrame / Series.
# np.concatenate would reduce them to bare arrays, discarding the column names
# and per-column dtypes, so the final booster would only know its features as
# positional f0..fN and could not check column names or order at predict time.
# NOTE: the booster trained on this data therefore stores the feature names, so
# prediction input must be built from a DataFrame with these same columns.
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [52]:
# Data container for the final fit on all labelled rows (train + validation).
dtrain_full = xgb.DMatrix(X, label=y)

In [53]:
# study.best_params only contains the values sampled by Optuna, not the fixed
# settings the objective added on top. Re-apply them so the final model is
# trained with the same objective, metric and seed as the tuning runs (without
# this the seed falls back to XGBoost's default, which changes the row/column
# subsampling). 'n_estimators' is filtered out because it is passed separately
# as the number of boosting rounds.
final_params = {
    **{name: value for name, value in best_params.items() if name != 'n_estimators'},
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 42,
}
reg = xgb.train(final_params, dtrain_full, num_boost_round=best_num_round)
                

In [54]:
# Serialise the trained booster to disk with pickle.
# NOTE(review): a pickle is tied to the Python / xgboost versions that wrote it
# and must only be loaded from trusted sources; XGBoost's own
# Booster.save_model format is the more portable option if this file needs to
# outlive the current environment.
filename = 'sales_pred_model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(reg, f)