In [1]:
import numpy as np, pandas as pd
#import matplotlib.pyplot as plt
import xgboost as xgb
import pickle

In [2]:
# Seed NumPy's legacy global RNG so any NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)

In [3]:
def process_X(df, id_weight, outlet_size, reference_year=2025):
    """Encode a raw sales frame into the numeric feature frame fed to the model.

    The input is copied before any column is touched, so the caller's ``df``
    is never modified and the function can be re-run on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Columns read here: Item_Identifier, Item_Weight,
        Item_Fat_Content, Item_Type, Outlet_Establishment_Year, Outlet_Size,
        Outlet_Location_Type and Outlet_Type.
    id_weight : pandas.DataFrame
        Lookup with Item_Identifier and Item_Weight columns, used to fill
        missing item weights.
    outlet_size : pandas.DataFrame
        Lookup with Outlet_Location_Type, Outlet_Type and Outlet_Size columns,
        used to fill missing outlet sizes.
    reference_year : int, default 2025
        Year from which Outlet_Establishment_Year is subtracted to obtain
        Years_Open. NOTE(review): this presumably has to match the value used
        when the model was fitted - confirm before changing it.

    Returns
    -------
    pandas.DataFrame
        Every column of ``df`` except Item_Identifier, Outlet_Identifier and
        Item_Outlet_Sales (when present), with Outlet_Establishment_Year
        replaced by Years_Open and the categorical columns integer-coded.
        Labels that are absent from the mapping tables below come out as NaN.
    """
    # Never write into the caller's frame: the steps below overwrite and rename
    # columns, which would make a second call on the same frame fail.
    df = df.copy()

    # Item Weight: fill gaps from the per-item lookup.
    weight_by_item = dict(zip(id_weight.Item_Identifier, id_weight.Item_Weight))
    df['Item_Weight'] = df.Item_Weight.fillna(df.Item_Identifier.map(weight_by_item))

    # Outlet Size, first pass: look up by (location type + outlet type).
    size_by_loc_and_type = dict(
        zip(outlet_size.Outlet_Location_Type + outlet_size.Outlet_Type,
            outlet_size.Outlet_Size)
    )
    loc_and_type = df.Outlet_Location_Type + df.Outlet_Type
    df['Outlet_Size'] = df.Outlet_Size.fillna(loc_and_type.map(size_by_loc_and_type))

    # Outlet Size, second pass: fall back to outlet type alone for anything
    # the combined key did not resolve.
    size_by_type = dict(zip(outlet_size.Outlet_Type, outlet_size.Outlet_Size))
    df['Outlet_Size'] = df.Outlet_Size.fillna(df.Outlet_Type.map(size_by_type))

    # Item Fat Content: the labels are matched case-insensitively.
    fat_codes = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}
    df['Item_Fat_Content'] = df.Item_Fat_Content.str.lower().map(fat_codes)

    # Item Type
    item_type_codes = {
        'Fruits and Vegetables': 0,
        'Snack Foods': 1,
        'Household': 2,
        'Frozen Foods': 3,
        'Dairy': 4,
        'Canned': 5,
        'Baking Goods': 6,
        'Health and Hygiene': 7,
        'Soft Drinks': 8,
        'Meat': 9,
        'Breads': 10,
        'Hard Drinks': 11,
        'Others': 12,
        'Starchy Foods': 13,
        'Breakfast': 14,
        'Seafood': 15,
    }
    df['Item_Type'] = df.Item_Type.map(item_type_codes)

    # Outlet_Establishment_Year -> Years_Open (the column keeps its position).
    df['Outlet_Establishment_Year'] = reference_year - df.Outlet_Establishment_Year
    df = df.rename(columns={'Outlet_Establishment_Year': 'Years_Open'})

    # Outlet Size: encoded only after imputation, because the lookups above
    # work on the string labels.
    size_codes = {'Small': 0, 'Medium': 1, 'High': 2}
    df['Outlet_Size'] = df.Outlet_Size.map(size_codes)

    # Outlet Type
    outlet_type_codes = {
        'Grocery Store': 0,
        'Supermarket Type1': 1,
        'Supermarket Type2': 2,
        'Supermarket Type3': 3,
    }
    df['Outlet_Type'] = df.Outlet_Type.map(outlet_type_codes)

    # Outlet Location Type
    location_codes = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
    df['Outlet_Location_Type'] = df.Outlet_Location_Type.map(location_codes)

    # X: drop the identifier columns and, when present, the target.
    non_features = ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']
    X = df.loc[:, ~df.columns.isin(non_features)]

    return X

In [4]:
# Read the rows to be scored; the path is relative to the notebook's working directory.
test = pd.read_csv('data/test.csv')

In [5]:
# Lookup table whose Item_Identifier / Item_Weight columns process_X uses to fill missing item weights.
id_weight = pd.read_csv('id_weight.csv')

In [6]:
# Lookup table (Outlet_Location_Type, Outlet_Type, Outlet_Size) that process_X uses to fill missing outlet sizes.
outlet_size = pd.read_csv('outlet_size.csv')

In [7]:
# Encode the raw rows into the numeric feature frame returned by process_X.
X_test = process_X(test, id_weight, outlet_size)

In [8]:
# Wrap the features in an XGBoost DMatrix, the object handed to reg.predict below.
dtest = xgb.DMatrix(X_test)

In [9]:
# Load the model from the pickle file.
# SECURITY: pickle.load executes arbitrary code embedded in the file, so only
# load a sales_pred_model.pkl that you produced yourself or otherwise trust.
# NOTE(review): reg is presumably an xgboost Booster, since it is later given a
# DMatrix - confirm against the training notebook.
with open('sales_pred_model.pkl', 'rb') as f:
    reg = pickle.load(f)

In [10]:
# Score the prepared rows with the loaded model.
y_pred = reg.predict(dtest)

In [11]:
# Take an explicit copy of the two identifier columns: a column is added to this
# frame afterwards, and assigning into a plain selection of `test` triggers
# pandas' SettingWithCopyWarning.
submission = test[['Item_Identifier', 'Outlet_Identifier']].copy()

In [12]:
# Attach the raw model predictions as the Item_Outlet_Sales column.
submission['Item_Outlet_Sales'] = y_pred

In [13]:
# Sanity check on the prediction range. Use the array's own vectorised min():
# the builtin min() walks the array element by element in Python and gives an
# order-dependent answer if a NaN is present, whereas ndarray.min() propagates NaN.
y_pred.min()

np.float32(-241.32645)

In [14]:
# The minimum prediction is negative, so floor the column at zero before export.
submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].clip(lower=0)

In [15]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index = False)