In [1]:
import numpy as np, pandas as pd
#import matplotlib.pyplot as plt
import xgboost as xgb
import pickle

In [2]:
def process_X(df, id_weight, reference_year=2025):
    """Convert raw sales rows into the numeric feature matrix used by the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing at least the columns Item_Identifier, Item_Weight,
        Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP,
        Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size,
        Outlet_Location_Type and Outlet_Type. The frame is NOT modified.
    id_weight : pandas.DataFrame
        Lookup with the columns Item_Identifier and Item_Weight. The weight
        found here replaces the Item_Weight column of ``df`` (left join on
        Item_Identifier; identifiers missing from the lookup yield NaN).
    reference_year : int, default 2025
        Year from which Outlet_Establishment_Year is subtracted to obtain
        Years_Open. NOTE(review): this presumably has to equal the value used
        when the model was trained -- confirm before changing it.

    Returns
    -------
    pandas.DataFrame
        Feature columns, in this order: Item_Weight, Item_Fat_Content,
        Item_Visibility, Item_Type, Item_MRP, Years_Open, Outlet_Size,
        Outlet_Location_Type, Outlet_Type. Category values that are absent
        from the mapping tables below become NaN.

    Raises
    ------
    KeyError
        If a required column is missing from ``df`` or ``id_weight``.
    """
    ordered_cols = ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content',
                    'Item_Visibility', 'Item_Type', 'Item_MRP',
                    'Outlet_Identifier', 'Outlet_Establishment_Year',
                    'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

    # Item Weight: discard the raw column and take the value from the lookup.
    # drop() without inplace returns a new frame, so the caller's DataFrame is
    # left untouched and the function can be called again on the same input.
    df = df.drop(columns='Item_Weight')
    # Merge only the key and the weight so that any other column present in
    # id_weight cannot collide with a column of df and receive an _x/_y suffix.
    df = pd.merge(df, id_weight[['Item_Identifier', 'Item_Weight']],
                  'left', on='Item_Identifier')
    # .copy() gives an independent frame; the column assignments below would
    # otherwise write to a slice and can emit SettingWithCopyWarning.
    df = df[ordered_cols].copy()

    # Impute Outlet Size
    # TODO: not implemented -- missing Outlet_Size values remain NaN.

    # Item Fat Content: labels are lower-cased before lookup, so spelling
    # variants that differ only by case share one code.
    map_ = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}
    df['Item_Fat_Content'] = df['Item_Fat_Content'].str.lower().map(map_)

    # Item Type
    map_it = {'Fruits and Vegetables':    0,
              'Snack Foods':              1,
              'Household':                2,
              'Frozen Foods':             3,
              'Dairy':                    4,
              'Canned':                   5,
              'Baking Goods':             6,
              'Health and Hygiene':       7,
              'Soft Drinks':              8,
              'Meat':                     9,
              'Breads':                   10,
              'Hard Drinks':              11,
              'Others':                   12,
              'Starchy Foods':            13,
              'Breakfast':                14,
              'Seafood':                  15}
    df['Item_Type'] = df['Item_Type'].map(map_it)

    # Outlet_Establishment_Year -> Years_Open (column position is unchanged)
    df['Outlet_Establishment_Year'] = reference_year - df['Outlet_Establishment_Year']
    df = df.rename(columns={'Outlet_Establishment_Year': 'Years_Open'})

    # Outlet Size
    map_os = {'Small': 0, 'Medium': 1, 'High': 2}
    df['Outlet_Size'] = df['Outlet_Size'].map(map_os)

    # Outlet Type
    map_ot = {'Grocery Store': 0, 'Supermarket Type1': 1,
              'Supermarket Type2': 2, 'Supermarket Type3': 3}
    df['Outlet_Type'] = df['Outlet_Type'].map(map_ot)

    # Outlet Location Type
    map_olt = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
    df['Outlet_Location_Type'] = df['Outlet_Location_Type'].map(map_olt)

    # X: everything except the two identifier columns. Item_Outlet_Sales never
    # reaches this point because it is not part of ordered_cols.
    X = df.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

    return X

In [3]:
# Read the raw rows to score from 'data/test.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv('data/test.csv')

In [4]:
# Lookup table that process_X left-joins on Item_Identifier to supply
# Item_Weight.
id_weight = pd.read_csv('id_weight.csv')

In [5]:
# Build the numeric feature matrix from the raw rows and the weight lookup.
X = process_X(df, id_weight)

In [6]:
# Wrap the feature frame in an xgboost DMatrix, the object handed to
# reg.predict.
dtest = xgb.DMatrix(X)

In [7]:
# Load the model from the pickle file.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# 'sales_pred_model.pkl' must come from a trusted source.
# NOTE(review): reg is presumably an xgboost Booster, since it is later given
# a DMatrix -- confirm. A pickled model may also fail to load under a
# different xgboost version than the one that wrote it -- verify.
with open('sales_pred_model.pkl', 'rb') as f:
    reg = pickle.load(f)

In [8]:
# Score the rows wrapped in dtest with the loaded model.
y_pred = reg.predict(dtest)

In [9]:
# Keep only the identifier columns for the submission file. .copy() makes this
# an independent frame, so adding the prediction column afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[['Item_Identifier', 'Outlet_Identifier']].copy()

In [10]:
# Attach the predictions as a new column.
# assumes y_pred has exactly one value per row of df, in the same row order
# -- TODO confirm (a duplicated Item_Identifier in id_weight would multiply
# rows in process_X's left merge and break this).
df['Item_Outlet_Sales'] = y_pred

In [11]:
# Sanity check: smallest predicted value. The recorded output is positive, so
# for this run the negative-value clamp that follows changes nothing.
min(y_pred)

np.float32(52.42449)

In [12]:
# Floor predictions at 0: any negative value is replaced by 0, everything else
# is kept as predicted (presumably because sales cannot be negative).
df['Item_Outlet_Sales'] = np.where(df.Item_Outlet_Sales < 0, 0, df.Item_Outlet_Sales)

In [13]:
# Write the result to 'submissions.csv'; index=False keeps the DataFrame row
# index out of the file.
df.to_csv('submissions.csv', index = False)