### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import joblib

import warnings
# Suppress every warning for the rest of the session: no category, message or
# module filter is passed, so this also hides library deprecation notices.
warnings.filterwarnings('ignore')

### Read CSV file

In [2]:
# Load the raw test split; the path is relative to the notebook's working directory.
TEST_CSV_PATH = 'test.csv'
test_data = pd.read_csv(TEST_CSV_PATH)

### Display Data

In [3]:
# Preview the first five rows to sanity-check the columns before cleaning.
test_data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


### Replicate handling of missing values and label encoding as done for train data

In [4]:
def _fill_missing_with_group_mean(weights):
    """Return ``weights`` with missing entries replaced by the mean of the same group."""
    return weights.fillna(weights.mean())


# Impute missing weights with the average weight of the item's Item_Type group.
# The averages are computed from the test rows themselves; a group whose weights
# are all missing has a NaN mean and therefore stays unfilled.
weights_by_type = test_data.groupby('Item_Type')['Item_Weight']
test_data['Item_Weight'] = weights_by_type.transform(_fill_missing_with_group_mean)

In [5]:
def _fill_with_group_mode(sizes):
    """Fill missing sizes with the group's first mode, or 'Unknown' if it has none."""
    modes = sizes.mode()
    # An empty mode means the group has no known size at all.
    fallback = 'Unknown' if modes.empty else modes.iloc[0]
    return sizes.fillna(fallback)


# Outlets are grouped by establishment year, location tier and outlet type, and
# missing sizes are borrowed from the other rows in the same group.
outlet_profile = ['Outlet_Establishment_Year', 'Outlet_Location_Type', 'Outlet_Type']
test_data['Outlet_Size'] = (
    test_data
    .groupby(outlet_profile)['Outlet_Size']
    .transform(_fill_with_group_mode)
)

In [6]:
def _zeros_to_item_mean(visibility):
    """Swap exact zeros for the mean visibility of the same item."""
    # The mean is taken over all of the item's rows, zeros included, so an item
    # whose rows are all zero keeps a visibility of zero.
    return visibility.replace(0, visibility.mean())


# Replace zero visibility with the item's mean visibility across its rows.
visibility_by_item = test_data.groupby('Item_Identifier')['Item_Visibility']
test_data['Item_Visibility'] = visibility_by_item.transform(_zeros_to_item_mean)

In [7]:
# Hand-written ordinal codes, keyed by the column they apply to. The fat-content
# map also folds the spelling variants ('low fat', 'LF', 'reg') into two classes.
ordinal_codes = {
    'Item_Fat_Content': {'Low Fat': 1, 'low fat': 1, 'LF': 1, 'Regular': 2, 'reg': 2},
    'Outlet_Size': {'Unknown': 0, 'Small': 1, 'Medium': 2, 'High': 3},
    'Outlet_Location_Type': {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3},
    'Outlet_Type': {
        'Grocery Store': 0,
        'Supermarket Type1': 1,
        'Supermarket Type2': 2,
        'Supermarket Type3': 3,
    },
}

# replace() leaves any value that is not listed in the map unchanged.
for column_name, codes in ordinal_codes.items():
    test_data[column_name] = test_data[column_name].replace(codes)

In [8]:
# Pin every hand-mapped ordinal column to an integer dtype explicitly.
# Outlet_Type is included with the other three: it is mapped to integers by the
# same replace() step, and without an explicit cast its dtype depends on pandas
# implicitly downcasting the object column, which newer pandas versions deprecate.
ordinal_columns = ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
test_data = test_data.astype({column_name: int for column_name in ordinal_columns})

In [9]:
# One encoder per high-cardinality categorical column; the three names are kept
# at module level because the encoders are written to disk further down.
label_encoder_item, label_encoder_outlet, label_encoder_type = (
    LabelEncoder(),
    LabelEncoder(),
    LabelEncoder(),
)

encoders_by_column = {
    'Item_Identifier': label_encoder_item,
    'Outlet_Identifier': label_encoder_outlet,
    'Item_Type': label_encoder_type,
}

# NOTE(review): each encoder is fit on the test rows here, so the integer codes
# only match the training codes if both splits contain exactly the same set of
# values in that column — confirm, or reuse the encoders fitted on train.
for column_name, encoder in encoders_by_column.items():
    # Fit on the column's current values and overwrite it with the integer codes.
    test_data[column_name] = encoder.fit_transform(test_data[column_name])

### Save the test_data file

In [10]:
# Persist the cleaned, encoded frame; index=False keeps the row index out of the file.
ENCODED_TEST_CSV_PATH = 'test_data.csv'
test_data.to_csv(ENCODED_TEST_CSV_PATH, index=False)

### Save the label encoder instances to files

In [11]:
# Serialize each fitted encoder to its own file in the working directory.
# The last call is left as the cell's final expression so its return value
# (the list of written file names) is still displayed.
joblib.dump(value=label_encoder_item, filename='label_encoder_item.joblib')
joblib.dump(value=label_encoder_outlet, filename='label_encoder_outlet.joblib')
joblib.dump(value=label_encoder_type, filename='label_encoder_type.joblib')

['label_encoder_type.joblib']