# Benchmark Regression 

## Importing and loading data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings that would
# otherwise flag problems in later cells.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): accuracy_score and plt are not used in the cells visible in
# this section; they may be needed further down the notebook.
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

# Record the library versions the outputs below were produced with
print(pd.__version__)
print(np.__version__)

2.1.3
1.26.1


In [2]:
# Read the benchmark training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='datasets/train_bm.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Report the dimensions of the frame and the names of its columns
for label, value in (('Shape:', data.shape), ('Columns:', data.columns)):
    print(label, value)

Shape: (8523, 12)
Columns: Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


In [4]:
# Count the missing values in each column
missing_per_column = data.isna().sum()
missing_per_column

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

## Creating training and testing data

In [7]:
from sklearn.utils import shuffle

# Shuffling the Dataset (fixed random_state so a fresh run gives the same order)
# NOTE: `data` is overwritten, so re-running this cell without reloading the
# CSV shuffles the already-shuffled frame and yields a different split.
data = shuffle(data, random_state = 42)

# Creating 4 divisions
div = data.shape[0] // 4

# 3 parts (plus one row) to training set and the remainder to testing set
split_at = 3*div + 1

# .copy() makes each split an independent DataFrame, so that the prediction
# columns added to `test` later are written to a real frame rather than to a
# slice of `data`.
train = data.iloc[:split_at, :].copy()
test = data.iloc[split_at:].copy()

# Sanity check: the two splits together cover every row exactly once
assert len(train) + len(test) == len(data)

print(train.shape, test.shape)

(6391, 12) (2132, 12)


In [8]:
# Preview the first five rows of the training split
train.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
876,FDQ08,,Regular,0.018839,Fruits and Vegetables,62.9536,OUT027,1985,Medium,Tier 3,Supermarket Type3,2266.3832
2619,NCK18,9.6,Low Fat,0.006709,Household,164.6184,OUT049,1999,Medium,Tier 1,Supermarket Type1,2972.1312
1371,FDD52,18.25,Regular,0.184042,Dairy,110.157,OUT018,2009,Medium,Tier 3,Supermarket Type2,1867.569
509,DRN36,,Low Fat,0.087855,Soft Drinks,95.9752,OUT019,1985,Small,Tier 1,Grocery Store,95.8752
7637,DRN35,8.01,Low Fat,0.070248,Hard Drinks,37.5532,OUT046,1997,Small,Tier 1,Supermarket Type1,1366.2216


In [9]:
# Preview the first five rows of the testing split
test.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
2215,FDT45,15.85,Low Fat,0.057303,Snack Foods,53.3956,OUT035,2004,Small,Tier 2,Supermarket Type1,600.5516
7257,FDI04,13.65,Regular,0.072912,Frozen Foods,198.4426,OUT046,1997,Small,Tier 1,Supermarket Type1,5536.7928
1765,FDI57,,Low Fat,0.053764,Seafood,195.7768,OUT027,1985,Medium,Tier 3,Supermarket Type3,3547.3824
7679,FDK02,12.5,Low Fat,0.112203,Canned,121.144,OUT035,2004,Small,Tier 2,Supermarket Type1,1438.128
4411,FDK50,7.96,Low Fat,0.02834,Canned,162.4894,OUT013,1987,High,Tier 3,Supermarket Type1,1779.6834


## Evaluating the mean

### Mean of Item_Outlet_Sales as 'simple_mean'

In [10]:
# Baseline prediction: the average sales figure observed in the training set
train_sales_mean = train['Item_Outlet_Sales'].mean()

# Broadcast that single value to every row of the test set as "simple_mean"
test['simple_mean'] = train_sales_mean

# Check the data
test['simple_mean'].head(n=5)

2215    2197.246782
7257    2197.246782
1765    2197.246782
7679    2197.246782
4411    2197.246782
Name: simple_mean, dtype: float64

In [12]:
from sklearn.metrics import mean_absolute_error

# Calculating mean absolute error of the constant "simple_mean" prediction
mean_error = mean_absolute_error(test['Item_Outlet_Sales'] , test['simple_mean'])

# Builtin round() works whether the metric comes back as a NumPy scalar or a
# plain Python float (which has no .round() method).
print('Error:', round(mean_error, 3))

Error: 1318.376


### Mean of Item_Outlet_Sales with respect to Outlet_Type

In [13]:
# Create a pivot table for the sales depending on the outlet type.
# The aggregation is passed by name ('mean'): handing pandas the NumPy
# callable np.mean is deprecated and emits a FutureWarning in pandas 2.1.
table = pd.pivot_table(train, values ='Item_Outlet_Sales', index = ['Outlet_Type'], aggfunc = 'mean')
table

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Type,Unnamed: 1_level_1
Grocery Store,344.991708
Supermarket Type1,2328.89575
Supermarket Type2,2043.199376
Supermarket Type3,3735.783769


In [15]:
# Mean sales per Outlet_Type, learned from the training set only
outlet_type_means = train.groupby('Outlet_Type')['Item_Outlet_Sales'].mean()

# Look up each test row's Outlet_Type in those means. This is a single direct
# column assignment, replacing chained indexing (test[col][mask] = ...), which
# has no effect once pandas Copy-on-Write is enabled.
# Outlet types that never occur in the training set fall back to 0.
test['Outlet_type_mean'] = test['Outlet_Type'].map(outlet_type_means).fillna(0)

# Calculating mean absolute error
outlet_type_error = mean_absolute_error(test['Item_Outlet_Sales'] , test['Outlet_type_mean'])
# Builtin round() works for both NumPy scalars and plain Python floats
print('Error:', round(outlet_type_error, 3))

Error: 1084.802


### Mean of Item_Outlet_Sales with respect to Outlet_Establishment_Year

In [16]:
# Create a pivot table for the sales with respect to the Outlet_Establishment_Year.
# The aggregation is passed by name ('mean'): handing pandas the NumPy
# callable np.mean is deprecated and emits a FutureWarning in pandas 2.1.
table = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Establishment_Year'], aggfunc = 'mean')
table

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Establishment_Year,Unnamed: 1_level_1
1985,2496.020589
1987,2271.275425
1997,2315.893618
1998,351.671419
1999,2377.39229
2002,2224.604018
2004,2421.808702
2007,2364.067474
2009,2043.199376


In [18]:
# Mean sales per Outlet_Establishment_Year, learned from the training set only
year_means = train.groupby('Outlet_Establishment_Year')['Item_Outlet_Sales'].mean()

# Look up each test row's establishment year in those means. This is a single
# direct column assignment, replacing chained indexing (test[col][mask] = ...),
# which has no effect once pandas Copy-on-Write is enabled.
# Years that never occur in the training set fall back to 0.
test['Out_year_mean'] = test['Outlet_Establishment_Year'].map(year_means).fillna(0)

# Calculating mean absolute error
out_year_error = mean_absolute_error(test['Item_Outlet_Sales'] , test['Out_year_mean'])
# Builtin round() works for both NumPy scalars and plain Python floats
print('Error:', round(out_year_error, 3))

Error: 1217.895


### Mean of Item_Outlet_Sales with respect to Outlet_Location_Type

In [19]:
# Create a pivot table for the sales with respect to the Outlet_Location_Type.
# The aggregation is passed by name ('mean'): handing pandas the NumPy
# callable np.mean is deprecated and emits a FutureWarning in pandas 2.1.
table = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type'], aggfunc = 'mean')
table

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Location_Type,Unnamed: 1_level_1
Tier 1,1894.774463
Tier 2,2335.722235
Tier 3,2295.822861


In [21]:
# Mean sales per Outlet_Location_Type, learned from the training set only
location_means = train.groupby('Outlet_Location_Type')['Item_Outlet_Sales'].mean()

# Look up each test row's location type in those means. This is a single
# direct column assignment, replacing chained indexing (test[col][mask] = ...),
# which has no effect once pandas Copy-on-Write is enabled.
# Location types that never occur in the training set fall back to 0.
test['out_loc_mean'] = test['Outlet_Location_Type'].map(location_means).fillna(0)

# Calculating mean absolute error
out_loc_error = mean_absolute_error(test['Item_Outlet_Sales'] , test['out_loc_mean'])
# Builtin round() works for both NumPy scalars and plain Python floats
print('Error:', round(out_loc_error, 3))

Error: 1308.499


### Mean of Item_Outlet_Sales with respect to Outlet_Location_Type and Outlet_Establishment_Year

In [24]:
# Create a pivot table for the sales with respect to the Outlet_Location_Type and Outlet_Establishment_Year.
# The aggregation is passed by name ('mean'): handing pandas the NumPy
# callable np.mean is deprecated and emits a FutureWarning in pandas 2.1.
table = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type','Outlet_Establishment_Year'], aggfunc = 'mean')
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Outlet_Sales
Outlet_Location_Type,Outlet_Establishment_Year,Unnamed: 2_level_1
Tier 1,1985,338.028818
Tier 1,1997,2315.893618
Tier 1,1999,2377.39229
Tier 2,2002,2224.604018
Tier 2,2004,2421.808702
Tier 2,2007,2364.067474
Tier 3,1985,3735.783769
Tier 3,1987,2271.275425
Tier 3,1998,351.671419
Tier 3,2009,2043.199376


In [26]:
# Assigning variables to strings ( to shorten code length)
s2 = 'Outlet_Location_Type'
s1 = 'Outlet_Establishment_Year'

# Mean sales for every (establishment year, location type) pair seen in the
# training set; the Series is named after the column it will populate.
combo_means = train.groupby([s1, s2])['Item_Outlet_Sales'].mean().rename('Super_mean')

# Left-join those means onto the test rows by both keys at once. Only the two
# key columns are joined, so re-running the cell never collides with an
# existing 'Super_mean' column, and the result keeps the test index so the
# assignment aligns row by row. This replaces the nested loop with chained
# indexing (test[col][mask] = ...), which has no effect once pandas
# Copy-on-Write is enabled.
# Pairs that never occur in the training set fall back to 0.
test['Super_mean'] = test[[s1, s2]].join(combo_means, on=[s1, s2])['Super_mean'].fillna(0)

# Calculating mean absolute error
combined_error = mean_absolute_error(test['Item_Outlet_Sales'] , test['Super_mean'] )
# Builtin round() works for both NumPy scalars and plain Python floats
print('Error:', round(combined_error, 3))

Error: 1083.556
