# Food sales predictions (ML preprocessing)

### **Author:** Gurtej Bains 

## Import Libraries

In [37]:
# imports
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config #required for the diagram to be printed 
set_config(display='diagram')  #required for the diagram to be printed 

## Import Data

In [38]:
# Location of the raw sales CSV.
# Set the SALES_DATA_PATH environment variable to run this notebook on another
# machine; when it is unset, the author's original local path is used as before.
DATA_PATH = os.environ.get(
    "SALES_DATA_PATH",
    r"C:\Users\gurte\OneDrive\All About Learning\Coding Dojo\Learning Material\02 Weekly Assignments\wk5-12-12 (ML)\sales_predictions_for_ml.csv",
)

# Load the raw data; every later cell works from this frame.
df = pd.read_csv(DATA_PATH)
print("Import successful")

Import successful


In [39]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [40]:
# Column dtypes and non-null counts; columns whose count is below the row total contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


## Q1: Drop duplicates 

In [41]:
# Total number of missing cells: count NaNs per column first, then add the columns up
missing_per_column = df.isna().sum()
missing_per_column.sum()

3873

In [42]:
# Count rows that are exact copies of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [43]:
# Drop duplicate rows.
# Rebind the name instead of using inplace=True: the result is the same frame
# content, but the cell no longer mutates an object that earlier cells displayed.
df = df.drop_duplicates()

In [44]:
# Confirm no duplicate rows remain (the count should be 0)
remaining_duplicates = df.duplicated().sum()
remaining_duplicates

0

## Q1: Fix inconsistencies in categorical data

In [45]:
# Summary statistics (count / unique / top / freq) for the text-like columns
categorical_dtypes = ['object', 'category']
df.describe(include=categorical_dtypes)

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [46]:
# Frequency of each item identifier, ordered by label (first five shown)
(
    df['Item_Identifier']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

DRA12    6
DRA24    7
DRA59    8
DRB01    3
DRB13    5
Name: Item_Identifier, dtype: int64

In [47]:
# Frequency of each fat-content label, ordered by label, to spot
# alternative spellings of the same category
(
    df['Item_Fat_Content']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

LF          316
Low Fat    5089
Regular    2889
low fat     112
reg         117
Name: Item_Fat_Content, dtype: int64

In [48]:
# Collapse the alternative spellings onto the two canonical labels
fat_content_fixes = {
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)

# Re-count to verify that only the canonical labels remain
df['Item_Fat_Content'].value_counts(dropna=False, ascending=True)

Regular    3006
Low Fat    5517
Name: Item_Fat_Content, dtype: int64

In [49]:
# Frequency of each item type, ordered by label.
# NOTE(review): .head() limits the display to five labels, while describe()
# reports 16 unique item types; drop .head() to inspect every label.
(
    df['Item_Type']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

Baking Goods    648
Breads          251
Breakfast       110
Canned          649
Dairy           682
Name: Item_Type, dtype: int64

In [50]:
# Frequency of each outlet identifier, ordered by label.
# NOTE(review): .head() limits the display to five labels, while describe()
# reports 10 unique outlets; drop .head() to inspect every label.
(
    df['Outlet_Identifier']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

OUT010    555
OUT013    932
OUT017    926
OUT018    928
OUT019    528
Name: Outlet_Identifier, dtype: int64

In [51]:
# Frequency of each outlet size; dropna=False keeps the missing values as
# their own row and na_position='first' lists that row at the top
(
    df['Outlet_Size']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

NaN       2410
High       932
Medium    2793
Small     2388
Name: Outlet_Size, dtype: int64

In [52]:
# Frequency of each outlet location tier, ordered by label
(
    df['Outlet_Location_Type']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

Tier 1    2388
Tier 2    2785
Tier 3    3350
Name: Outlet_Location_Type, dtype: int64

In [53]:
# Frequency of each outlet type, ordered by label
(
    df['Outlet_Type']
    .value_counts(dropna=False)
    .sort_index(ascending=True, na_position='first')
    .head()
)

Grocery Store        1083
Supermarket Type1    5577
Supermarket Type2     928
Supermarket Type3     935
Name: Outlet_Type, dtype: int64

## Q2: Identify the features (X) and target (y)

In [54]:
# Separate the target column from the feature columns.
# NOTE(review): Item_Identifier (1559 unique values per describe()) stays in X
# and is one-hot encoded later, which presumably accounts for most of the
# processed columns -- confirm this is intended.
target_col = 'Item_Outlet_Sales'
y = df[target_col]
X = df.drop(columns=[target_col])

## Q3: Perform train and test split

In [55]:
# Hold out a test set; no split size is passed, and random_state is fixed
# so the same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [56]:
# Preview the first five training rows (the target column is no longer present)
X_train.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,NCG06,16.35,Low Fat,0.029565,Household,256.4646,OUT018,2009,Medium,Tier 3,Supermarket Type2
7510,FDV57,15.25,Regular,0.0,Snack Foods,179.766,OUT018,2009,Medium,Tier 3,Supermarket Type2
5828,FDM27,12.35,Regular,0.158716,Meat,157.2946,OUT049,1999,Medium,Tier 1,Supermarket Type1
5327,FDG24,7.975,Low Fat,0.014628,Baking Goods,82.325,OUT035,2004,Small,Tier 2,Supermarket Type1
4810,FDD05,19.35,Low Fat,0.016645,Frozen Foods,120.9098,OUT045,2002,,Tier 2,Supermarket Type1


In [57]:
# Dtypes and non-null counts of the training features; shows which columns still need imputing
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            6392 non-null   object 
 1   Item_Weight                5285 non-null   float64
 2   Item_Fat_Content           6392 non-null   object 
 3   Item_Visibility            6392 non-null   float64
 4   Item_Type                  6392 non-null   object 
 5   Item_MRP                   6392 non-null   float64
 6   Outlet_Identifier          6392 non-null   object 
 7   Outlet_Establishment_Year  6392 non-null   int64  
 8   Outlet_Size                4580 non-null   object 
 9   Outlet_Location_Type       6392 non-null   object 
 10  Outlet_Type                6392 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 599.2+ KB


## Q4: Create a preprocessing object

In [58]:
# Column selectors: choose columns by dtype rather than by hard-coded name
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [59]:
# Imputers for missing values:
#   mean_imputer -> fills gaps with the column mean (used for numeric columns)
#   freq_imputer -> fills gaps with the most frequent value (used for categorical columns)
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

In [60]:
# One-hot encoder for categorical variables.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising. The dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed, so
# try the new keyword first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [61]:
# Scaler for numerical variables; used as the second step of the numeric pipeline, after imputation
scaler = StandardScaler()

In [62]:
# Numeric branch: impute missing values with the mean, then scale
numeric_steps = (mean_imputer, scaler)
numeric_pipe = make_pipeline(*numeric_steps)
numeric_pipe

In [63]:
# Categorical branch: impute missing values with the most frequent label, then one-hot encode
categorical_steps = (freq_imputer, ohe)
categorical_pipe = make_pipeline(*categorical_steps)
categorical_pipe

In [64]:
# Pair each pipeline with the selector that decides which columns it receives
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)

# Combine both branches into a single preprocessing object and display it
preprocessor = make_column_transformer(
    number_tuple,
    category_tuple,
)
preprocessor

In [65]:
# Fit on the training split only, so imputation values, scaling statistics and
# one-hot categories are learned without looking at the test rows

preprocessor.fit(X_train)

In [66]:
# Apply the already-fitted preprocessor to both splits (train first, then test)
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [67]:
# Sanity checks on the processed arrays: no NaNs left, dtype, and final shape

train_nan_count = np.isnan(X_train_processed).sum()
test_nan_count = np.isnan(X_test_processed).sum()

print(train_nan_count, 'missing values in training data')
print(test_nan_count, 'missing values in testing data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')

# Show the first ten processed rows: scaled numeric columns first, then the 0/1 indicator columns
X_train_processed[:10]

0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 1592)




array([[ 8.17248678e-01, -7.12775072e-01,  1.82810922e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 5.56339503e-01, -1.29105225e+00,  6.03368881e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.31511957e-01,  1.81331864e+00,  2.44540557e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 4.21334361e-16, -4.79739240e-02,  1.94299739e+00, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.61183571e+00,  1.85156735e+00,  1.79995405e+00, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.11940897e+00, -5.33681676e-01, -4.16764231e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00]])