<a href="https://colab.research.google.com/github/cipalisoc/project1/blob/main/Project_1_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The goal of this step is to help the retailer by using machine learning to make predictions about future sales based on the data provided.

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show scikit-learn estimators as HTML diagrams when they are displayed in a cell
set_config(display='diagram')


In [6]:
# Load the sales dataset and preview it, using the 'Item_Identifier' column as the index.
# NOTE(review): this assumes each row has a unique item ID; the same ID could recur if an
# item is stocked by several outlets -- confirm with df.index.is_unique.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive; no
# mount step is visible in this section -- confirm before Restart & Run All.
filename = '/content/drive/MyDrive/Coding Dojo/Week 2: Pandas/sales_predictions.csv'
df = pd.read_csv(filename, index_col='Item_Identifier')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 8523 entries, FDA15 to DRG01
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                7060 non-null   float64
 1   Item_Fat_Content           8523 non-null   object 
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object 
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Identifier          8523 non-null   object 
 6   Outlet_Establishment_Year  8523 non-null   int64  
 7   Outlet_Size                6113 non-null   object 
 8   Outlet_Location_Type       8523 non-null   object 
 9   Outlet_Type                8523 non-null   object 
 10  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 799.0+ KB


Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


Missing values occur in the 'Item_Weight' and 'Outlet_Size' columns. Numeric columns are 'Item_Weight', 'Item_Visibility', 'Item_MRP', and 'Outlet_Establishment_Year'. Ordinal columns are 'Item_Fat_Content', 'Outlet_Size', and 'Outlet_Location_Type'. Nominal columns are 'Item_Type', 'Outlet_Type', and 'Outlet_Identifier'.

In [13]:
# Count fully duplicated rows (expected: 0).
# DataFrame.duplicated() compares column values only, so the 'Item_Identifier' index
# is not part of the comparison.
df.duplicated().sum()

0

In [15]:
# Count distinct values per column; low counts point to categorical features
df.nunique()

Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [20]:
# Inspect the raw Item_Fat_Content labels before ordinal encoding; inconsistent
# spellings of the same category need to be merged
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [21]:
# Collapse the inconsistent spellings into a binary code: 0 = low fat, 1 = regular.
# The result is assigned back to the column because calling replace(inplace=True) on
# df['col'] is chained assignment, which pandas warns about and which stops modifying
# df once copy-on-write is the default. The explicit int64 cast keeps the column
# numeric and fails loudly if an unmapped label is ever present.
df['Item_Fat_Content'] = (
    df['Item_Fat_Content']
    .replace({'Low Fat': 0, 'LF': 0, 'low fat': 0, 'Regular': 1, 'reg': 1})
    .astype('int64')
)

In [22]:
# Inspect the Outlet_Size labels before ordinal encoding
# (value_counts() leaves missing values out of the tally by default)
df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [23]:
# Ordinal code for outlet size: Small = 0 < Medium = 1 < High = 2.
# The result is assigned back to the column because calling replace(inplace=True) on
# df['col'] is chained assignment, which pandas warns about and which stops modifying
# df once copy-on-write is the default. The column is cast to float64 (not int) so
# the missing sizes stay NaN and can be imputed later; the cast also fails loudly
# if an unmapped label is ever present.
df['Outlet_Size'] = (
    df['Outlet_Size']
    .replace({'Small': 0, 'Medium': 1, 'High': 2})
    .astype('float64')
)

In [24]:
# Inspect the Outlet_Location_Type labels before ordinal encoding
df['Outlet_Location_Type'].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [25]:
# Ordinal code for the location tier: Tier 1 = 0, Tier 2 = 1, Tier 3 = 2.
# The result is assigned back to the column because calling replace(inplace=True) on
# df['col'] is chained assignment, which pandas warns about and which stops modifying
# df once copy-on-write is the default. The explicit int64 cast keeps the column
# numeric and fails loudly if an unmapped label is ever present.
df['Outlet_Location_Type'] = (
    df['Outlet_Location_Type']
    .replace({'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2})
    .astype('int64')
)

In [28]:
# Validation split: pull out the target, keep the remaining columns as features,
# then hold out a test set with a fixed seed for reproducibility
y = df['Item_Outlet_Sales']
X = df.drop(columns='Item_Outlet_Sales')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# dtype-based column selectors: one for numeric columns, one for object (string) columns
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [31]:
# Instantiate Transformers

# Imputers: 'most_frequent' strategy for categorical values and 'median' for
# numerical values (covers both the int and float columns)
freq_imputer = SimpleImputer(strategy='most_frequent')
median_imputer = SimpleImputer(strategy='median')
# Scaler
scaler = StandardScaler()
# One Hot Encoder: dense output, and categories not seen during fit are ignored
# instead of raising.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name,
# so try the new keyword first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [32]:
# Instantiate Pipelines

# Numeric pipeline: impute missing values first, then scale
numeric_pipe = make_pipeline(median_imputer, scaler)
# Bare expression so the notebook displays the pipeline
numeric_pipe

In [33]:
# Categorical pipeline: impute missing values first, then one-hot encode
categorical_pipe = make_pipeline(freq_imputer, ohe)
# Bare expression so the notebook displays the pipeline
categorical_pipe

In [34]:
# Instantiate ColumnTransformer

# (transformer, column selector) pairs for the ColumnTransformer
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)
# ColumnTransformer: the numeric pipeline's output columns come first, followed by the
# categorical pipeline's one-hot columns. Columns matched by neither selector are
# dropped (make_column_transformer's default remainder).
preprocessor = make_column_transformer(number_tuple, category_tuple)
# Bare expression so the notebook displays the transformer
preprocessor

In [35]:
# fit on training data only, so the imputation values, scaling statistics and one-hot
# categories are learned without looking at the test split
preprocessor.fit(X_train)

In [36]:
# Run both splits through the fitted preprocessor (train first, then test)
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [37]:
# Inspect result

# Count the NaNs left in each processed array
print(f'missing values in traning data: {np.isnan(X_train_processed).sum()}')
print(f'missing values in testing data: {np.isnan(X_test_processed).sum()}')

missing values in traning data: 0
missing values in testing data: 0


In [38]:
# Report the element dtype of each processed array
print(f'data type in X_train_processed: {X_train_processed.dtype}')
print(f'data type in X_test_processed: {X_test_processed.dtype}')

data type in X_train_processed: float64
data type in X_test_processed: float64


In [39]:
# (rows, columns) of the processed training array.
# NOTE(review): the column count is presumably the numeric features plus one one-hot
# column per category seen in the training split -- confirm against the selectors.
X_train_processed.shape

(6392, 37)

In [40]:
# Display the processed training array (NumPy abbreviates large arrays in the repr)
X_train_processed

array([[ 0.82748547, -0.7403206 , -0.71277507, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.56664432,  1.35076614, -1.29105225, ...,  0.        ,
         1.        ,  0.        ],
       [-0.12102782,  1.35076614,  1.81331864, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.12389588, -0.7403206 , -0.92052713, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.77599877, -0.7403206 , -0.2277552 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82748547, -0.7403206 , -0.95867683, ...,  1.        ,
         0.        ,  0.        ]])