<a href="https://colab.research.google.com/github/evukich/Food-Sales-Predictions/blob/main/Sales_Predictions_Proj_1_Final_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn import set_config
set_config(display='diagram')
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Location of the raw sales CSV. NOTE(review): this is an absolute path in the
# Colab session's storage, so the file must be uploaded there before running.
path = '/content/sales_predictions.csv'
# Read the dataset into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Item_Fat_Content uses inconsistent labels for the same two categories:
# 'LF' and 'low fat' mean 'Low Fat', and 'reg' means 'Regular'.
# Normalize them in that column only, so matching strings in any other column
# can never be rewritten, and assign the result instead of mutating the whole
# frame in place (keeps the cell idempotent on re-run).
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)
# Confirm that only the two canonical labels remain.
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [None]:
df.isnull().sum()
# Item_Weight and Outlet_Size contain missing values (see the counts below).
# They are not filled here; the preprocessing pipeline imputes them later
# (mean for numeric columns, most frequent value for categorical columns).

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Feature matrix: every column except the target and the per-item identifier.
X = df.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])

# Target vector: the sales figure the models are trained to predict.
y = df['Item_Outlet_Sales']

In [None]:
# Split before any preprocessing is fit so the test rows stay unseen.
# random_state fixes the shuffle for reproducibility; test_size is not passed,
# so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Inspect column dtypes and non-null counts to decide which columns are
# numeric vs. categorical and which ones need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
# Select columns by dtype: numeric columns vs. object (string) columns.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Instantiate transformers.
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()
# Request a dense one-hot matrix. The keyword for this was renamed from
# `sparse` to `sparse_output`, and the old name was later removed, so try the
# current name first and fall back for older scikit-learn installs.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One pipeline per column type: impute, then scale (numeric) or encode (categorical).
num_pipe = make_pipeline(mean_imputer, scaler)
cat_pipe = make_pipeline(freq_imputer, ohe_encoder)

# Pair each pipeline with the selector that picks its columns.
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

# Combine both branches into a single preprocessing step and display it.
column_transformer = make_column_transformer(num_tuple, cat_tuple)
column_transformer

In [None]:
#Building Linear Regression model

In [None]:
# Linear regression estimator with scikit-learn's default settings
# (no arguments are passed).
lin_reg = LinearRegression()

In [None]:
# Chain preprocessing and the regressor so they are fit and applied together.
pipe = make_pipeline(column_transformer, lin_reg)


In [None]:
# Fit the preprocessing steps and the model on the training split only.
pipe.fit(X_train, y_train)

In [None]:
# R2 of the linear regression pipeline on the training split.
train_r2 = r2_score(y_train, pipe.predict(X_train))
print(f'Train R2: {train_r2}')
# R2 of the same pipeline on the held-out test split.
test_r2 = r2_score(y_test, pipe.predict(X_test))
print(f'Test R2: {test_r2}')

# The linear regression scores fairly low, but the training and testing
# scores are comparable.

Train R2: 0.5615527181926836
Test R2: 0.567134117820512


In [None]:
from sklearn.dummy import DummyRegressor


def evaluate_model(y_true, y_pred):
  """Print MAE, MSE, RMSE and R2 for a set of predictions.

  Defined in this cell because this is the first place the helper is called.
  The notebook otherwise defines it only in a later cell, which makes this
  cell raise NameError on Restart & Run All.

  Args:
    y_true: observed target values.
    y_pred: predicted target values, aligned with y_true.

  Returns:
    None. The four scores are printed.
  """
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)
  print(f'scores: MAE: {mae}, \nMSE: {mse}, \nRMSE: {rmse},   \nR2: {r2}')


# Instantiate the baseline: a regressor using the 'mean' strategy, which
# ignores the features and serves as a reference point for the real models.
dummy = DummyRegressor(strategy='mean')

# Create the pipeline (same preprocessing as the other models).
dummy_pipe = make_pipeline(column_transformer, dummy)
# Fit the pipeline on the training split.
dummy_pipe.fit(X_train, y_train)

# Evaluate the baseline on both splits.
print('Training Scores')
evaluate_model(y_train, dummy_pipe.predict(X_train))

print('\n')

print('Testing Scores')
evaluate_model(y_test, dummy_pipe.predict(X_test))

Training Scores
scores: MAE: 1360.2184410159132, 
MSE: 2959455.7045265585, 
RMSE: 1720.306863477141,   
R2: 0.0


Testing Scores
scores: MAE: 1326.121044678208, 
MSE: 2772144.4627103633, 
RMSE: 1664.9758144520788,   
R2: -0.004772483978719766


In [None]:
# Linear regression predictions for the held-out test split.
y_pred_test = pipe.predict(X_test)

In [None]:
# Linear regression predictions for the training split.
y_pred_train = pipe.predict(X_train)

In [None]:
def evaluate_model(y_true, y_pred):
  """Print regression metrics for a set of predictions.

  Reports mean absolute error, mean squared error, root mean squared error
  and the R2 score, each on its own line.

  Args:
    y_true: observed target values.
    y_pred: predicted target values, aligned with y_true.

  Returns:
    None. The scores are printed, not returned.
  """
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # RMSE is the square root of the MSE computed above.
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)
  # The three spaces after the RMSE value are kept so the printed report
  # matches the existing output exactly.
  report = f'scores: MAE: {mae}, \nMSE: {mse}, \nRMSE: {rmse},   \nR2: {r2}'
  print(report)

In [None]:
# Linear regression metrics on the held-out test split.
evaluate_model(y_test, y_pred_test)

scores: MAE: 804.0386963866729, 
MSE: 1194267.1375995984, 
RMSE: 1092.825300585413,   
R2: 0.567134117820512


In [None]:
evaluate_model(y_train, y_pred_train)

# The linear regression's RMSE is about $1092.83 on the testing data and
# $1139.11 on the training data, so predicted outlet sales are typically off
# by roughly $1092-1139. The two values are close to each other, so the model
# behaves consistently on seen and unseen data, and both are well below the
# mean baseline's testing RMSE of about $1664.98.

scores: MAE: 847.0849474342929, 
MSE: 1297565.309278826, 
RMSE: 1139.107242220339,   
R2: 0.5615527181926836


In [None]:
#Building a regression tree model to predict sales.

#Build a simple regression tree model.
#Compare the performance of your model based on r^2.
#Compare the performance of your model based on rmse.  

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Regression tree with only random_state set (for reproducible results).
# No depth limit is passed; the scores further down (train R2 of 1.0 vs.
# test R2 of about 0.19) show that this tree overfits the training data.
dec_tree = DecisionTreeRegressor(random_state = 42)

In [None]:
# Same preprocessing as the linear model, followed by the regression tree.
pipe_dec_tree = make_pipeline(column_transformer, dec_tree)

In [None]:
# Fit the preprocessing steps and the tree on the training split only.
pipe_dec_tree.fit(X_train, y_train)

In [None]:
# Regression tree predictions for both splits, reused by the metric cells below.
train_preds = pipe_dec_tree.predict(X_train)
test_preds = pipe_dec_tree.predict(X_test)

In [None]:
# Score the fitted tree pipeline on each split (the printed values are its
# R2 scores), then print the training score followed by the testing score.
train_score = pipe_dec_tree.score(X_train, y_train)
test_score = pipe_dec_tree.score(X_test, y_test)
print(train_score, test_score, sep='\n')

1.0
0.19276592252469948


In [None]:
np.sqrt(mean_squared_error(y_test, test_preds))
# Based on the testing RMSE (about 1492.36 here vs. about 1092.83 for the
# linear regression), this model predicts less accurately than the linear
# regression model.

1492.3607089768584

In [None]:
# Regression tree metrics on the training split.
evaluate_model(y_train, train_preds)

scores: MAE: 0.0, 
MSE: 0.0, 
RMSE: 0.0,   
R2: 1.0


In [None]:
# Regression tree metrics on the held-out test split.
evaluate_model(y_test, test_preds)

scores: MAE: 1038.3921153449085, 
MSE: 2227140.4856979116, 
RMSE: 1492.3607089768584,   
R2: 0.19276592252469948


In [None]:
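# Of the two models tested, the linear regression model is the more reliable predictor: it has the higher testing R2 (about 0.567 vs. 0.193) and the lower testing RMSE (about 1092.83 vs. 1492.36), while the regression tree fits the training data perfectly (R2 of 1.0) but generalizes poorly.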