<a href="https://colab.research.google.com/github/codingbhaiya-data/Coffee-Quality-Report/blob/main/Big_mart_sales_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **🛒📊💻📈💵 Big Mart Sales Prediction Datasets By Rohit Raut**

1.Introduction


In this notebook, we aim to predict the sales of products across different BigMart outlets. Using historical sales data and various product and outlet attributes, we will build a regression model to make predictions. The model's performance will be evaluated using Root Mean Squared Error (RMSE).

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [None]:
train_data.sample(5)

In [None]:
test_data.sample(5)

Find the Shape of our Dataset

In [None]:
train_data.shape

In [None]:
test_data.shape

Getting more Information about the dataset

In [None]:
train_data.describe()

In [None]:
test_data.describe()

# Checking null values in dataset

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
per_null_train = train_data.isnull().sum()/train_data.shape[0]*100
per_null_train

In [None]:
per_null_test = test_data.isnull().sum()/test_data.shape[0]*100
per_null_test

The percentage of missing values in both datasets exceeds the 5% threshold, so we can't simply drop the rows that contain null values.

# Checking Duplicate values

In [None]:
train_data.duplicated().sum()

In [None]:
test_data.duplicated().sum()

# Handling Missing Values

In [None]:
test_data['Item_Weight'].dtype

In [None]:
test_data['Outlet_Size'].dtype

Numerical and categorical data need different approaches for handling missing values.

# Univariate Imputation

In [None]:
mean_weight_train = train_data['Item_Weight'].mean()
median_weight_train = train_data['Item_Weight'].median()
print(mean_weight_train)
print(median_weight_train)

In [None]:
mean_weight_test = test_data['Item_Weight'].mean()
median_weight_test = test_data['Item_Weight'].median()
print(mean_weight_test)
print(median_weight_test)

In [None]:
test_data.sample(5)

In [None]:
train_data['Item_Weight_mean'] = train_data['Item_Weight'].fillna(mean_weight_train)
train_data['Item_Weight_median'] = train_data['Item_Weight'].fillna(median_weight_train)
test_data['Item_Weight_mean'] = test_data['Item_Weight'].fillna(mean_weight_test)
test_data['Item_Weight_median'] = test_data['Item_Weight'].fillna(median_weight_test)

In [None]:
train_data.head(3)

In [None]:
test_data.head()

Checking Variance

In [None]:
# Compare the variance of the original column with the mean- and median-imputed
# versions; a drop in variance indicates the imputation shrank the spread.
print("train:Orignal Item weight variable variance", train_data['Item_Weight'].var())
print("train:Item weight variable variance after mean imputation",train_data['Item_Weight_mean'].var())
print("train:Item weight variable variance after median imputation",train_data['Item_Weight_median'].var())

In [None]:
# Compare the variance of the original column with the mean- and median-imputed
# versions; a drop in variance indicates the imputation shrank the spread.
print("Test:Orignal Item weight variable variance",test_data['Item_Weight'].var())
print("Test:Item weight variable variance after mean imputation",test_data['Item_Weight_mean'].var())
print("Test:Item weight variable variance after median imputation",test_data['Item_Weight_median'].var())

In [None]:
train_data['Item_Weight'].plot(kind='kde', color='black',label='Orignal')
train_data['Item_Weight_mean'].plot(kind='kde', color='red',label='Mean')
train_data['Item_Weight_median'].plot(kind='kde', color='blue',label='Median')
plt.title('Train dataset:Item Weight Distribution')
plt.legend()
plt.show()

In [None]:
test_data['Item_Weight'].plot(kind='kde', color='black',label='Orignal')
test_data['Item_Weight_mean'].plot(kind='kde', color='red',label='Mean')
test_data['Item_Weight_median'].plot(kind='kde', color='blue',label='Median')
plt.title('Test dataset:Item Weight Distribution')
plt.legend()
plt.show()

The data distribution has been altered by mean and median imputation.

In [None]:
train_data[['Item_Weight','Item_Weight_mean','Item_Weight_median']].boxplot()

In [None]:
test_data[['Item_Weight','Item_Weight_mean','Item_Weight_median']].boxplot()

After imputation: a reduction in the spread of the data is observed (not desirable).

In [None]:
train_data["Item_Weight_interp"] = train_data["Item_Weight"].interpolate(method='linear')

In [None]:
# Overlay the density of the original column and its linearly interpolated
# version. The second curve is the interpolated series, so label it as such.
train_data['Item_Weight'].plot(kind='kde', color='red',label='Orignal')
train_data['Item_Weight_interp'].plot(kind='kde', color='blue',label='Interpolate')
plt.title('Train dataset:Item Weight Distribution')
plt.legend()
plt.show()

In [None]:
test_data["Item_Weight_interp"] = test_data["Item_Weight"].interpolate(method='linear')

In [None]:
# Overlay the density of the original column and its linearly interpolated
# version. The second curve is the interpolated series, so label it as such.
test_data['Item_Weight'].plot(kind='kde', color='black',label='Orignal')
test_data['Item_Weight_interp'].plot(kind='kde', color='blue',label='Interpolate')
plt.title('Test dataset:Item Weight Distribution')
plt.legend()
plt.show()

Interpolation is better than mean and median

# Multivariate Imputation

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator

In [None]:
knn = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
si = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
train_data['Item_Weight_knn_imp'] = knn.fit_transform(train_data[['Item_Weight']])

In [None]:
train_data['Item_Weight_si_imp'] = si.fit_transform(train_data[['Item_Weight']])

In [None]:
train_data['Item_Weight'].plot(kind='kde', color='red',label='Orignal')
train_data['Item_Weight_knn_imp'].plot(kind='kde', color='blue',label='KNN')
plt.title('Train dataset:Item Weight Distribution')
plt.legend()
plt.show()

In [None]:
train_data['Item_Weight'].plot(kind='kde', color='red',label='Orignal')
train_data['Item_Weight_si_imp'].plot(kind='kde', color='blue',label='Simple')
plt.title('Train dataset:Item Weight Distribution')
plt.legend()
plt.show()

Now let's drop the imputed columns we are not going to use.

In [None]:
train_data.head(1)

In [None]:
train_data = train_data.drop(columns=['Item_Weight','Item_Weight_si_imp','Item_Weight_knn_imp','Item_Weight_mean','Item_Weight_median'],axis=1)

In [None]:
train_data.head(1)

In [None]:
train_data.isnull().sum()

Dropping the imputed columns we are not going to use from the test dataset

In [None]:
test_data.head(1)

In [None]:
test_data = test_data.drop(columns=['Item_Weight','Item_Weight_mean','Item_Weight_median'],axis=1)

In [None]:
# Handling missing data in Outlet_Size

# Handling Missing data in Outlet_size

In [None]:
a = train_data['Outlet_Size'].value_counts()
b = train_data['Outlet_Type'].value_counts()
print(a)
print(b)

In [None]:
train_outlet_mode = train_data.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc= (lambda x: x.mode()[0]))

In [None]:
train_outlet_mode

In [None]:
missing_value = train_data['Outlet_Size'].isnull()

In [None]:
missing_value

In [None]:
# For every row whose Outlet_Size is missing, look up the modal Outlet_Size
# recorded in train_outlet_mode for that row's Outlet_Type and write it back.
missing_types = train_data.loc[missing_value, 'Outlet_Type']
train_data.loc[missing_value, 'Outlet_Size'] = missing_types.apply(
    lambda outlet_type: train_outlet_mode.loc['Outlet_Size', outlet_type]
)

In [None]:
train_data.isnull().sum()

In [None]:
c = test_data['Outlet_Size'].value_counts()
d = test_data['Outlet_Type'].value_counts()
print(c)
print(d)

In [None]:
test_outlet_mode = test_data.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc= (lambda x: x.mode()[0]))

In [None]:
test_outlet_mode

In [None]:
missing_value = test_data['Outlet_Size'].isnull()

In [None]:
# Fill the missing Outlet_Size values of the test set from a per-Outlet_Type lookup.
# NOTE(review): the lookup table used here is train_outlet_mode, not the
# test_outlet_mode computed a few cells earlier — confirm this is intentional
# (i.e. imputing test rows from training-set statistics).
test_data.loc[missing_value,'Outlet_Size'] = test_data.loc[missing_value,'Outlet_Type'].apply(lambda x: train_outlet_mode[x])

In [None]:
test_data.isnull().sum()

Item Fat Content

In [None]:
train_data.head(2)

In [None]:
train_data['Item_Fat_Content'].value_counts()

In [None]:
train_data = train_data.replace({'Item_Fat_Content': {'Low Fat':'LF','low fat' :'LF', 'Regular' : 'reg'}})

In [None]:
train_data['Item_Fat_Content'].value_counts()

In [None]:
test_data['Item_Fat_Content'].value_counts()

In [None]:
test_data = test_data.replace({'Item_Fat_Content': {'Low Fat':'LF','low fat' :'LF', 'Regular' : 'reg'}})

In [None]:
test_data['Item_Fat_Content'].value_counts()

# Item Visibility

In [None]:
train_data.head(5)

In [None]:
train_data['Item_Visibility'].value_counts()

In [None]:
train_data['Item_Visibility_interpolate'] = train_data['Item_Visibility'].replace(0,np.nan).interpolate(method='linear')

In [None]:
train_data['Item_Visibility_interpolate'].value_counts()

In [None]:
train_data['Item_Visibility'].plot(kind='kde', color='red',label='Orignal')
train_data['Item_Visibility_interpolate'].plot(kind='kde', color='blue',label='Interpolate')
plt.title('Train dataset:Item Visibility Distribution')
plt.legend()
plt.show()

In [None]:
train_data.drop(columns=['Item_Visibility'],axis=1,inplace=True)

In [None]:
train_data.head(2)

In [None]:
# Same for Test dataset
test_data['Item_Visibility'].value_counts()

In [None]:
test_data['Item_Visibility_interpolate'] = test_data['Item_Visibility'].replace(0,np.nan).interpolate(method='linear')

In [None]:
test_data['Item_Visibility'].plot(kind='kde', color='red',label='Orignal')
test_data['Item_Visibility_interpolate'].plot(kind='kde', color='blue',label='Interpolate')
plt.title('Test dataset:Item Visibility Distribution')
plt.legend()
plt.show()

In [None]:
test_data.drop(columns=['Item_Visibility'],axis=1,inplace=True)

In [None]:
test_data.head(2)

# Item_Type

In [None]:
train_data['Item_Type'].value_counts()

# Item_Identifier

In [None]:
train_data['Item_Identifier'].value_counts().sample(7)

The first two letters of each identifier encode the product category:
FD = Food, DR = Drinks, NC = Non-Consumables

In [None]:
train_data['Item_Identifier'] = train_data['Item_Identifier'].apply(lambda x: x[0:2])

In [None]:
train_data['Item_Identifier'].value_counts()

for testing dataset

In [None]:
test_data['Item_Identifier'].value_counts().sample(7)

In [None]:
# Keep only the two-letter prefix of each identifier. The source must be the
# test frame itself: reading from train_data would overwrite the test
# identifiers with index-aligned values taken from unrelated training rows.
test_data['Item_Identifier'] = test_data['Item_Identifier'].apply(lambda x: x[0:2])

In [None]:
test_data['Item_Identifier'].value_counts()

# Outlet_Establishment_Year

In [None]:
train_data.columns

In [None]:
train_data['Outlet_Establishment_Year']

Converting Years to age

In [None]:
import datetime as dt
# Reference year for the outlet-age computation. It is read from the system
# clock, so the derived ages depend on the year in which the notebook is run.
current_year = dt.datetime.today().year

In [None]:
current_year

In [None]:
train_data['Outlet_age'] = current_year - train_data['Outlet_Establishment_Year']

In [None]:
train_data.drop(columns=['Outlet_Establishment_Year'],axis=1,inplace=True)

In [None]:
train_data.head(3)

Same process for Test dataset

In [None]:
test_data['Outlet_age'] = current_year - test_data['Outlet_Establishment_Year']

In [None]:
test_data.drop(columns=['Outlet_Establishment_Year'],axis=1,inplace=True)

In [None]:
test_data.head(3)

# Handling Categorical Columns
using ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
encoded_train_data = train_data.copy()
encoded_test_data = test_data.copy()

In [None]:
# Columns holding string categories; these are the ones that need encoding.
cat_cols_train = train_data.select_dtypes(include='object').columns
cat_cols_test = test_data.select_dtypes(include='object').columns

for col in cat_cols_train:
    # Fit ONE encoder per column on the training data and reuse it for the test
    # data, so a given category always maps to the same integer code in both
    # frames. Fitting a second encoder on the test set could assign different
    # codes whenever the two sets of categories differ.
    # Categories that occur only in the test set are encoded as -1 instead of
    # raising an error.
    oe_train = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoded_train_data[col] = oe_train.fit_transform(train_data[[col]]).ravel()
    encoded_test_data[col] = oe_train.transform(test_data[[col]]).ravel()
    print(col)
    print(oe_train.categories_)
    # Report test-set categories the encoder has not seen (these received -1).
    print(set(test_data[col].unique()) - set(oe_train.categories_[0]))

In [None]:
encoded_test_data.head(5)

In [None]:
encoded_train_data.head(5)

Splitting Features and Target from Training Data

In [None]:
X = encoded_train_data.drop(columns=['Item_Outlet_Sales'],axis=1)
y = encoded_train_data['Item_Outlet_Sales']

In [None]:
X

We are dealing with a continuous target variable, so we are going to use regression models. First, we split the data into training and validation sets.

In [None]:
# Imported here because this is the first cell that calls train_test_split;
# without it, running the notebook top to bottom raises a NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for validation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_train = lr.predict(X_train)
lr_test = lr.predict(X_test)

In [None]:
# Shared helper used to report regression metrics for every model below
from sklearn.metrics import mean_squared_error, r2_score


def model_eval(actual, predicted):
    """Print the RMSE and the R2 score of ``predicted`` against ``actual``.

    Both arguments are passed unchanged to ``mean_squared_error`` and
    ``r2_score``. Each metric is rounded to three decimals before printing.
    Nothing is returned.
    """
    root_mse = np.sqrt(mean_squared_error(actual, predicted))
    r_squared = r2_score(actual, predicted)
    report = (
        ('The RMSE value for the model is: ', root_mse),
        ('The R2 Score for the model is: ', r_squared),
    )
    for label, value in report:
        print(label, round(value, 3))

In [None]:
model_eval(y_train, lr_train)

In [None]:
model_eval(y_test, lr_test)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf1= rf.fit(X_train, y_train)
cross_val_scores = cross_val_score(rf1, X, y, cv=5, scoring='r2')
print("Cross-validation R-squared scores:", cross_val_scores)
print("Mean R-squared score:", cross_val_scores.mean())

# Ada Boost Regression

In [None]:
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)
preds_ada_train = ada.predict(X_train)
preds_ada_test = ada.predict(X_test)

In [None]:
model_eval(y_train, preds_ada_train)

In [None]:
model_eval(y_test, preds_ada_test)

XGBRF Regressor

In [None]:
from xgboost import XGBRFRegressor

xg =XGBRFRegressor(n_estimators=100, random_state=42)

cross_val_scores = cross_val_score(xg, X, y, cv=5, scoring='r2')
print("Cross-validation R-squared scores:", cross_val_scores)
print("Mean R-squared score:", cross_val_scores.mean())

XGBRF Regressor Feature Importance

In [None]:
xg =XGBRFRegressor(n_estimators=100, random_state=42)

xg1 = xg.fit(X,y)

feature_importances = pd.Series(xg1.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar')
plt.show()

In [None]:
encoded_train_data.head(2)

In [None]:
# lets drop other columns
# ['Item_Fat_Content', 'Item_Identifier','Item_Type', 'Item_Weight_interp' , 'Item_Visibility_interpolate' , 'Outlet_Location_Type'  ]

In [None]:
# Cross-validate a fresh XGBRF model on the reduced feature set.
xg =XGBRFRegressor(n_estimators=100, random_state=42)

# Feature columns excluded from this experiment, plus the target column.
dropped_cols = ['Item_Fat_Content', 'Item_Identifier', 'Item_Type',
                'Item_Weight_interp', 'Item_Visibility_interpolate',
                'Outlet_Location_Type']
X_reduced = encoded_train_data.drop(columns=dropped_cols + ['Item_Outlet_Sales'])

# Pass the estimator created above rather than the previously fitted `xg1`.
cross_val_scores = cross_val_score(xg, X_reduced, y, cv=5, scoring='r2')
print("Cross-validation R-squared scores:", cross_val_scores)
print("Mean R-squared score:", cross_val_scores.mean())

# Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
preds_gb_train = gb.predict(X_train)
preds_gb_test = gb.predict(X_test)

In [None]:
model_eval(y_train, preds_gb_train)

In [None]:
model_eval(y_test,preds_gb_test )

# XG Boost Regressor

In [None]:
# Import the class directly: binding the xgboost module to the name `xg` would
# overwrite the XGBRFRegressor instance that earlier cells store under `xg`.
from xgboost import XGBRegressor
xgb = XGBRegressor()

In [None]:
xgb.fit(X_train, y_train)
preds_xgb_train = xgb.predict(X_train)
preds_xgb_test = xgb.predict(X_test)

In [None]:
model_eval(y_train, preds_xgb_train)

In [None]:
model_eval(y_test, preds_xgb_test)

TEST DATA APPLICATION

In [None]:
encoded_test_data.head(2)

In [None]:
# Select the training feature columns explicitly, in training order. This keeps
# the prediction independent of the test frame's column order and lets the cell
# be re-run after 'Item_Outlet_Sales' has already been added to the frame.
encoded_test_data['Item_Outlet_Sales'] = rf.predict(encoded_test_data[X.columns])

In [None]:
encoded_test_data.head()

In [None]:
sample_submission = pd.read_csv('/content/sample_submission.csv')

In [None]:
sample_submission.head()

In [None]:
sample_submission['Item_Outlet_Sales'] = encoded_test_data['Item_Outlet_Sales']

In [None]:
sample_submission.to_csv('Rohit_final_output.csv', index = False)

In [None]:
# Read back the submission file using the same relative path it was written to,
# so the check also works when the working directory is not /content.
subm = pd.read_csv('Rohit_final_output.csv')

In [None]:
subm

In [None]:
subm['Item_Outlet_Sales'].describe()