# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Reading Data

In [2]:
sample_sub = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\sample_submission_8RXa3c6.csv")
train = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\train_v9rqX0R.csv")
test = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\test_AbJTz2l.csv")

In [3]:
print('Shape of train data: {}'.format(train.shape))
print('Shape of test data: {}'.format(test.shape))

Shape of train data: (8523, 12)
Shape of test data: (5681, 11)


In [4]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Feature Dictionary
Item_Identifier : Unique product ID (Informative)

Item_Weight : Weight of product (Continuous)

Item_Fat_Content : Whether the product is low fat or not (Categorical)

Item_Visibility : The % of total display area of all products in a store allocated to the particular product (Continuous)

Item_Type : The category to which the product belongs (Categorical)

Item_MRP : Maximum Retail Price (list price) of the product (Continuous)

Outlet_Identifier : Unique store ID (Informative)

Outlet_Establishment_Year : The year in which store was established (Informative - Descrete)

Outlet_Size : The size of the store in terms of ground area covered (Categorical)

Outlet_Location_Type : The type of city in which the store is located (Categorical)

Outlet_Type : Whether the outlet is just a grocery store or some sort of supermarket (Categorical)

Item_Outlet_Sales : Sales of the product in the particular store. This is the outcome variable to be predicted. (Target)

In [5]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [6]:
sample_sub.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1000
1,FDW14,OUT017,1000
2,NCN55,OUT010,1000
3,FDQ58,OUT017,1000
4,FDY38,OUT027,1000


In [7]:
train['is_train'] = 1
test['is_train'] = 0
test['Item_Outlet_Sales'] = None

In [8]:
data = pd.concat((train,test))

In [9]:
data.shape

(14204, 13)

# ML Pipeline

### Missing Values

In [10]:
data.isna().sum()/data.shape[0]*100

Item_Identifier               0.000000
Item_Weight                  17.171219
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.273726
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales            39.995776
is_train                      0.000000
dtype: float64

In [11]:
weight_dict = data.groupby('Item_Identifier')['Item_Weight'].median().to_dict()

In [12]:
weight_dict

{'DRA12': 11.6,
 'DRA24': 19.35,
 'DRA59': 8.27,
 'DRB01': 7.39,
 'DRB13': 6.115,
 'DRB24': 8.785,
 'DRB25': 12.3,
 'DRB48': 16.75,
 'DRC01': 5.92,
 'DRC12': 17.85,
 'DRC13': 8.26,
 'DRC24': 17.85,
 'DRC25': 5.73,
 'DRC27': 13.8,
 'DRC36': 13.0,
 'DRC49': 8.67,
 'DRD01': 12.1,
 'DRD12': 6.96,
 'DRD13': 15.0,
 'DRD15': 10.6,
 'DRD24': 13.85,
 'DRD25': 6.135,
 'DRD27': 18.75,
 'DRD37': 9.8,
 'DRD49': 9.895,
 'DRD60': 15.7,
 'DRE01': 10.1,
 'DRE03': 19.6,
 'DRE12': 4.59,
 'DRE13': 6.28,
 'DRE15': 13.35,
 'DRE25': 15.35,
 'DRE27': 11.85,
 'DRE37': 13.5,
 'DRE48': 8.43,
 'DRE49': 20.75,
 'DRE60': 9.395,
 'DRF01': 5.655,
 'DRF03': 19.1,
 'DRF13': 12.1,
 'DRF15': 18.35,
 'DRF23': 4.61,
 'DRF25': 9.0,
 'DRF27': 8.93,
 'DRF36': 16.1,
 'DRF37': 17.25,
 'DRF48': 5.73,
 'DRF49': 7.27,
 'DRF51': 15.75,
 'DRF60': 10.8,
 'DRG01': 14.8,
 'DRG03': 14.5,
 'DRG11': 6.385,
 'DRG13': 17.25,
 'DRG15': 6.13,
 'DRG23': 8.88,
 'DRG25': 10.5,
 'DRG27': 8.895,
 'DRG36': 14.15,
 'DRG37': 16.2,
 'DRG39': 14.15,
 '

In [13]:
data.loc[ (data['Item_Weight'].isna()), 'Item_Weight'] = data[data['Item_Weight'].isna()].apply(lambda row: weight_dict[row['Item_Identifier']], axis=1)

In [14]:
#data[data['Item_Weight'].isna()]['Item_Identifier'].apply(lambda row: weight_dict[row['Item_Identifier']], axis=1)

In [15]:
#data[data['Item_Weight'].isna()]['Item_Identifier'].apply(lambda x: weight_dict[x])

In [16]:
data['Outlet_Size'].fillna(value = 'Medium', inplace=True) # Imputing Mode Value

### One Hot Encoding

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14204 entries, 0 to 5680
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                14204 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   object 
 12  is_train                   14204 non-null  int64  
dtypes: float64(3), int64(2), object(8)
memory usage

In [18]:
col_ls = ['Item_Identifier', 'Item_Fat_Content',
       'Item_Type', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type']

In [19]:
data = pd.get_dummies(data = data,columns = col_ls)

In [20]:
data.shape

(14204, 1614)

In [21]:
data.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,is_train,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,3735.14,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,443.423,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,2097.27,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,732.38,1,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,994.705,1,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


### Min Max Scaling

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [23]:
data[['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaler.fit_transform(data[['Item_Weight', 'Item_Visibility', 'Item_MRP']])

### Train Test Split

##### Seperating labeled and unlabeled data.

In [24]:
train_df = data[ data['is_train'] == 1]
test_df = data[ data['is_train'] == 0]

In [25]:
train_df.shape, test_df.shape

((8523, 1614), (5681, 1614))

In [26]:
train_df.drop(['is_train'], inplace=True, axis=1)
test_df.drop(['is_train'], inplace=True, axis=1)
test_df.drop(['Item_Outlet_Sales'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
train_df.columns[-1]

'Outlet_Type_Supermarket Type3'

In [28]:
train_df = train_df[ [col for col in train_df if col!='Item_Outlet_Sales'] + ['Item_Outlet_Sales']]

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.iloc[:,:-1],train_df['Item_Outlet_Sales'], test_size = 0.30,random_state = 1999)

In [30]:
X_train.shape,X_test.shape,train_df.shape,

((5966, 1612), (2557, 1612), (8523, 1613))

---
# Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)

In [32]:
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [34]:
y_pred = model.predict(X_test)

In [35]:
mse = mean_squared_error(y_test,y_pred)

In [36]:
import math
rmse = math.sqrt(mse)
rmse

26198859271468.617

---
# Decision Tree

In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [64]:
model = DecisionTreeRegressor(max_depth=8)
model.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=8,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [65]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)

In [66]:
import math
rmse = math.sqrt(mse)
rmse

1114.2025178296135

---
# Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [67]:
model = RandomForestRegressor()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [68]:
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1145.775971961617

--- 
# LASSO

In [56]:
from sklearn.linear_model import LassoCV
model = LassoCV()
model.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [70]:
y_pred = model.predict(X_test)

In [71]:
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1145.775971961617

In [72]:
min(y_pred)

46.54607799999997

---
# XGBoost

In [73]:
from xgboost.sklearn import XGBRegressor 

model = XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [74]:
mse = mean_squared_error(y_test,y_pred)

In [75]:
rmse = math.sqrt(mse)
rmse

1101.0518130571259

In [76]:
min(y_pred)

-765.7961

# Final Prediction

In [79]:
train1 = data[ data['is_train'] == 1]
test1 = data[ data['is_train'] == 0]

In [80]:
train1.drop(['is_train'], inplace=True, axis=1)
test1.drop(['is_train'], inplace=True, axis=1)
test1.drop(['Item_Outlet_Sales'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [81]:
train1.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,Item_Identifier_DRB24,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.282525,0.048866,0.927507,3735.14,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,0.081274,0.058705,0.072068,443.423,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,0.770765,0.051037,0.468288,2097.27,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,0.871986,0.0,0.640093,732.38,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,0.260494,0.0,0.095805,994.705,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [82]:
test1.head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,Item_Identifier_DRB24,Item_Identifier_DRB25,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.964275,0.023036,0.325012,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,0.222983,0.117018,0.237819,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,0.598095,0.303221,0.893316,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
3,0.164335,0.04686,0.525233,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,0.538553,0.361153,0.861381,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [83]:
train1 = train1[ [col for col in train1 if col!='Item_Outlet_Sales'] + ['Item_Outlet_Sales']]

In [84]:
X_train = train1.iloc[:,:-1]
y_train = train1.iloc[:,-1]
X_test = test1

In [85]:
y_train.shape

(8523,)

In [86]:
model = XGBRegressor()
model.fit(X_train, y_train) 
y_pred = model.predict(X_test)

In [87]:
y_pred

array([1688.916 , 1356.9481,  834.9644, ..., 1774.6781, 3387.772 ,
       1543.2808], dtype=float32)

# Replacing negative values
model = model = DecisionTreeRegressor(max_depth=8)

model.fit(X_train, y_train)
y_pred = model.predict(X_test.iloc[index])

In [91]:
submit = sample_sub.copy()

In [89]:
submit['Item_Outlet_Sales'] = y_pred

In [94]:
submit.iloc[5572]

Item_Identifier        NCH43
Outlet_Identifier     OUT010
Item_Outlet_Sales    7.51194
Name: 5572, dtype: object

In [985]:
submit.to_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\submit.csv", index=False)

In [986]:
min(submit['Item_Outlet_Sales'])

7.5119428634643555