***Importing packages***

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeRegressor

***Analyzing data***

In [2]:
preprocessed_data = pd.read_csv('preprocessed_data.csv')
bigmart_df = preprocessed_data.copy()

In [3]:
bigmart_df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Type_Baking Goods,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.3,0,0.016047,249.8092,1999,2,1,1,3735.138,0,...,0,0,0,0,0,0,0,0,0,0
1,5.92,1,0.019278,48.2692,2009,2,3,2,443.4228,0,...,0,0,0,0,0,0,0,0,1,0
2,17.5,0,0.01676,141.618,1999,2,1,1,2097.27,0,...,0,0,0,0,1,0,0,0,0,0
3,8.93,0,0.0,53.8614,1987,3,3,1,994.7052,0,...,0,0,0,1,0,0,0,0,0,0
4,10.395,1,0.0,51.4008,2009,2,3,2,556.6088,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
bigmart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4650 entries, 0 to 4649
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Item_Weight                      4650 non-null   float64
 1   Item_Fat_Content                 4650 non-null   int64  
 2   Item_Visibility                  4650 non-null   float64
 3   Item_MRP                         4650 non-null   float64
 4   Outlet_Establishment_Year        4650 non-null   int64  
 5   Outlet_Size                      4650 non-null   int64  
 6   Outlet_Location_Type             4650 non-null   int64  
 7   Outlet_Type                      4650 non-null   int64  
 8   Item_Outlet_Sales                4650 non-null   float64
 9   Item_Type_Baking Goods           4650 non-null   int64  
 10  Item_Type_Breads                 4650 non-null   int64  
 11  Item_Type_Breakfast              4650 non-null   int64  
 12  Item_Type_Canned    

In [6]:
bigmart_df.shape

(4650, 25)

***Splitting data into training and test sets***

In [7]:
X = bigmart_df.drop(['Item_Outlet_Sales'], axis=1)
y = bigmart_df['Item_Outlet_Sales']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

***Fitting the data into a decision tree regressor and predicting it***

In [9]:
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(X_train, y_train)
dt_pred = dt_regressor.predict(X_test)

***Evaluating the test set results***

In [10]:
from sklearn.metrics import mean_squared_error
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))
dt_rmse

1557.304723790186

***Scaling the data***

In [11]:
X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Baking Goods,Item_Type_Breads,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.3,0,0.016047,249.8092,1999,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.92,1,0.019278,48.2692,2009,2,3,2,0,0,...,0,0,0,0,0,0,0,0,1,0
2,17.5,0,0.01676,141.618,1999,2,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,8.93,0,0.0,53.8614,1987,3,3,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,10.395,1,0.0,51.4008,2009,2,3,2,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.fit_transform(X_test)

***Parameter tuning***

In [14]:
from sklearn.model_selection import GridSearchCV
dt_reg_model = DecisionTreeRegressor()
param_grid = dict(max_depth=[2,3,4,5,6,7,8,9,10], min_samples_leaf=[1,2,3,4,5,6,7,8,9,10])
grid = GridSearchCV(dt_reg_model, param_grid, cv=5)
grid_fit = grid.fit(X_train_scaled, y_train)

In [15]:
grid_fit.best_params_

{'max_depth': 4, 'min_samples_leaf': 7}

In [16]:
dt_reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=7)
dt_reg.fit(X_train_scaled, y_train)
pred_dt_scaled = dt_reg.predict(X_test_scaled)

In [18]:
rmse_scaled = np.sqrt(mean_squared_error(y_test, pred_dt_scaled))
rmse_scaled

1109.6646356584802