In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from xgboost.sklearn import XGBRegressor 
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
import math
sns.set()

In [73]:
data = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\concat_data.csv")

In [74]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,is_train
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,1
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,1
3,FDX07,19.2,Regular,0.055073,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38,1
4,NCD19,8.93,Low Fat,0.044638,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,1


In [75]:
data.groupby('Outlet_Location_Type')['Item_Visibility'].median()

Outlet_Location_Type
Tier 1    0.056329
Tier 2    0.055073
Tier 3    0.056329
Name: Item_Visibility, dtype: float64

---
# Extracting Features

In [76]:
data['Item_Class'] = data['Item_Identifier'].apply(lambda x: x[:2])

In [77]:
data['Item_Class'].replace({'FD':'Food', 'DR':'Drink', 'NC':'Non Consumable'},inplace=True)

# Creating Features

In [78]:
data['Working_Years'] = 2013 - data['Outlet_Establishment_Year'] # Working years

### `Item_Weight` with `Item_Visibility`

In [79]:
data['Item_Weight_+_Visi'] = data['Item_Weight'] + data['Item_Visibility']
data['Item_Weight_*_Visi'] = data['Item_Weight'] * data['Item_Visibility']
data['Item_Weight_/_Visi'] = data['Item_Weight'] / data['Item_Visibility']

### `Item_MRP` with `Item_Visibility`

In [80]:
data['Item_MRP_+_Visi']=data['Item_MRP'] + data['Item_Visibility']
data['Item_MRP_X_Visi']=data['Item_MRP'] * data['Item_Visibility']
data['Item_MRP_/_Visi']=data['Item_MRP'] / data['Item_Visibility']

### `Item_MRP` with `Item_Weight`

In [81]:
data['Item_MRP_/_Weight'] = data['Item_MRP']/data['Item_Weight']
data['Item_MRP_*_Weight'] = data['Item_MRP']*data['Item_Weight']

### `Item_MRP` * `Item_Weight` * `Item_Visibility`

In [82]:
data['VMW'] = data['Item_MRP']*data['Item_Weight']*data['Item_Visibility']

### `Outlet_Identifier`  (median_visibility) with `Item_Visibility`

In [83]:
out_visi = data.groupby('Outlet_Identifier')['Item_Visibility'].median().to_dict()
data['Outlet_Identifier_/_Visi'] = data.apply(lambda row: out_visi[row['Outlet_Identifier']]/row['Item_Visibility'], axis=1)

### `Outlet_Size`  (median_visibility)  with `Item_Visibility`

In [84]:
out_visi = data.groupby('Outlet_Size')['Item_Visibility'].median().to_dict()
data['Outlet_Size_/_Visi'] = data.apply(lambda row: out_visi[row['Outlet_Size']]/row['Item_Visibility'], axis=1)

### `Outlet_Type`  (median_visibility)  with `Item_Visibility`

In [85]:
out_visi = data.groupby('Outlet_Type')['Item_Visibility'].median().to_dict()
data['Outlet_Type_/_Visi'] = data.apply(lambda row: out_visi[row['Outlet_Type']]/row['Item_Visibility'], axis=1)

### `Outlet_Location_Type`  (median_visibility)  with `Item_Visibility`

In [86]:
out_visi = data.groupby('Outlet_Location_Type')['Item_Visibility'].median().to_dict()
data['Outlet_Location_Type_/_Visi'] = data.apply(lambda row: out_visi[row['Outlet_Location_Type']]/row['Item_Visibility'], axis=1)

### `Outlet_Unique_Items`

In [87]:
out_uni_item = data.groupby('Outlet_Identifier')['Item_Identifier'].count().to_dict()
data['Outlet_Unique_Items'] = data['Outlet_Identifier'].apply(lambda x: out_uni_item[x])

### `Total_Outlet_Sale`

In [88]:
out_sale = data.groupby('Outlet_Identifier')['Item_Outlet_Sales'].sum().to_dict()
data['Total_Outlet_Sale'] = data['Outlet_Identifier'].apply(lambda x: out_sale[x])
data['Total_Outlet_Sale'] = data['Total_Outlet_Sale'].apply(lambda x: int(x))

### `Average_Item_MRP`

In [89]:
avg_item_dict = data.groupby(['Item_Identifier'])['Item_MRP'].mean().to_dict()
data["Average_Item_MRP"] = data['Item_Identifier'].apply(lambda x:avg_item_dict[x])

In [90]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,is_train,Item_Class,Working_Years,Item_Weight_+_Visi,Item_Weight_*_Visi,Item_Weight_/_Visi,Item_MRP_+_Visi,Item_MRP_X_Visi,Item_MRP_/_Visi,Item_MRP_/_Weight,Item_MRP_*_Weight,VMW,Outlet_Identifier_/_Visi,Outlet_Size_/_Visi,Outlet_Type_/_Visi,Outlet_Location_Type_/_Visi,Outlet_Unique_Items,Total_Outlet_Sale,Average_Item_MRP
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,1,Food,14,9.316047,0.14924,579.536708,249.825247,4.008763,15567.053924,26.861204,2323.22556,37.2815,3.408523,3.43188,3.426001,3.510168,1550,2183969,249.542533
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,1,Drink,4,5.939278,0.114127,307.082357,48.288478,0.930544,2503.820893,8.153581,285.753664,5.508821,2.85677,2.856717,2.85677,2.921884,1546,1851822,48.991422
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,1,Food,14,17.51676,0.293301,1044.148072,141.63476,2.373528,8449.723525,8.092457,2478.315,41.536745,3.263565,3.285929,3.2803,3.360887,1550,2183969,140.088
3,FDX07,19.2,Regular,0.055073,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38,1,Food,15,19.255073,1.05741,348.625462,182.150073,10.028596,3306.403832,9.484115,3496.224,192.549048,1.104644,1.02815,1.121335,1.022793,925,188340,183.350556
4,NCD19,8.93,Low Fat,0.044638,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,1,Non Consumable,26,8.974638,0.398621,200.051863,53.906038,2.404288,1206.615166,6.031512,480.982302,21.470292,1.229542,1.229542,1.231631,1.261889,1553,2142663,54.8614


In [92]:
Feature_Data = data.copy()

In [93]:
Feature_Data.to_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\Feature_Data.csv", index=False)

# Label Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [21]:
label_col = ['Item_Fat_Content','Outlet_Size']

In [22]:
for col in label_col:
    data[col] = le.fit_transform(data[col])

# OHE

In [23]:
data.drop(['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year'],axis = 1,inplace=True)

In [24]:
cat_col = data.select_dtypes(include='object').columns

In [25]:
cat_col

Index(['Item_Type', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Class'], dtype='object')

In [26]:
data = pd.get_dummies(data = data,columns = cat_col, drop_first=True)

In [27]:
data.shape

(14204, 46)

# Removing Highly Corelated Features

In [28]:
correlated_features = set()
correlation_matrix = data.corr()

In [94]:
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [30]:
len(correlated_features)

10

In [31]:
drop_labels = list(correlated_features)

In [32]:
data.drop(labels=drop_labels,inplace=True,axis=1)

In [33]:
data.shape

(14204, 36)

# Min Max Scaling

In [40]:
scaler = MinMaxScaler()

In [41]:
col_ls = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Working_Years',
       'Item_Weight_/_Visi', 'Item_MRP_X_Visi', 'Item_MRP_/_Visi',
       'Item_MRP_/_Weight', 'Item_MRP_*_Weight', 'Outlet_Unique_Items']

In [42]:
data[col_ls] = scaler.fit_transform(data[col_ls])

# Train_test_Split

In [45]:
train_df = data[ data['is_train'] == 1]
test_df = data[ data['is_train'] == 0]

In [46]:
train_df.drop(['is_train'], inplace=True, axis=1)
test_df.drop(['is_train'], inplace=True, axis=1)
test_df.drop(['Item_Outlet_Sales'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [47]:
train_df = train_df[ [col for col in train_df if col!='Item_Outlet_Sales'] + ['Item_Outlet_Sales']]

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.iloc[:,:-1],train_df['Item_Outlet_Sales'], test_size = 0.30,random_state = 1999)

In [49]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)

In [50]:
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1128.3761387513287

In [51]:
model = DecisionTreeRegressor(max_depth=5)
model.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [52]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1083.6702754529392

In [53]:
model = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [54]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1083.6702754529392

In [55]:
model = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=62,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=41, min_samples_split=93,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [56]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1083.5565062672997

In [57]:
model = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [58]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = math.sqrt(mse)
rmse

1082.504571486568

In [59]:
sample_sub = pd.read_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\sample_submission_8RXa3c6.csv")

In [60]:
train1 = data[ data['is_train'] == 1]
test1 = data[ data['is_train'] == 0]

In [61]:
train1.drop(['is_train'], inplace=True, axis=1)
test1.drop(['is_train'], inplace=True, axis=1)
test1.drop(['Item_Outlet_Sales'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [62]:
train1 = train1[ [col for col in train1 if col!='Item_Outlet_Sales'] + ['Item_Outlet_Sales']]

In [63]:
X_train = train1.iloc[:,:-1]
y_train = train1.iloc[:,-1]
X_test = test1

In [64]:
X_test.reset_index(drop=True)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Size,Working_Years,Item_Weight_/_Visi,Item_MRP_X_Visi,Item_MRP_/_Visi,Item_MRP_/_Weight,Item_MRP_*_Weight,Outlet_Unique_Items,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Class_Food,Item_Class_Non Consumable
0,0.964275,0,0.022012,0.325012,1,0.416667,0.717724,0.011498,0.277466,0.071939,0.382371,0.986745,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
1,0.222983,1,0.192272,0.237819,2,0.083333,0.049324,0.065578,0.041250,0.180312,0.102260,0.976436,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0
2,0.598095,0,0.529601,0.893316,2,0.458333,0.030976,0.506759,0.044316,0.303260,0.621406,0.066274,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
3,0.164335,0,0.065172,0.525233,2,0.083333,0.117929,0.044927,0.195017,0.397649,0.178020,0.976436,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0
4,0.538553,1,0.634552,0.861381,1,1.000000,0.022525,0.585699,0.035390,0.316787,0.557721,1.000000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5676,0.353974,1,0.054735,0.467004,2,0.500000,0.197975,0.034738,0.202818,0.240140,0.242753,0.986745,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
5677,0.181304,1,0.769113,0.585126,1,0.000000,0.006252,0.509177,0.019781,0.419272,0.206048,0.980854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0
5678,0.324204,0,0.385912,0.371199,2,0.291667,0.028166,0.180054,0.028295,0.207882,0.187898,0.983800,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1
5679,0.639774,1,0.258107,0.778154,2,0.083333,0.072551,0.224297,0.080455,0.251725,0.575894,0.976436,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0


In [65]:
model = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [66]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [67]:
submit = sample_sub.copy()
submit['Item_Outlet_Sales'] = y_pred

In [69]:
submit.to_csv(r"D:\Data Science Course\Projects\Analytics vidya\Big Mart Sales Prediction\submit.csv", index=False)