In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [2]:
data_train = pd.read_csv('train_v9rqX0R.csv')
data_test = pd.read_csv('test_AbJTz2l.csv')

# feature engineering

In [3]:
data_train.head() 

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
data = data_train.copy()

In [5]:
test = data_test.copy()

In [6]:
# numerical columns
data['years'] = 2020 - data['Outlet_Establishment_Year']
test['years'] = 2020 - test['Outlet_Establishment_Year']

In [7]:
data = data.drop('Outlet_Establishment_Year' , axis = 1 )
test = test.drop('Outlet_Establishment_Year' , axis = 1)

In [8]:
# filtering the continuous variables
# here I have considered at max 25 categories ,...this depends on the size of the data
conti_feature = [feature for feature in data.columns if data[feature].nunique() > 25 and data[feature].dtypes != 'O']
conti_test = [feature for feature in test.columns if test[feature].nunique() > 25 and test[feature].dtypes != 'O']

In [9]:
# seperating the numerical features from dataframe
numerical_columns = [feature for feature in data.columns if data[feature].dtypes != 'O']
numerical__test = [feature for feature in test.columns if test[feature].dtypes != 'O']

In [10]:
#  fixing the missing values in continuous features
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].median())
test['Item_Weight'] = test['Item_Weight'].fillna(data['Item_Weight'].median())

In [11]:
data['Item_Weight'].isnull().sum() , test['Item_Weight'].isnull().sum()

(0, 0)

In [12]:
#lognormal transformation of the data

In [13]:
# for training set
for feature in numerical_columns:
    if 0 in data[feature].unique():
        pass
    else:
        data[feature] = np.log(data[feature])

In [14]:
# for test set
for feature in numerical__test:
    if 0 in test[feature].unique():
        pass
    else:
        test[feature] = np.log(test[feature])

In [15]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,years
0,FDW58,3.032546,Low Fat,0.007565,Snack Foods,4.680854,OUT049,Medium,Tier 1,Supermarket Type1,3.044522
1,FDW14,2.116256,reg,0.038428,Dairy,4.469577,OUT017,,Tier 2,Supermarket Type1,2.564949
2,NCN55,2.681022,Low Fat,0.099575,Others,5.48792,OUT010,,Tier 3,Grocery Store,3.091042
3,FDQ58,1.989927,Low Fat,0.015388,Snack Foods,5.043644,OUT017,,Tier 2,Supermarket Type1,2.564949
4,FDY38,2.533697,Regular,0.118599,Dairy,5.456304,OUT027,Medium,Tier 3,Supermarket Type3,3.555348


In [16]:
# missing values in categorical_features

In [17]:
categorical_columns = [feature for feature in data.columns if data[feature].dtypes == 'O']
categorical_test = [feature for feature in test.columns if test[feature].dtypes == 'O']

In [18]:
for i in ['Item_Identifier' , 'Outlet_Identifier']:
    categorical_columns.remove(i)
    categorical_test.remove(i)

In [19]:
print(categorical_columns) 
print(categorical_test)

['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [20]:
# fixing the missing entities in the categorical type of features in both the dataset
for feature in categorical_columns:
    data[feature] = np.where(data[feature].isnull() , 'Missing' , data[feature])

for feature in categorical_test:
    test[feature] = np.where(test[feature].isnull() , 'Missing' , test[feature])

In [21]:
# labelling the rare categories in categorical features in training_set
for feature in categorical_columns:
    temp = data.groupby(feature)[feature].count()/len(data)
    temp_df = temp[temp > 0.02].index
    data[feature] = np.where(data[feature].isin(temp_df) , data[feature] , 'Rare')

In [22]:
# labelling the rare categories in categorical features in test set
for feature in categorical_test:
    temp = test.groupby(feature)[feature].count()/len(test)
    temp_df = temp[temp > 0.02].index
    test[feature] = np.where(test[feature].isin(temp_df) , test[feature] , 'Rare')

In [23]:
# putting the right encoding to the categorical features
for feature in categorical_columns:
    temp = data.groupby(feature)['Item_Outlet_Sales'].count().sort_values().index
    temp = {k:i for i , k in enumerate(temp,0)}
    data[feature] = data[feature].map(temp)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,years
0,FDA15,2.230014,3,0.016047,8,5.520697,OUT049,3,0,3,8.22554,3.044522
1,DRC01,1.778336,2,0.019278,3,3.876794,OUT018,3,2,0,6.094524,2.397895
2,FDN15,2.862201,3,0.01676,2,4.953133,OUT049,3,0,3,7.648392,3.044522
3,FDX07,2.95491,2,0.0,12,5.204529,OUT010,2,2,2,6.5963,3.091042
4,NCD19,2.189416,3,0.0,10,3.986414,OUT013,0,2,3,6.902446,3.496508


In [24]:
# putting the right encoding to the categorical features
for feature in categorical_test:
    temp = test.groupby(feature)['Item_Weight'].count().sort_values().index
    temp = {k:i for i , k in enumerate(temp,0)}
    test[feature] = test[feature].map(temp)
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,years
0,FDW58,3.032546,3,0.007565,13,4.680854,OUT049,3,0,3,3.044522
1,FDW14,2.116256,0,0.038428,9,4.469577,OUT017,2,1,3,2.564949
2,NCN55,2.681022,3,0.099575,3,5.48792,OUT010,2,2,2,3.091042
3,FDQ58,1.989927,3,0.015388,13,5.043644,OUT017,2,1,3,2.564949
4,FDY38,2.533697,2,0.118599,9,5.456304,OUT027,3,2,1,3.555348


## Feature Scaling

In [25]:
scaling_feature = [feature for feature in data.columns if feature not in ['Item_Identifier' , 'Outlet_Identifier' , 'Item_Outlet_Sales']]
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
scale.fit(data[scaling_feature])
scaled_train = scale.transform(data[scaling_feature])

  return self.partial_fit(X, y)


In [26]:
y_feature = ['Item_Outlet_Sales']
scale_y = MinMaxScaler()
scaled_y = scale_y.fit_transform(data[y_feature])

In [27]:
ftr = [col for col in test.columns if col not in ['Item_Identifier' , 'Outlet_Identifier']]
scaled_test = scale.transform(test[ftr])

In [28]:
training_data = pd.concat([data[['Item_Identifier' , 'Outlet_Identifier']].reset_index(drop=True),
                          pd.DataFrame((scaled_train) , columns = scaling_feature) , pd.DataFrame((scaled_y) , columns = y_feature) ], axis = 1)

In [29]:
training_data.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,years,Item_Outlet_Sales
0,FDA15,OUT049,0.462051,1.0,0.048866,0.666667,0.969148,1.0,0.0,1.0,0.558664,0.790123
1,DRC01,OUT018,0.16967,0.666667,0.058705,0.25,0.202234,1.0,1.0,0.0,0.0,0.433414
2,FDN15,OUT049,0.871279,1.0,0.051037,0.166667,0.704368,1.0,0.0,1.0,0.558664,0.693514
3,FDX07,OUT010,0.931292,0.666667,0.0,1.0,0.821649,0.666667,1.0,0.666667,0.598856,0.517406
4,NCD19,OUT013,0.435771,1.0,0.0,0.833333,0.253374,0.0,1.0,1.0,0.949164,0.568651


In [30]:
testing_data = pd.concat([test[['Item_Identifier' , 'Outlet_Identifier']].reset_index(drop=True),
                          pd.DataFrame((scaled_test) , columns = ftr) ], axis = 1)


In [31]:
testing_data.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,years
0,FDW58,OUT049,0.981548,1.0,0.023036,1.083333,0.577344,1.0,0.0,1.0,0.558664
1,FDW14,OUT017,0.388413,0.0,0.117018,0.75,0.478779,0.666667,0.5,1.0,0.144329
2,NCN55,OUT010,0.753998,1.0,0.303221,0.25,0.953856,0.666667,1.0,0.666667,0.598856
3,FDQ58,OUT017,0.306637,1.0,0.04686,1.083333,0.746593,0.666667,0.5,1.0,0.144329
4,FDY38,OUT027,0.658632,0.666667,0.361153,0.75,0.939107,1.0,1.0,0.333333,1.0


In [32]:
training_data.to_csv('train.csv' , index = False)
testing_data.to_csv('test.csv' , index = False)

# Feature Selection 

In [33]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [34]:
y_train = training_data[['Item_Outlet_Sales']]
x_train = training_data[[col for col in data.columns if col not in ['Item_Identifier' , 'Outlet_Identifier' , 'Item_Outlet_Sales']]]

In [35]:
x_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,years
0,0.462051,1.0,0.048866,0.666667,0.969148,1.0,0.0,1.0,0.558664
1,0.16967,0.666667,0.058705,0.25,0.202234,1.0,1.0,0.0,0.0
2,0.871279,1.0,0.051037,0.166667,0.704368,1.0,0.0,1.0,0.558664
3,0.931292,0.666667,0.0,1.0,0.821649,0.666667,1.0,0.666667,0.598856
4,0.435771,1.0,0.0,0.833333,0.253374,0.0,1.0,1.0,0.949164


In [36]:
feature_model = SelectFromModel(Lasso(alpha=0.005, random_state = 42))
feature_model.fit(x_train , y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [37]:
selected_feat = x_train.columns[(feature_model.get_support())]
selected_feat

Index(['Item_Visibility', 'Item_MRP', 'Outlet_Size'], dtype='object')

In [38]:
# hence the variable selected_feat contains the features of significance

In [39]:
# at very first we apply different algorithms to our preprocessed training data

In [40]:
from sklearn.model_selection import train_test_split
x_tr , x_vl , y_tr , y_vl = train_test_split(x_train , y_train , test_size = 0.2)

In [41]:
x_tr.shape , y_tr.shape

((6818, 9), (6818, 1))

In [42]:
from sklearn.metrics import mean_squared_error

In [43]:
models = []

In [44]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
models.append(lin_reg)

In [45]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
models.append(rf_reg)

In [46]:
from xgboost import XGBRegressor
xgb_reg = XGBRegressor()
models.append(xgb_reg)

In [47]:
from sklearn.linear_model import BayesianRidge
ridge_reg = BayesianRidge()
models.append(ridge_reg)

In [48]:
for model in models:
    model.fit(x_tr , y_tr)
    score_tr = model.score(x_tr , y_tr)
    print('---------------------------------------------------------------------------')
    print('model fitted successfully ,.. model score is' , score_tr )
    y_pred = model.predict(x_vl)
    mse = mean_squared_error(y_vl , y_pred)
    print( model.score(x_vl , y_vl) , ' is the validation set score ')
    print('mse is' , mse)
    print('---------------------------------------------------------------------------')

---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.3720483205671793
0.3811157806578277  is the validation set score 
mse is 0.01892367118001545
---------------------------------------------------------------------------


  


---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.9433346586999486
0.6990184942580786  is the validation set score 
mse is 0.009203135041930993
---------------------------------------------------------------------------
---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.7506648359907002
0.7482031180603634  is the validation set score 
mse is 0.007699212953019883
---------------------------------------------------------------------------
---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.3720462342037667
0.3811092811731752  is the validation set score 
mse is 0.018923869915266015
---------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


In [49]:
X = training_data[selected_feat]
y = training_data[['Item_Outlet_Sales']]

In [50]:
from sklearn.model_selection import train_test_split
x , x_val , y , y_val = train_test_split(X , y , test_size = 0.2)

In [51]:
new_models = []

In [52]:
from sklearn.linear_model import LinearRegression
lin_reg2 = LinearRegression()
new_models.append(lin_reg)

In [53]:
from sklearn.ensemble import RandomForestRegressor
rf_reg2 = RandomForestRegressor()
new_models.append(rf_reg)

In [54]:
from xgboost import XGBRegressor
xgb_reg2 = XGBRegressor()
new_models.append(xgb_reg)

In [55]:
from sklearn.linear_model import BayesianRidge
ridge_reg2 = BayesianRidge()
new_models.append(ridge_reg)

In [56]:
for model in new_models:
    model.fit(x , y)
    score_tr = model.score(x , y)
    print('---------------------------------------------------------------------------')
    print('model fitted successfully ,.. model score is' , score_tr )
    y_pre = model.predict(x_val)
    mse = mean_squared_error(y_val , y_pre)
    print( model.score(x_val , y_val) , ' is the validation set score ')
    print('mse is' , mse)
    print('---------------------------------------------------------------------------')

---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.3393283111789361
0.29640186879532726  is the validation set score 
mse is 0.019999985391410575
---------------------------------------------------------------------------


  


---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.877337366094608
0.2421724004161313  is the validation set score 
mse is 0.021541474101039314
---------------------------------------------------------------------------
---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.4462204703397035
0.35488675404691455  is the validation set score 
mse is 0.018337535196087627
---------------------------------------------------------------------------
---------------------------------------------------------------------------
model fitted successfully ,.. model score is 0.33932802262194095
0.29645362141696086  is the validation set score 
mse is 0.019998514307803708
---------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)
