In [35]:
# loading required libraries
import featuretools as ft
import numpy as np
import pandas as pd

train = pd.read_csv("../input/Train.csv")
test = pd.read_csv("../input/Test.csv")

In [36]:
# Data preperation

# saving identifiers
test_Item_Identifier = test['Item_Identifier']
test_Outlet_Identifier = test['Outlet_Identifier']

sales = train['Item_Outlet_Sales']
train.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [37]:
#combine the train and test set as it saves us the trouble of performing the same step(s) twice.

combi = train.append(test, ignore_index=True)

In [38]:
combi.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [39]:
combi['Outlet_Size'].value_counts()

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

In [40]:
combi.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [41]:
# imputing missing data
combi['Item_Weight'].fillna(combi['Item_Weight'].mean(), inplace = True)
combi['Outlet_Size'].fillna("missing", inplace = True)

In [42]:
combi.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [43]:
# Data processing

combi['Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [44]:
# dictionary to replace the categories
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 'low fat':0}

combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(fat_content_dict, regex=True)

# Feature Engineering using Featuretools

In [45]:
#perform automated feature engineering! 
#It is necessary to have a unique identifier feature in the dataset
# so first we will create a unique identifier

combi['id'] = combi['Item_Identifier'] + combi['Outlet_Identifier']

combi.drop(['Item_Identifier'], axis=1, inplace=True)



In [46]:
#creating an EntitySet. 
#An EntitySet is a structure that contains multiple dataframes and relationships between them. 


# creating and entity set 'es'
es = ft.EntitySet(id = 'sales')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = combi, index = 'id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [47]:
#Our data contains information at two levels—item level and outlet level.
#Featuretools offers a functionality to split a dataset into multiple tables. 
#so created a new table ‘outlet’ from the BigMart table based on the outlet ID Outlet_Identifier.

es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier', 
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [48]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


so we have two entitysets one named "bigmart" and the other "outlet"

In [49]:
#Now we will use Deep Feature Synthesis to create new features automatically.
#DFS uses Feature Primitives to create features using multiple tables present in the EntitySet.

feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'bigmart', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 37 features
EntitySet scattered to workers in 1.091 seconds
Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


target_entity is nothing but the entity ID for which we wish to create new features 

(in this case, it is the entity ‘bigmart’).

The parameter max_depth controls the complexity of the features being generated by stacking the primitives. 

The parameter n_jobs helps in parallel feature computation by using multiple cores.

In [50]:
#It has generated a bunch of new features on its own.

#Let’s have a look at these newly created features.
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.SUM(bigmart.Item_Weight)',
       'outlet.SUM(bigmart.Item_Fat_Content)',
       'outlet.SUM(bigmart.Item_Visibility)', 'outlet.SUM(bigmart.Item_MRP)',
       'outlet.STD(bigmart.Item_Weight)',
       'outlet.STD(bigmart.Item_Fat_Content)',
       'outlet.STD(bigmart.Item_Visibility)', 'outlet.STD(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MAX(bigmart.Item_Fat_Content)',
       'outlet.MAX(bigmart.Item_Visibility)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.SKEW(bigmart.Item_Weight)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_Visibility)', 'outlet.SKEW(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)',
       

In [51]:
feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,outlet.SUM(bigmart.Item_Weight),outlet.SUM(bigmart.Item_Fat_Content),outlet.SUM(bigmart.Item_Visibility),outlet.SUM(bigmart.Item_MRP),outlet.STD(bigmart.Item_Weight),outlet.STD(bigmart.Item_Fat_Content),outlet.STD(bigmart.Item_Visibility),outlet.STD(bigmart.Item_MRP),outlet.MAX(bigmart.Item_Weight),outlet.MAX(bigmart.Item_Fat_Content),outlet.MAX(bigmart.Item_Visibility),outlet.MAX(bigmart.Item_MRP),outlet.SKEW(bigmart.Item_Weight),outlet.SKEW(bigmart.Item_Fat_Content),outlet.SKEW(bigmart.Item_Visibility),outlet.SKEW(bigmart.Item_MRP),outlet.MIN(bigmart.Item_Weight),outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
DRA12OUT010,11.6,0,0.068535,Soft Drinks,143.0154,OUT010,1998,missing,Tier 3,Grocery Store,11768.655,330,94.293418,130572.7618,4.67507,0.479301,0.073604,62.010835,21.35,1,0.313935,266.6884,0.112759,0.599012,0.776902,0.104693,4.61,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
DRA12OUT013,11.6,0,0.040912,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,19859.98,549,93.555174,219172.4492,4.650214,0.478213,0.044005,62.140848,21.35,1,0.185913,266.6884,0.104392,0.613449,0.759033,0.130888,4.555,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables
DRA12OUT017,11.6,0,0.041178,Soft Drinks,140.3154,OUT017,2007,missing,Tier 2,Supermarket Type1,19722.75,544,94.34221,217561.35,4.655234,0.477922,0.044152,62.295513,21.35,1,0.18862,266.8884,0.106563,0.617805,0.774783,0.12995,4.555,0,0.0,32.09,12.78208,0.35256,0.061142,140.998931,1543,16,Snack Foods
DRA12OUT018,11.6,0,0.041113,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,19794.425,547,92.723425,217987.3906,4.650874,0.478308,0.044489,62.022851,21.35,1,0.188323,266.3226,0.102602,0.612046,0.783017,0.133528,4.555,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
DRA12OUT027,12.792854,0,0.040748,Soft Drinks,140.0154,OUT027,1985,Medium,Tier 3,Supermarket Type3,19944.059742,551,94.075671,219838.2488,0.0,0.478189,0.044228,62.05966,12.792854,1,0.18665,265.2884,0.0,0.613802,0.774028,0.129638,12.792854,0,0.0,31.29,12.792854,0.353432,0.060344,141.012347,1559,16,Fruits and Vegetables


In [52]:
#There is one issue with this dataframe – it is not sorted properly. 
#So We will have to sort it based on the id variable from the combi dataframe.

feature_matrix = feature_matrix.reindex(index=combi['id'])
feature_matrix = feature_matrix.reset_index()


#Now the dataframe feature_matrix will be in proper order.

In [53]:
#let's check
feature_matrix.head()

Unnamed: 0,id,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,outlet.SUM(bigmart.Item_Weight),outlet.SUM(bigmart.Item_Fat_Content),outlet.SUM(bigmart.Item_Visibility),outlet.SUM(bigmart.Item_MRP),outlet.STD(bigmart.Item_Weight),outlet.STD(bigmart.Item_Fat_Content),outlet.STD(bigmart.Item_Visibility),outlet.STD(bigmart.Item_MRP),outlet.MAX(bigmart.Item_Weight),outlet.MAX(bigmart.Item_Fat_Content),outlet.MAX(bigmart.Item_Visibility),outlet.MAX(bigmart.Item_MRP),outlet.SKEW(bigmart.Item_Weight),outlet.SKEW(bigmart.Item_Fat_Content),outlet.SKEW(bigmart.Item_Visibility),outlet.SKEW(bigmart.Item_MRP),outlet.MIN(bigmart.Item_Weight),outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
0,FDA15OUT049,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,19844.655,547,91.450099,218802.9588,4.650796,0.478027,0.043924,62.144594,21.35,1,0.18785,266.4884,0.099024,0.616228,0.790782,0.126294,4.555,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
1,DRC01OUT018,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,19794.425,547,92.723425,217987.3906,4.650874,0.478308,0.044489,62.022851,21.35,1,0.188323,266.3226,0.102602,0.612046,0.783017,0.133528,4.555,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
2,FDN15OUT049,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,19844.655,547,91.450099,218802.9588,4.650796,0.478027,0.043924,62.144594,21.35,1,0.18785,266.4884,0.099024,0.616228,0.790782,0.126294,4.555,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
3,FDX07OUT010,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,11768.655,330,94.293418,130572.7618,4.67507,0.479301,0.073604,62.010835,21.35,1,0.313935,266.6884,0.112759,0.599012,0.776902,0.104693,4.61,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
4,NCD19OUT013,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,19859.98,549,93.555174,219172.4492,4.650214,0.478213,0.044005,62.140848,21.35,1,0.185913,266.6884,0.104392,0.613449,0.759033,0.130888,4.555,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables


# Model building

In [54]:
# using cataboost algorithm

from catboost import CatBoostRegressor

#atBoost requires all the categorical variables to be in the string format. 
#so, we will convert the categorical variables in our data to string first:

categorical_features = np.where(feature_matrix.dtypes == 'object')[0]

for i in categorical_features:
    feature_matrix.iloc[:,i] = feature_matrix.iloc[:,i].astype('str')

In [55]:
#Let’s split feature_matrix back into train and test sets.

feature_matrix.drop(['id'], axis=1, inplace=True)
train = feature_matrix[:8523]
test = feature_matrix[8523:]

In [56]:
# removing uneccesary variables
train.drop(['Outlet_Identifier'], axis=1, inplace=True)
test.drop(['Outlet_Identifier'], axis=1, inplace=True)

In [57]:
# identifying categorical features
categorical_features = np.where(train.dtypes == 'object')[0]

In [58]:
#Now splitting the train data into training and validation set to check the model’s performance locally.

from sklearn.model_selection import train_test_split

# splitting train data into training and validation set
xtrain, xvalid, ytrain, yvalid = train_test_split(train, sales, test_size=0.25, random_state=11)

In [59]:
#we can now train our model. The evaluation metric we will use is RMSE (Root Mean Squared Error).

model_cat = CatBoostRegressor(iterations=100, learning_rate=0.3, depth=6, eval_metric='RMSE', random_seed=7)

# training model
model_cat.fit(xtrain, ytrain, cat_features=categorical_features, use_best_model=True)

You should provide test set for use best model. use_best_model parameter swiched to false value.
0:	learn: 2136.8880891	total: 6.48ms	remaining: 641ms
1:	learn: 1703.1661771	total: 178ms	remaining: 8.73s
2:	learn: 1431.2867924	total: 281ms	remaining: 9.1s
3:	learn: 1281.6678532	total: 383ms	remaining: 9.19s
4:	learn: 1192.1684008	total: 497ms	remaining: 9.45s
5:	learn: 1136.4815192	total: 679ms	remaining: 10.6s
6:	learn: 1108.4994983	total: 782ms	remaining: 10.4s
7:	learn: 1090.4404507	total: 879ms	remaining: 10.1s
8:	learn: 1083.3918071	total: 973ms	remaining: 9.84s
9:	learn: 1078.0916303	total: 1.08s	remaining: 9.7s
10:	learn: 1075.1552515	total: 1.18s	remaining: 9.57s
11:	learn: 1072.8882621	total: 1.28s	remaining: 9.37s
12:	learn: 1071.4932371	total: 1.38s	remaining: 9.23s
13:	learn: 1071.2754705	total: 1.48s	remaining: 9.12s
14:	learn: 1070.5455577	total: 1.57s	remaining: 8.9s
15:	learn: 1068.8250832	total: 1.67s	remaining: 8.78s
16:	learn: 1068.0881381	total: 1.77s	remaining: 8.6

<catboost.core.CatBoostRegressor at 0x7f1f601a94e0>

In [60]:
# validation score
model_cat.score(xvalid, yvalid)

1093.6992804410283