# Featuretools

## Importing and loading data

### Featuretools Installation

Before running this notebook, you need to install featuretools in your environment, for example with the **`pip install featuretools`** command.

In [1]:
# Install featuretools
!pip install featuretools

Collecting featuretools
  Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)
Collecting cloudpickle>=1.5.0 (from featuretools)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting holidays>=0.17 (from featuretools)
  Downloading holidays-0.52-py3-none-any.whl.metadata (23 kB)
Collecting woodwork>=0.28.0 (from featuretools)
  Downloading woodwork-0.31.0-py3-none-any.whl.metadata (10 kB)
Collecting importlib-resources>=5.10.0 (from woodwork>=0.28.0->featuretools)
  Downloading importlib_resources-6.4.0-py3-none-any.whl.metadata (3.9 kB)
Downloading featuretools-1.31.0-py3-none-any.whl (587 kB)
   ---------------------------------------- 0.0/587.9 kB ? eta -:--:--
   --------------------------------------  583.7/587.9 kB 17.9 MB/s eta 0:00:01
   ---------------------------------------- 587.9/587.9 kB 9.2 MB/s eta 0:00:00
Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Downloading holidays-0.52-py3-none-any.whl (1.0 MB)
   --------------------


[notice] A new release of pip is available: 24.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import featuretools as ft

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

print(pd.__version__)
print(np.__version__)

2.1.1
1.26.1


In [3]:
# Read the training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='datasets/train_bm.csv')

# Preview the first five rows
data.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Separate the target column (y) from the predictor columns (X)
y = data['Item_Outlet_Sales']
X = data.drop(columns=['Item_Outlet_Sales'])

# Report the dimensions of both pieces
print(X.shape, y.shape)

(8523, 11) (8523,)


## Entityset and its features

### Creating an entity set

In [5]:
# Start from an empty EntitySet identified as 'sales_data'
es = ft.EntitySet(id='sales_data')

# Show its (currently empty) contents
print(es)

Entityset: sales_data
  DataFrames:
  Relationships:
    No relationships


### Add data to the created entityset

In [6]:
# Register the predictors X in the entity set under the name 'data'.
# make_index=True asks featuretools to create a new column called 'index'
# and use it as the dataframe's index.
es.add_dataframe(
    dataframe_name='data',
    dataframe=X,
    index='index',
    make_index=True,
)

# Show the entity set now that it holds one dataframe
print(es)

Entityset: sales_data
  DataFrames:
    data [Rows: 8523, Columns: 12]
  Relationships:
    No relationships


### Feature Engineering 

In [9]:
# Deep feature synthesis at depth 1 on the 'data' dataframe, using the
# add_numeric and multiply_numeric transform primitives
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='data',
    trans_primitives=['add_numeric', 'multiply_numeric'],
    max_depth=1,
)

# Preview the generated feature matrix
feature_matrix.head()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,...,Item_MRP + Outlet_Establishment_Year,Item_Visibility + Item_Weight,Item_Visibility + Outlet_Establishment_Year,Item_Weight + Outlet_Establishment_Year,Item_MRP * Item_Visibility,Item_MRP * Item_Weight,Item_MRP * Outlet_Establishment_Year,Item_Visibility * Item_Weight,Item_Visibility * Outlet_Establishment_Year,Item_Weight * Outlet_Establishment_Year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,...,2248.8092,9.316047,1999.016047,2008.3,4.008763,2323.22556,499368.5908,0.14924,32.078555,18590.7
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,...,2057.2692,5.939278,2009.019278,2014.92,0.930544,285.753664,96972.8228,0.114127,38.729936,11893.28
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,...,2140.618,17.51676,1999.01676,2016.5,2.373528,2478.315,283094.382,0.293301,33.50339,34982.5
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,...,2180.095,19.2,1998.0,2017.2,0.0,3496.224,363825.81,0.0,0.0,38361.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,...,2040.8614,8.93,1987.0,1995.93,0.0,480.982302,107022.6018,0.0,0.0,17743.91


In [11]:
# Print the generated feature definitions and the feature-matrix dimensions,
# each on its own labelled line
for label, value in (('Definitions:', feature_defs), ('Shape:', feature_matrix.shape)):
    print(label, value)

Definitions: [<Feature: Item_Identifier>, <Feature: Item_Weight>, <Feature: Item_Fat_Content>, <Feature: Item_Visibility>, <Feature: Item_Type>, <Feature: Item_MRP>, <Feature: Outlet_Identifier>, <Feature: Outlet_Establishment_Year>, <Feature: Outlet_Size>, <Feature: Outlet_Location_Type>, <Feature: Outlet_Type>, <Feature: Item_MRP + Item_Visibility>, <Feature: Item_MRP + Item_Weight>, <Feature: Item_MRP + Outlet_Establishment_Year>, <Feature: Item_Visibility + Item_Weight>, <Feature: Item_Visibility + Outlet_Establishment_Year>, <Feature: Item_Weight + Outlet_Establishment_Year>, <Feature: Item_MRP * Item_Visibility>, <Feature: Item_MRP * Item_Weight>, <Feature: Item_MRP * Outlet_Establishment_Year>, <Feature: Item_Visibility * Item_Weight>, <Feature: Item_Visibility * Outlet_Establishment_Year>, <Feature: Item_Weight * Outlet_Establishment_Year>]
Shape: (8523, 23)


In [12]:
# Deep feature synthesis again with the same primitives, this time with
# max_depth=2 (overwrites feature_matrix and feature_defs from the depth-1 run)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='data',
    trans_primitives=['add_numeric', 'multiply_numeric'],
    max_depth=2,
)

# Preview the generated feature matrix
feature_matrix.head()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,...,Item_MRP + Outlet_Establishment_Year,Item_Visibility + Item_Weight,Item_Visibility + Outlet_Establishment_Year,Item_Weight + Outlet_Establishment_Year,Item_MRP * Item_Visibility,Item_MRP * Item_Weight,Item_MRP * Outlet_Establishment_Year,Item_Visibility * Item_Weight,Item_Visibility * Outlet_Establishment_Year,Item_Weight * Outlet_Establishment_Year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,...,2248.8092,9.316047,1999.016047,2008.3,4.008763,2323.22556,499368.5908,0.14924,32.078555,18590.7
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,...,2057.2692,5.939278,2009.019278,2014.92,0.930544,285.753664,96972.8228,0.114127,38.729936,11893.28
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,...,2140.618,17.51676,1999.01676,2016.5,2.373528,2478.315,283094.382,0.293301,33.50339,34982.5
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,...,2180.095,19.2,1998.0,2017.2,0.0,3496.224,363825.81,0.0,0.0,38361.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,...,2040.8614,8.93,1987.0,1995.93,0.0,480.982302,107022.6018,0.0,0.0,17743.91


In [10]:
# Print the generated feature definitions and the feature-matrix dimensions,
# each on its own labelled line
for label, value in (('Definitions:', feature_defs), ('Shape:', feature_matrix.shape)):
    print(label, value)

[<Feature: Item_Identifier>,
 <Feature: Item_Weight>,
 <Feature: Item_Fat_Content>,
 <Feature: Item_Visibility>,
 <Feature: Item_Type>,
 <Feature: Item_MRP>,
 <Feature: Outlet_Identifier>,
 <Feature: Outlet_Establishment_Year>,
 <Feature: Outlet_Size>,
 <Feature: Outlet_Location_Type>,
 <Feature: Outlet_Type>,
 <Feature: Item_MRP + Item_Visibility>,
 <Feature: Item_Weight + Outlet_Establishment_Year>,
 <Feature: Item_Visibility + Outlet_Establishment_Year>,
 <Feature: Item_MRP + Item_Weight>,
 <Feature: Item_MRP + Outlet_Establishment_Year>,
 <Feature: Item_Visibility + Item_Weight>,
 <Feature: Item_Visibility * Outlet_Establishment_Year>,
 <Feature: Item_MRP * Item_Visibility>,
 <Feature: Item_Visibility * Item_Weight>,
 <Feature: Item_Weight * Outlet_Establishment_Year>,
 <Feature: Item_MRP * Item_Weight>,
 <Feature: Item_MRP * Outlet_Establishment_Year>,
 <Feature: Item_MRP * Item_MRP + Item_Weight>,
 <Feature: Item_MRP + Outlet_Establishment_Year * Outlet_Establishment_Year>,
 <Fea