# IESB - CIA035 - Aula 07 - Ensemble de Modelos

## Dados
https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/big-mart-sales-prediction-datasets/sample_submission.csv
/kaggle/input/big-mart-sales-prediction-datasets/train.csv
/kaggle/input/big-mart-sales-prediction-datasets/test.csv


In [2]:
# Load the train and test CSV files into DataFrames (same paths, read in the same order)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/big-mart-sales-prediction-datasets/train.csv',
        '/kaggle/input/big-mart-sales-prediction-datasets/test.csv',
    )
)

# Display (rows, columns) of both frames
(train.shape, test.shape)

((8523, 12), (5681, 11))

In [3]:
# Inspect the column dtypes and non-null counts of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
# Count the distinct values in each column
train.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [5]:
# Count the missing values in each column
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# Summary statistics of the target variable
train['Item_Outlet_Sales'].describe()

count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64

## Tratamento de Dados

In [7]:
# Item_Fat_Content has more distinct labels than it should; list them
train['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [8]:
# Dictionary that maps every spelling variant of the fat-content label
# onto one of two canonical values: 'low' or 'regular'.
item_fat = {'Low Fat':'low', 'Regular':'regular', 'LF':'low', 'reg':'regular','low fat':'low'}

# Use replace() rather than map(): map() turns every value that is not a key of
# the dictionary into NaN, so re-running this cell (when the column already holds
# 'low'/'regular') would wipe the whole column. replace() leaves values that are
# not keys untouched, which makes the cell safe to re-run.
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(item_fat)

# Check the result: only the canonical labels should remain
train['Item_Fat_Content'].unique()

array(['low', 'regular'], dtype=object)

In [9]:
# Inspect the distinct values of Outlet_Size
train['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [10]:
# Inspect the distinct values of Outlet_Location_Type
train['Outlet_Location_Type'].unique()

array(['Tier 1', 'Tier 3', 'Tier 2'], dtype=object)

In [11]:
# Inspect the distinct values of Outlet_Type
train['Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [12]:
# Inspect the distinct values of Item_Type
train['Item_Type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [13]:
# Inspect the distinct values of Outlet_Identifier
train['Outlet_Identifier'].unique()

array(['OUT049', 'OUT018', 'OUT010', 'OUT013', 'OUT027', 'OUT045',
       'OUT017', 'OUT046', 'OUT035', 'OUT019'], dtype=object)

In [14]:
# Number of distinct Item_Identifier values
train['Item_Identifier'].nunique()

1559

In [15]:
# The 'Item_Weight' and 'Outlet_Size' columns contain missing values.
#
# The filled column is assigned back instead of calling
# `train[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which raises a FutureWarning on pandas 2.x and no longer
# updates `train` once Copy-on-Write is enabled (the default in pandas 3).
#
# NOTE(review): the mean and mode are computed on the full frame, before the
# train/validation split performed in a later cell, so validation rows
# influence the imputed values.

# Impute Item_Weight with the column mean
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

# Impute Outlet_Size with the column mode (most frequent value)
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

In [16]:
# The categorical variables still need to be encoded; show the current dtypes
train.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [17]:
# Categorical columns to expand with one-hot encoding
cat_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

# Replace each categorical column with one indicator column per category
train = pd.get_dummies(data=train, columns=cat_cols)

# Shape after encoding
train.shape

(8523, 44)

## Modelo inicial

In [18]:
# Inspect the columns of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 44 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Item_Identifier                  8523 non-null   object 
 1   Item_Weight                      8523 non-null   float64
 2   Item_Visibility                  8523 non-null   float64
 3   Item_MRP                         8523 non-null   float64
 4   Outlet_Establishment_Year        8523 non-null   int64  
 5   Item_Outlet_Sales                8523 non-null   float64
 6   Item_Fat_Content_low             8523 non-null   uint8  
 7   Item_Fat_Content_regular         8523 non-null   uint8  
 8   Item_Type_Baking Goods           8523 non-null   uint8  
 9   Item_Type_Breads                 8523 non-null   uint8  
 10  Item_Type_Breakfast              8523 non-null   uint8  
 11  Item_Type_Canned                 8523 non-null   uint8  
 12  Item_Type_Dairy     

In [19]:
# Split the frame into a training part and a validation part
from sklearn.model_selection import train_test_split

# test_size=0.25 is the library default, written out here for readability.
# NOTE(review): `train` is overwritten, so re-running this cell splits the
# already-reduced frame again; restart the kernel before re-executing.
train, valid = train_test_split(
    train,
    test_size=0.25,
    random_state=42,
)

# Shapes of the two partitions
(train.shape, valid.shape)

((6392, 44), (2131, 44))

In [20]:
# Every column is a model input except the item id and the target
non_feature_cols = ('Item_Identifier', 'Item_Outlet_Sales')
features = [col for col in train.columns if col not in non_feature_cols]

# Show the resulting feature list
features

['Item_Weight',
 'Item_Visibility',
 'Item_MRP',
 'Outlet_Establishment_Year',
 'Item_Fat_Content_low',
 'Item_Fat_Content_regular',
 'Item_Type_Baking Goods',
 'Item_Type_Breads',
 'Item_Type_Breakfast',
 'Item_Type_Canned',
 'Item_Type_Dairy',
 'Item_Type_Frozen Foods',
 'Item_Type_Fruits and Vegetables',
 'Item_Type_Hard Drinks',
 'Item_Type_Health and Hygiene',
 'Item_Type_Household',
 'Item_Type_Meat',
 'Item_Type_Others',
 'Item_Type_Seafood',
 'Item_Type_Snack Foods',
 'Item_Type_Soft Drinks',
 'Item_Type_Starchy Foods',
 'Outlet_Identifier_OUT010',
 'Outlet_Identifier_OUT013',
 'Outlet_Identifier_OUT017',
 'Outlet_Identifier_OUT018',
 'Outlet_Identifier_OUT019',
 'Outlet_Identifier_OUT027',
 'Outlet_Identifier_OUT035',
 'Outlet_Identifier_OUT045',
 'Outlet_Identifier_OUT046',
 'Outlet_Identifier_OUT049',
 'Outlet_Size_High',
 'Outlet_Size_Medium',
 'Outlet_Size_Small',
 'Outlet_Location_Type_Tier 1',
 'Outlet_Location_Type_Tier 2',
 'Outlet_Location_Type_Tier 3',
 'Outlet_Type_

In [21]:
# Baseline RandomForest: 200 trees, all other hyperparameters left at their defaults
from sklearn.ensemble import RandomForestRegressor

rf_padrao = RandomForestRegressor(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training partition; the fitted estimator is the cell output
rf_padrao.fit(X=train[features], y=train['Item_Outlet_Sales'])

RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)

In [22]:
# Predict the target for the validation rows with the fitted RandomForest
preds_rf = rf_padrao.predict(valid.loc[:, features])

In [23]:
# Compute the RMSE of the RandomForest predictions on the validation set
from sklearn.metrics import mean_squared_error

# Take the square root explicitly: the `squared=False` argument is deprecated
# since scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse_rf = float(np.sqrt(mean_squared_error(valid['Item_Outlet_Sales'], preds_rf)))

rmse_rf

1103.0781815728149

## Outros Modelos

In [24]:
# Baseline gradient boosting model: 200 estimators, other hyperparameters at their defaults
from sklearn.ensemble import GradientBoostingRegressor

# Build and fit the model on the training partition
gbm_padrao = GradientBoostingRegressor(n_estimators=200, random_state=42)

gbm_padrao.fit(train[features], train['Item_Outlet_Sales'])

# Predict the target for the validation rows
preds_gbm = gbm_padrao.predict(valid[features])

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6.
rmse_gbm = float(np.sqrt(mean_squared_error(valid['Item_Outlet_Sales'], preds_gbm)))

rmse_gbm

1064.9487042895971

In [25]:
# Baseline AdaBoost model: 200 estimators, other hyperparameters at their defaults
from sklearn.ensemble import AdaBoostRegressor

# Build and fit the model on the training partition
ada_padrao = AdaBoostRegressor(n_estimators=200, random_state=42)

ada_padrao.fit(train[features], train['Item_Outlet_Sales'])

# Predict the target for the validation rows
preds_ada = ada_padrao.predict(valid[features])

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6.
rmse_ada = float(np.sqrt(mean_squared_error(valid['Item_Outlet_Sales'], preds_ada)))

rmse_ada

1164.380592406447

## Criando nosso próprio Ensemble de Métodos

In [26]:
# The ensemble is built with scikit-learn's VotingRegressor
from sklearn.ensemble import VotingRegressor

In [27]:
# (name, estimator) pairs that make up the ensemble
# NOTE(review): per the scikit-learn docs, VotingRegressor.fit trains clones of
# these estimators rather than reusing the already-fitted objects — confirm.
estimators = [('rf_padrao', rf_padrao),('gbm_padrao',gbm_padrao),('ada_padrao', ada_padrao)]

# Build the VotingRegressor over the three base models
ensemble1 = VotingRegressor(estimators=estimators, n_jobs=-1)

ensemble1.fit(train[features], train['Item_Outlet_Sales'])

# Predict the target for the validation rows
preds_ens = ensemble1.predict(valid[features])

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6.
rmse_ens = float(np.sqrt(mean_squared_error(valid['Item_Outlet_Sales'], preds_ens)))

rmse_ens

1072.234851975407