# Modelo: Random Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
def calc_metricas(obs, pred):
    """Compute the regression metrics reported in this notebook.

    Parameters
    ----------
    obs : array-like
        Observed target values.
    pred : array-like
        Predicted values, aligned with ``obs``.

    Returns
    -------
    list
        ``[r2, mape, mae, rmse, msle]``. R², MAPE and MSLE are rounded to
        3 decimals, MAE and RMSE to 2. MAPE is expressed in percent and RMSE
        is the square root of the mean squared error.

    Raises
    ------
    Whatever the underlying ``sklearn.metrics`` functions raise for the
    given inputs.
    """
    # The built-in round() works for both NumPy scalars and plain Python
    # floats, whereas the ``.round`` method exists only on the former and
    # raises AttributeError when a metric comes back as a Python float.
    r2 = round(metrics.r2_score(obs, pred), 3)
    mape = round(100 * metrics.mean_absolute_percentage_error(obs, pred), 3)
    mae = round(metrics.mean_absolute_error(obs, pred), 2)
    rmse = round(metrics.mean_squared_error(obs, pred) ** 0.5, 2)
    msle = round(metrics.mean_squared_log_error(obs, pred), 3)

    return [r2, mape, mae, rmse, msle]

def print_avaliacao(obs, pred):
    """Print R², MAPE, MAE, RMSE and MSLE of ``pred`` against ``obs``.

    Parameters
    ----------
    obs : array-like
        Observed target values.
    pred : array-like
        Predicted values, aligned with ``obs``.

    Returns
    -------
    None
        One line per metric is written to standard output.
    """
    # Each entry pairs an output format with a deferred computation, so every
    # metric is evaluated right before its own line is printed.
    linhas = (
        ('R² = %.3f', lambda: metrics.r2_score(obs, pred)),
        ('MAPE = %.3f %%', lambda: 100 * metrics.mean_absolute_percentage_error(obs, pred)),
        ('MAE = U$S %.2f', lambda: metrics.mean_absolute_error(obs, pred)),
        # RMSE is obtained as the square root of the mean squared error.
        ('RMSE = U$S %.2f', lambda: metrics.mean_squared_error(obs, pred) ** 0.5),
        ('MSLE = %.3f', lambda: metrics.mean_squared_log_error(obs, pred)),
    )
    for formato, calcular in linhas:
        print(formato % calcular())


In [3]:
# Load the pre-processed train and test splits from disk.
caminho_treino = '../data/processed/train_data.csv'
caminho_teste = '../data/processed/test_data.csv'

dados_treino = pd.read_csv(caminho_treino)
dados_teste = pd.read_csv(caminho_teste)

In [4]:
# One LabelEncoder per categorical column. Each encoder keeps its own name so
# that the classes it learns remain accessible after this cell.
le_cat1, le_cat2, le_cat3, le_brand = (LabelEncoder() for _ in range(4))

codificadores = {
    'category_1': le_cat1,
    'category_2': le_cat2,
    'category_3': le_cat3,
    'brand_name': le_brand,
}

# fit_transform() learns the classes from the training column and returns the
# integer codes, which replace the original labels in dados_treino.
for coluna, codificador in codificadores.items():
    dados_treino[coluna] = codificador.fit_transform(dados_treino[coluna])

In [5]:
def _mapa_classes(codificador):
    """Return a ``{label: code}`` dict built from a fitted encoder's classes."""
    classes = codificador.classes_
    codigos = codificador.transform(classes)
    return dict(zip(classes, codigos))


# Label -> integer code lookups, one per encoded column.
dic_cat1, dic_cat2, dic_cat3, dic_brand = (
    _mapa_classes(codificador)
    for codificador in (le_cat1, le_cat2, le_cat3, le_brand)
)


In [6]:
# Read the test split again so this cell always starts from the raw labels.
dados_teste = pd.read_csv('../data/processed/test_data.csv')

mapas_por_coluna = {
    'category_1': dic_cat1,
    'category_2': dic_cat2,
    'category_3': dic_cat3,
    'brand_name': dic_brand,
}

for coluna, mapa in mapas_por_coluna.items():
    # Labels missing from the training-derived mapping come out of map() as
    # NaN and are coded as -1.
    codigos = dados_teste[coluna].map(mapa)
    dados_teste[coluna] = codigos.fillna(-1)

## Modelo geral

In [7]:
# Features used by the single model trained on all categories at once.
colunas_modelo = [
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]

X_treino = dados_treino[colunas_modelo]
y_treino = dados_treino['price']

# NOTE(review): warm_start=True looks like a no-op here, since the estimator
# is created and fitted once in this same cell — confirm it is intentional.
rfr = RandomForestRegressor(
    criterion='poisson',
    n_jobs=-1,
    random_state=101,
    warm_start=True,
)

rfr.fit(X_treino, y_treino)


Tempo para treinamento: 3m 3.1s

In [8]:
# Same feature columns, in the same order, as the ones the model was fitted on.
colunas_modelo = [
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]

previsoes = rfr.predict(dados_teste[colunas_modelo])

# Predictions below 3 are raised to 3 before being stored.
dados_teste['pred'] = np.maximum(previsoes, 3)

Tempo para previsão: 14.4s

Avaliação geral

In [9]:
# Overall metrics of the general model on the whole test split.
print_avaliacao(obs=dados_teste['price'], pred=dados_teste['pred'])

R² = 0.343
MAPE = 65.466 %
MAE = U$S 13.15
RMSE = U$S 31.09
MSLE = 0.348


Avaliação por categoria

In [10]:
# Metrics of the current dados_teste['pred'] column, split by top-level
# category. Columns follow the order in which the category codes first appear
# in the test split.
metricas_por_categoria = {}

for categoria in dados_teste['category_1'].unique():
    dados_cat = dados_teste[dados_teste['category_1'] == categoria]

    # The column label is recovered from the encoder instead of a hard-coded
    # list, so it cannot drift from the row order or the number of categories.
    codigo = int(categoria)
    if 0 <= codigo < len(le_cat1.classes_):
        nome = le_cat1.classes_[codigo]
    else:
        # Codes outside the encoder's range, such as the -1 assigned to
        # labels that were not present in the training split.
        nome = 'Unseen category'

    metricas_por_categoria[nome] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(metricas_por_categoria, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.341,0.461,0.246,0.41,0.072,0.234,0.115,0.169,0.206,0.181,-0.005
MAPE,57.748,59.186,95.545,90.763,64.931,65.272,81.856,65.61,75.15,72.415,93.258
MAE,12.97,15.69,17.68,21.3,9.85,10.38,12.22,12.25,13.85,10.18,15.9
RMSE,31.77,29.42,44.01,50.55,23.34,19.46,25.8,21.24,26.47,22.7,30.36
MSLE,0.298,0.31,0.552,0.503,0.344,0.342,0.449,0.356,0.419,0.384,0.576


## Modelo por categorias

In [11]:
# One model per top-level category. category_1 is constant within each subset,
# so it is left out of the features.
colunas_por_categoria = ['category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']

modelos = {}

for category in dados_treino['category_1'].unique():
    # Filter the rows of this category once and reuse the subset for both the
    # features and the target.
    dados_cat = dados_treino[dados_treino['category_1'] == category]

    model = RandomForestRegressor(n_jobs=-1, random_state=101, criterion='poisson', warm_start=True)
    # Category-local frames are passed directly, leaving the X_treino/y_treino
    # of the general model untouched.
    model.fit(dados_cat[colunas_por_categoria], dados_cat['price'])
    modelos[category] = model

Tempo para treino: 1m 14.9s

In [12]:
colunas_por_categoria = ['category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']
colunas_geral = ['category_1'] + colunas_por_categoria

# Start from a float column: the predictions are floats, and writing them into
# an integer column relies on an implicit dtype change.
dados_teste = dados_teste.assign(pred=np.nan)

for category in dados_teste['category_1'].unique():
    mascara = dados_teste['category_1'] == category

    if category in modelos:
        previsoes = modelos[category].predict(dados_teste.loc[mascara, colunas_por_categoria])
    else:
        # No dedicated model exists for this code (e.g. the -1 given to
        # categories absent from training): use the general model instead of
        # failing with a KeyError.
        previsoes = rfr.predict(dados_teste.loc[mascara, colunas_geral])

    dados_teste.loc[mascara, 'pred'] = previsoes

# Apply the same floor of 3 used for the general model's predictions, so both
# models are evaluated under identical post-processing.
dados_teste.loc[dados_teste['pred'] < 3, 'pred'] = 3

Tempo para previsão: 6.8s

Avaliação geral

In [13]:
# Overall metrics of the per-category models on the whole test split.
print_avaliacao(obs=dados_teste['price'], pred=dados_teste['pred'])

R² = 0.341
MAPE = 65.428 %
MAE = U$S 13.16
RMSE = U$S 31.14
MSLE = 0.348


Avaliação por categoria

In [14]:
# Metrics of the per-category predictions in dados_teste['pred'], split by
# top-level category. Columns follow the order in which the category codes
# first appear in the test split.
metricas_por_categoria = {}

for categoria in dados_teste['category_1'].unique():
    dados_cat = dados_teste[dados_teste['category_1'] == categoria]

    # The column label is recovered from the encoder instead of a hard-coded
    # list, so it cannot drift from the row order or the number of categories.
    codigo = int(categoria)
    if 0 <= codigo < len(le_cat1.classes_):
        nome = le_cat1.classes_[codigo]
    else:
        # Codes outside the encoder's range, such as the -1 assigned to
        # labels that were not present in the training split.
        nome = 'Unseen category'

    metricas_por_categoria[nome] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(metricas_por_categoria, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])
avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.337,0.457,0.251,0.41,0.072,0.232,0.113,0.161,0.214,0.189,-0.072
MAPE,57.764,59.389,95.262,90.495,65.016,65.067,81.335,65.523,75.018,72.207,95.863
MAE,12.98,15.76,17.68,21.28,9.86,10.38,12.19,12.26,13.81,10.17,16.22
RMSE,31.86,29.53,43.88,50.57,23.34,19.49,25.84,21.34,26.32,22.59,31.36
MSLE,0.299,0.312,0.552,0.502,0.344,0.342,0.447,0.357,0.419,0.383,0.592
