# Modelo: Random Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
def calc_metricas(obs, pred):
    """Compute the rounded regression metrics for one set of predictions.

    Parameters
    ----------
    obs : array-like
        Observed target values.
    pred : array-like
        Predicted values, aligned with ``obs``.

    Returns
    -------
    list of float
        ``[r2, mape, mae, rmse, msle]``: R² and MSLE rounded to 3 decimals,
        MAPE expressed in percent and rounded to 3 decimals, MAE and RMSE
        rounded to 2 decimals.
    """
    # Built-in round() over float() is used instead of the ``.round()`` method:
    # that method only exists on NumPy scalars, while scikit-learn documents
    # these metrics as returning a plain ``float``.
    r2 = round(float(metrics.r2_score(obs, pred)), 3)
    mape = round(100 * float(metrics.mean_absolute_percentage_error(obs, pred)), 3)
    mae = round(float(metrics.mean_absolute_error(obs, pred)), 2)
    # RMSE is derived as the square root of the mean squared error.
    rmse = round(float(metrics.mean_squared_error(obs, pred)) ** 0.5, 2)
    msle = round(float(metrics.mean_squared_log_error(obs, pred)), 3)

    return [r2, mape, mae, rmse, msle]

def print_avaliacao(obs, pred):
    """Print R², MAPE, MAE, RMSE and MSLE of ``pred`` against ``obs``, one per line."""
    # Each entry pairs an output template with the metric it reports. The
    # metrics are wrapped in lambdas so that every value is computed right
    # before its own line is printed.
    relatorio = (
        ('R² = %.3f', lambda: metrics.r2_score(obs, pred)),
        ('MAPE = %.3f %%', lambda: 100 * metrics.mean_absolute_percentage_error(obs, pred)),
        ('MAE = U$S %.2f', lambda: metrics.mean_absolute_error(obs, pred)),
        ('RMSE = U$S %.2f', lambda: metrics.mean_squared_error(obs, pred) ** 0.5),
        ('MSLE = %.3f', lambda: metrics.mean_squared_log_error(obs, pred)),
    )
    for formato, calcular in relatorio:
        print(formato % calcular())


In [3]:
# Read the processed train and test splits from disk.
caminho_treino = '../data/processed/train_data.csv'
caminho_teste = '../data/processed/test_data.csv'
dados_treino, dados_teste = (
    pd.read_csv(caminho) for caminho in (caminho_treino, caminho_teste)
)

In [4]:
# One independent encoder per categorical column.
le_cat1, le_cat2, le_cat3, le_brand = (LabelEncoder() for _ in range(4))

codificadores = {
    'category_1': le_cat1,
    'category_2': le_cat2,
    'category_3': le_cat3,
    'brand_name': le_brand,
}

# Fit each encoder on its training column and replace the column with the
# resulting integer codes.
for coluna, codificador in codificadores.items():
    dados_treino[coluna] = codificador.fit_transform(dados_treino[coluna])

In [5]:
# Build one {original label: encoded value} lookup per fitted encoder.
dic_cat1, dic_cat2, dic_cat3, dic_brand = (
    dict(zip(codificador.classes_, codificador.transform(codificador.classes_)))
    for codificador in (le_cat1, le_cat2, le_cat3, le_brand)
)


In [6]:
# Re-read the raw test split so this cell can be re-run safely: the mapping
# below expects the original (un-encoded) labels.
dados_teste = pd.read_csv('../data/processed/test_data.csv')

mapas_treino = {
    'category_1': dic_cat1,
    'category_2': dic_cat2,
    'category_3': dic_cat3,
    'brand_name': dic_brand,
}

for coluna, mapa in mapas_treino.items():
    # Labels that never occurred in the training split map to NaN and receive
    # the sentinel code -1. The NaN step upcasts the column to float, so the
    # dtype would otherwise depend on whether unseen labels happened to occur;
    # casting back to int64 makes every encoded column an integer column.
    dados_teste[coluna] = dados_teste[coluna].map(mapa).fillna(-1).astype('int64')

## Modelo geral

In [7]:
# Features used by the single, all-categories model.
colunas_modelo_geral = [
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]

X_treino = dados_treino[colunas_modelo_geral]
# The model is trained on the natural log of the price.
y_treino = np.log(dados_treino.price)

rfr = RandomForestRegressor(
    criterion='poisson',
    warm_start=True,
    random_state=101,
    n_jobs=-1,
)
rfr.fit(X_treino, y_treino)


Tempo para treinamento: 3m 3.1s

In [8]:
colunas_entrada = [
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]

# The model outputs log-prices; exponentiate to get back to the price scale.
pred_geral = np.exp(rfr.predict(dados_teste[colunas_entrada]))

# Any prediction below 3 is raised to 3.
dados_teste['pred'] = np.maximum(pred_geral, 3)

Tempo para previsão: 14.4s

Avaliação geral

In [9]:
# Overall metrics of the general model on the whole test split.
print_avaliacao(obs=dados_teste.price, pred=dados_teste.pred)

R² = 0.308
MAPE = 48.711 %
MAE = U$S 12.18
RMSE = U$S 31.91
MSLE = 0.306


Avaliação por categoria

In [10]:
# Reverse lookup: encoded category_1 code -> original category name. Deriving
# the column labels from the encoder mapping (instead of a hand-written list)
# keeps every column correctly labelled whatever order the categories appear
# in the test split.
nomes_cat1 = {codigo: nome for nome, codigo in dic_cat1.items()}

avaliacoes = {}

# sort=False keeps the categories in order of first appearance in the test split.
for categoria, dados_cat in dados_teste.groupby('category_1', sort=False):
    # Codes without a training label (the -1 sentinel) get an explicit
    # placeholder name rather than breaking the table.
    rotulo = nomes_cat1.get(categoria, f'Unseen ({categoria})')
    avaliacoes[rotulo] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.313,0.43,0.219,0.347,0.054,0.202,0.106,0.145,0.172,0.158,-0.018
MAPE,44.457,47.156,62.032,61.364,49.247,48.84,55.564,49.847,54.332,50.573,62.988
MAE,12.14,14.89,15.52,19.67,9.1,9.46,10.85,11.3,12.76,9.09,14.31
RMSE,32.42,30.24,44.81,53.18,23.57,19.86,25.95,21.54,27.03,23.01,30.55
MSLE,0.266,0.281,0.462,0.427,0.308,0.301,0.38,0.32,0.372,0.328,0.497


## Modelo por categorias

In [11]:
# Features for the per-category models; category_1 is left out because it is
# constant within each model's training subset.
colunas_por_categoria = ['category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']

# One model per category_1 code, keyed by that code.
modelos = {}

# A single groupby pass replaces filtering the training frame twice per
# category; row order inside each group is preserved. sort=False keeps the
# categories in order of first appearance.
for category, treino_cat in dados_treino.groupby('category_1', sort=False):
    model = RandomForestRegressor(n_jobs=-1, random_state=101, criterion='poisson', warm_start=True)
    # Same target transform as the general model: natural log of the price.
    model.fit(treino_cat[colunas_por_categoria], np.log(treino_cat['price']))
    modelos[category] = model

Tempo para treino: 1m 14.9s

In [12]:
colunas_por_categoria = ['category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']

# Start from a float column: initialising with the integer 0 and then writing
# float predictions into it relies on silent upcasting, which recent pandas
# versions deprecate.
dados_teste = dados_teste.assign(pred=np.nan)

for category, teste_cat in dados_teste.groupby('category_1', sort=False):
    # A category_1 code with no trained model (e.g. the -1 sentinel for labels
    # unseen in training) raises KeyError here.
    pred_cat = np.exp(modelos[category].predict(teste_cat[colunas_por_categoria]))
    # Apply the same floor of 3 used for the general model's predictions so
    # both evaluations are post-processed identically.
    dados_teste.loc[teste_cat.index, 'pred'] = np.maximum(pred_cat, 3)

Tempo para previsão: 6.8s

Avaliação geral

In [13]:
# Overall metrics of the per-category models on the whole test split.
print_avaliacao(obs=dados_teste.price, pred=dados_teste.pred)

R² = 0.308
MAPE = 48.786 %
MAE = U$S 12.19
RMSE = U$S 31.92
MSLE = 0.307


Avaliação por categoria

In [14]:
# Reverse lookup: encoded category_1 code -> original category name, so the
# column labels always match the data instead of relying on a hand-written
# list tied to the order of appearance in the test split.
nomes_cat1 = {codigo: nome for nome, codigo in dic_cat1.items()}

avaliacoes = {}

# sort=False keeps the categories in order of first appearance in the test split.
for categoria, dados_cat in dados_teste.groupby('category_1', sort=False):
    # Codes without a training label (the -1 sentinel) get a placeholder name.
    rotulo = nomes_cat1.get(categoria, f'Unseen ({categoria})')
    avaliacoes[rotulo] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])
avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.312,0.43,0.217,0.347,0.054,0.203,0.111,0.145,0.177,0.162,-0.029
MAPE,44.57,47.256,62.28,61.486,49.276,48.765,55.484,49.975,54.299,50.464,63.843
MAE,12.16,14.91,15.57,19.67,9.1,9.46,10.83,11.32,12.74,9.08,14.41
RMSE,32.45,30.24,44.85,53.18,23.57,19.85,25.86,21.54,26.94,22.96,30.72
MSLE,0.268,0.282,0.466,0.428,0.308,0.3,0.379,0.321,0.37,0.328,0.506
