# Modelo: Random Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
# Read the pre-processed train and test CSV files into DataFrames.
caminho_treino = '../data/processed/train_data.csv'
caminho_teste = '../data/processed/test_data.csv'

dados_treino = pd.read_csv(caminho_treino)
dados_teste = pd.read_csv(caminho_teste)

In [3]:
# Fit one LabelEncoder per categorical column, using the training data only.
le_cat1 = LabelEncoder().fit(dados_treino['category_1'])
le_cat2 = LabelEncoder().fit(dados_treino['category_2'])
le_cat3 = LabelEncoder().fit(dados_treino['category_3'])
le_brand = LabelEncoder().fit(dados_treino['brand_name'])

# Overwrite each training column with its integer codes.
# NOTE: this mutates dados_treino in place, so re-running this cell without
# reloading the CSV would fit the encoders on already-encoded values.
for coluna, codificador in (
    ('category_1', le_cat1),
    ('category_2', le_cat2),
    ('category_3', le_cat3),
    ('brand_name', le_brand),
):
    dados_treino[coluna] = codificador.transform(dados_treino[coluna])

In [4]:
# Build one {original label: integer code} dictionary per fitted encoder.
# These are used to encode the test set with plain dictionary look-ups.
dic_cat1, dic_cat2, dic_cat3, dic_brand = (
    dict(zip(codificador.classes_, codificador.transform(codificador.classes_)))
    for codificador in (le_cat1, le_cat2, le_cat3, le_brand)
)


In [5]:
# Re-read the test CSV; the mappings below are applied to the freshly loaded columns.
dados_teste = pd.read_csv('../data/processed/test_data.csv')

# Column -> {label: code} dictionary built from the training encoders.
mapas_por_coluna = {
    'category_1': dic_cat1,
    'category_2': dic_cat2,
    'category_3': dic_cat3,
    'brand_name': dic_brand,
}

# Labels with no entry in the dictionary come out of .map() as NaN and are
# replaced by the sentinel code -1.
for coluna, mapa in mapas_por_coluna.items():
    dados_teste[coluna] = dados_teste[coluna].map(mapa).fillna(-1)

dados_teste

Unnamed: 0,name,category_1,category_2,category_3,item_condition_id,brand_name,price,shipping,item_description,date,stock
0,women s motorcycle tshirt,10,104,751.0,3,1709.0,6.0,1,xl fits like a small,9-4-2018,8
1,alex and ani necklace,10,58,530.0,2,47.0,11.0,1,path of life alex and ani necklace in gold has...,25-1-2018,8
2,steve madden red lace up heels,10,88,629.0,2,3682.0,36.0,0,never been worn isn t true to size runs small,30-11-2018,9
3,men s black tuxedo,5,94,811.0,3,2795.0,54.0,0,worn a few times single pleated pants pants ar...,30-7-2018,7
4,red open back mermaid style dress xs,10,35,334.0,1,2795.0,316.0,0,this full length wine red mermaid style backle...,29-7-2018,34
...,...,...,...,...,...,...,...,...,...,...,...
444494,pink vs sleep shorts,10,5,687.0,3,2928.0,12.0,0,size small great condition barely ever worn to...,16-12-2018,7
444495,coach hand bag,10,112,392.0,3,926.0,86.0,0,this large black hobo coach is in a very good ...,15-11-2018,3
444496,waffle knit sweater forever,10,97,226.0,3,1384.0,14.0,0,great condition tags forever american apparel ...,2-3-2018,4
444497,dr dre beats solo,1,100,381.0,3,457.0,69.0,0,blue fits any head amazing sound no longer in use,19-3-2018,6


In [6]:
# Global model: a single random forest trained on every training row.
colunas_modelo = [
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]
X_treino = dados_treino[colunas_modelo]
y_treino = dados_treino['price']

rfr = RandomForestRegressor(
    criterion='poisson',
    warm_start=True,
    random_state=101,
    n_jobs=-1,
)
rfr.fit(X_treino, y_treino)


Tempo para treinamento: 2m38.4s

In [10]:
# Predict test prices with the global model, using the same feature columns it was trained on.
X_teste = dados_teste[[
    'category_1',
    'category_2',
    'category_3',
    'brand_name',
    'item_condition_id',
    'shipping',
]]
pred = rfr.predict(X_teste)

Tempo para previsão: 14.4s

In [7]:
def calc_metricas(obs, pred):
    """Compute the regression metrics reported in this notebook.

    Parameters
    ----------
    obs : array-like
        Observed target values.
    pred : array-like
        Predicted values, aligned with ``obs``.

    Returns
    -------
    list of float
        ``[r2, mape, mae, rmse, msle]``. R² and MSLE are rounded to 3
        decimals, MAPE is expressed in percent and rounded to 3 decimals,
        MAE and RMSE are rounded to 2 decimals.
    """
    # Use the built-in round() on float(...) instead of the ``.round`` method:
    # the method only exists on NumPy scalars, so it fails with AttributeError
    # when a metric function returns a plain Python float.
    r2 = round(float(metrics.r2_score(obs, pred)), 3)
    mape = round(float(100 * metrics.mean_absolute_percentage_error(obs, pred)), 3)
    mae = round(float(metrics.mean_absolute_error(obs, pred)), 2)
    rmse = round(float(metrics.mean_squared_error(obs, pred) ** 0.5), 2)
    msle = round(float(metrics.mean_squared_log_error(obs, pred)), 3)

    return [r2, mape, mae, rmse, msle]

def print_avaliacao(obs, pred):
    """Print R², MAPE (in percent), MAE, RMSE and MSLE for ``pred`` against ``obs``.

    Each metric is computed immediately before its line is printed, one
    line per metric, in the order listed above.
    """
    # (format string, deferred computation) pairs; the lambdas keep each
    # metric from being evaluated until its own line is about to be printed.
    linhas = (
        ('R² = %.3f', lambda: metrics.r2_score(obs, pred)),
        ('MAPE = %.3f %%', lambda: 100 * metrics.mean_absolute_percentage_error(obs, pred)),
        ('MAE = U$S %.2f', lambda: metrics.mean_absolute_error(obs, pred)),
        ('RMSE = U$S %.2f', lambda: metrics.mean_squared_error(obs, pred) ** 0.5),
        ('MSLE = %.3f', lambda: metrics.mean_squared_log_error(obs, pred)),
    )
    for modelo, calcular in linhas:
        print(modelo % calcular())


In [8]:
# Report the global model's metrics on the test set.
print_avaliacao(dados_teste['price'], pred)

R² = 0.343
MAPE = 65.466 %
MAE = U$S 13.15
RMSE = U$S 31.09
MSLE = 0.348


In [13]:
# Train one RandomForestRegressor per category_1 code and store it in
# `modelos`, keyed by that code.
modelos = {}
colunas_modelo = ['category_1', 'category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']

# Split the training frame once with groupby instead of filtering it twice
# per category; sort=False keeps the groups in order of first appearance,
# the same order .unique() would give.
for category, treino_cat in dados_treino.groupby('category_1', sort=False):
    # Loop-local names are used on purpose so the X_treino / y_treino that
    # were used to fit the global model are not overwritten.
    X_cat = treino_cat[colunas_modelo]
    y_cat = treino_cat['price']

    model = RandomForestRegressor(n_jobs=-1, random_state=101, criterion='poisson', warm_start=True)
    modelos[category] = model.fit(X_cat, y_cat)

Tempo para treino: 1m 14.9s

In [14]:
# Predict every test row with the model trained for its category_1 code and
# store the result in a new `pred` column.
colunas_modelo = ['category_1', 'category_2', 'category_3', 'brand_name', 'item_condition_id', 'shipping']

# Initialise the column as float: the predictions are floats, and writing
# them into an integer column relies on an implicit dtype upcast that pandas
# has deprecated.
dados_teste = dados_teste.assign(pred=0.0)

for category in dados_teste['category_1'].unique():
    linhas_cat = dados_teste['category_1'] == category
    # A code with no entry in `modelos` (i.e. one that was not present when
    # the per-category models were trained) raises KeyError here.
    dados_teste.loc[linhas_cat, 'pred'] = modelos[category].predict(
        dados_teste.loc[linhas_cat, colunas_modelo]
    )

Tempo para previsão: 6.8s

In [26]:
# Evaluate the per-category predictions: one column of metrics per
# category_1 code found in the test set.
metricas_por_categoria = {}

# sort=False keeps the columns in order of first appearance in the test set.
for categoria, dados_cat in dados_teste.groupby('category_1', sort=False):
    metricas_por_categoria[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(metricas_por_categoria, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

# Label each column by decoding its code through the training dictionary
# rather than assigning a hand-written list by position: a positional list is
# only right if the categories happen to appear in that exact order, and it
# raises if the number of categories differs. Codes missing from the
# dictionary (e.g. the -1 sentinel) keep their numeric label.
codigo_para_nome = {codigo: nome for nome, codigo in dic_cat1.items()}
avaliacoes = avaliacoes.rename(columns=codigo_para_nome)
avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.337,0.457,0.25,0.41,0.072,0.232,0.113,0.161,0.214,0.189,-0.071
MAPE,57.764,59.399,95.312,90.499,65.019,65.062,81.344,65.52,75.021,72.226,95.787
MAE,12.98,15.76,17.69,21.28,9.86,10.38,12.19,12.26,13.81,10.17,16.21
RMSE,31.86,29.53,43.9,50.57,23.34,19.5,25.84,21.34,26.32,22.59,31.34
MSLE,0.299,0.312,0.553,0.502,0.344,0.342,0.447,0.357,0.419,0.383,0.591
