# Modelo: XGBoost com Tf-Idf

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

In [2]:
def calc_metricas(obs, pred):
    r2 = metrics.r2_score(obs, pred).round(3)
    mape = (100 * metrics.mean_absolute_percentage_error(obs, pred)).round(3)
    mae = metrics.mean_absolute_error(obs, pred).round(2)
    rmse = (metrics.mean_squared_error(obs, pred)**0.5).round(2)
    msle = metrics.mean_squared_log_error(obs, pred).round(3)

    return [r2, mape, mae, rmse, msle]

def print_avaliacao(obs, pred):
    print('R² = %.3f' % metrics.r2_score(obs, pred))
    print('MAPE = %.3f %%' % (100 * metrics.mean_absolute_percentage_error(obs, pred)))
    print('MAE = U$S %.2f' % (metrics.mean_absolute_error(obs, pred)))
    print('RMSE = U$S %.2f' % metrics.mean_squared_error(obs, pred)**0.5)
    print('MSLE = %.3f' % metrics.mean_squared_log_error(obs, pred))


In [3]:
dados_treino = pd.read_csv('dados_treino_xgboost_categorizado.csv')
dados_teste = pd.read_csv('dados_teste_xgboost_categorizado.csv')

In [17]:
print(dados_teste.price.std())
print(dados_teste.price.mean())
print(dados_teste.price.std()/dados_teste.price.mean())


38.37208546812446
26.724370519793954
1.4358461854023237


In [5]:
import pickle

dados_teste = dados_teste.assign(pred=0)


for category in dados_treino.col_att.unique():
    treino = dados_treino.query(f'col_att == "{category}"')
    treino = treino[treino.price < treino.price.std()*3]
    X_treino = treino['comb_name_description']
    tfidf = TfidfVectorizer(min_df=10,
                            max_df=0.6,
                            ngram_range=(1,2))
    tfidf = tfidf.fit(X_treino)
    X_treino = tfidf.transform(X_treino)
    df = dados_teste.query(f'col_att == "{category}"')
    index = df.index
    X_teste = tfidf.transform(df['comb_name_description'])
    
    with open(f'../models/modelo_{category}.pickle', 'rb') as handle:
        model = pickle.load(handle)
    if type(model) == dict:
        model = model[category]
        
    model = model.best_estimator_
    
    model.fit(X_treino, np.log(treino.price))
    
    dados_teste.loc[index, 'pred'] = np.exp(model.predict(X_teste))
    

In [8]:
dados_teste.loc[dados_teste.pred < 3, 'pred'] = 3

In [9]:
print_avaliacao(dados_teste['price'], dados_teste['pred'])

R² = 0.193
MAPE = 43.003 %
MAE = U$S 12.05
RMSE = U$S 34.47
MSLE = 0.283


com outliers

R² = 0.393
MAPE = 43.596 %
MAE = U$S 11.18
RMSE = U$S 29.90
MSLE = 0.249

sem outliers

R² = 0.207
MAPE = 41.908 %
MAE = U$S 11.71
RMSE = U$S 34.17
MSLE = 0.265

sem outliers categorizado
R² = 0.230
MAPE = 41.676 %
MAE = U$S 11.64
RMSE = U$S 33.68
MSLE = 0.264


Avaliação por categoria principal

In [14]:
dados_teste2 = dados_teste[dados_teste.price <= dados_teste.price.std()*3]
dados_teste2.head()

Unnamed: 0,col_att,comb_name_description,price,pred
0,Women-1,women motorcycle tshirt xl fits like small,6.0,8.194081
1,Women-1,alex ani necklace path life alex ani necklace ...,11.0,10.059308
2,Women-0,steve madden red lace heels never worn true si...,36.0,20.577404
3,Men-0,men black tuxedo worn times single pleated pan...,54.0,21.995853
5,Vintage & Collectibles-0,badtz maru funko pop never opened nib great op...,25.0,30.340559


In [37]:
# dados_teste2 = dados_teste2.assign(pred=0)
# dados_teste2.head()

Unnamed: 0,col_att,comb_name_description,price,pred
0,Women-1,women motorcycle tshirt xl fits like small,6.0,0
1,Women-1,alex ani necklace path life alex ani necklace ...,11.0,0
2,Women-0,steve madden red lace heels never worn true si...,36.0,0
3,Men-0,men black tuxedo worn times single pleated pan...,54.0,0
5,Vintage & Collectibles-0,badtz maru funko pop never opened nib great op...,25.0,0


In [15]:
print_avaliacao(dados_teste2['price'], dados_teste2['pred'])

R² = 0.391
MAPE = 42.311 %
MAE = U$S 8.68
RMSE = U$S 14.28
MSLE = 0.229


In [62]:
for category in dados_teste2.col_att.unique():
    index = dados_teste2.query(f'col_att == "{category}"').index
    dados_teste2.loc[index, 'pred'] = np.exp(modelos[category].predict(dados_teste2.loc[index]['comb_name_description']))

dados_teste2.loc[dados_teste2.pred < 3, 'pred'] = 3

In [16]:
print(dados_teste2.price.std())
print(dados_teste2.price.mean())
print(dados_teste2.price.std()/dados_teste2.price.mean())


18.28703815039701
22.629095637232208
0.8081205914525764


In [63]:
print_avaliacao(dados_teste2['price'], dados_teste2['pred'])

R² = 0.424
MAPE = 41.025 %
MAE = U$S 8.40
RMSE = U$S 13.88
MSLE = 0.215


sem outliers

R² = 0.439
MAPE = 41.211 %
MAE = U$S 8.36
RMSE = U$S 13.70
MSLE = 0.213

sem ouliers categorizados

R² = 0.424
MAPE = 41.025 %
MAE = U$S 8.40
RMSE = U$S 13.88
MSLE = 0.215

In [18]:
avaliacoes = {}

for categoria in dados_teste1.category_1.unique():
    dados_cat = dados_teste1.query(f'category_1 == "{categoria}"')
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

AttributeError: 'DataFrame' object has no attribute 'category_1'