# Modelo: XGBoost com Tf-Idf

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

In [2]:
def calc_metricas(obs, pred):
    """Compute the regression metrics reported in this notebook.

    Args:
        obs: Observed target values.
        pred: Predicted values, aligned element-wise with ``obs``.

    Returns:
        list: ``[r2, mape, mae, rmse, msle]``. R², MAPE (expressed in
        percent) and MSLE are rounded to 3 decimals; MAE and RMSE to 2.
    """
    # The built-in round() is used instead of the ``.round()`` method: the
    # method only exists on NumPy scalars, while round() also accepts a plain
    # Python float should a metric return one.
    r2 = round(metrics.r2_score(obs, pred), 3)
    mape = round(100 * metrics.mean_absolute_percentage_error(obs, pred), 3)
    mae = round(metrics.mean_absolute_error(obs, pred), 2)
    # RMSE is derived as the square root of the mean squared error.
    rmse = round(metrics.mean_squared_error(obs, pred) ** 0.5, 2)
    msle = round(metrics.mean_squared_log_error(obs, pred), 3)

    return [r2, mape, mae, rmse, msle]

def print_avaliacao(obs, pred):
    """Print R², MAPE, MAE, RMSE and MSLE of ``pred`` against ``obs``.

    Each metric is computed and printed in turn, one per line. MAPE is shown
    as a percentage and RMSE is the square root of the mean squared error.

    Args:
        obs: Observed target values.
        pred: Predicted values, aligned element-wise with ``obs``.
    """
    r2 = metrics.r2_score(obs, pred)
    print('R² = %.3f' % r2)

    mape_pct = 100 * metrics.mean_absolute_percentage_error(obs, pred)
    print('MAPE = %.3f %%' % mape_pct)

    mae = metrics.mean_absolute_error(obs, pred)
    print('MAE = U$S %.2f' % mae)

    rmse = metrics.mean_squared_error(obs, pred) ** 0.5
    print('RMSE = U$S %.2f' % rmse)

    msle = metrics.mean_squared_log_error(obs, pred)
    print('MSLE = %.3f' % msle)


In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop words, stored as a set so the membership test in prep_text is a
# hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))


def prep_text(text):
    """Normalize a raw text field into a space-separated string of words.

    The text is split with ``word_tokenize``; only purely alphabetic tokens
    are kept, they are lower-cased, and words found in ``stop_words`` are
    removed.

    Args:
        text: Raw text. Any non-string value (e.g. a missing value read from
            the CSV) is treated as missing.

    Returns:
        str | None: The cleaned text, or ``None`` when ``text`` is not a
        string.

    Note:
        Errors raised by the tokenizer itself (for instance a missing NLTK
        resource) are no longer swallowed; previously a bare ``except`` turned
        them into ``None`` for every row, hiding the real problem.
    """
    if not isinstance(text, str):
        return None

    tokens = word_tokenize(text)
    words = [
        token.lower()
        for token in tokens
        if token.isalpha() and token.lower() not in stop_words
    ]
    return ' '.join(words)

In [4]:
# Load the train and test data sets from the processed-data folder
# (paths are relative to the notebook's working directory).
dados_treino = pd.read_csv('../data/processed/train_data.csv')
dados_teste = pd.read_csv('../data/processed/test_data.csv')

## Preparação das colunas de texto

In [5]:
# Inspect the available columns before selecting the ones used for modelling.
dados_treino.columns

Index(['name', 'category_1', 'category_2', 'category_3', 'item_condition_id',
       'brand_name', 'price', 'shipping', 'item_description', 'date', 'stock'],
      dtype='object')

In [6]:
def preparar_colunas_texto(dados):
    """Select the modelling columns and clean the two text columns.

    Args:
        dados: Frame containing at least ``category_1``, ``name``,
            ``item_description`` and ``price``.

    Returns:
        A new frame with those four columns, ``name`` and
        ``item_description`` passed through ``prep_text``, plus
        ``comb_name_description`` (cleaned name, a space, cleaned
        description).
    """
    # .copy() makes the selection an independent frame, so the column writes
    # below never act on a slice of the caller's frame.
    dados = dados[['category_1', 'name', 'item_description', 'price']].copy()

    # Series.map calls prep_text once per value; DataFrame.apply(axis=1) would
    # build a full row object for every record just to read one field.
    for coluna in ('name', 'item_description'):
        dados[coluna] = dados[coluna].map(prep_text)

    # A missing name or description leaves the combined value missing as well.
    dados['comb_name_description'] = dados['name'] + ' ' + dados['item_description']
    return dados


# Missing values are kept at this stage: each experiment below drops the rows
# that lack its own feature column.
dados_treino = preparar_colunas_texto(dados_treino)
dados_teste = preparar_colunas_texto(dados_teste)

## Modelo geral usando apenas a coluna name

In [None]:
# Columns for the name-only experiment. Rows with a missing value in any of
# them are dropped. dropna() is chained (instead of inplace=True on the
# selection) so each result is a fresh frame and pandas raises no
# SettingWithCopyWarning here or when 'pred' is added later.
dados_treino1 = dados_treino[['category_1', 'name', 'price']].dropna()

dados_teste1 = dados_teste[['category_1', 'name', 'price']].dropna()

In [8]:
# Single model for all categories: TF-IDF over uni- and bigrams of the cleaned
# name, feeding an XGBoost regressor.
vetorizador = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
regressor = XGBRegressor(random_state=101, n_jobs=-1)
pipe = Pipeline(steps=[('tiidf', vetorizador), ('xgboost', regressor)])

X_treino = dados_treino1['name']
# The target is the natural log of the price; predictions are mapped back
# with exp at prediction time.
y_treino = np.log(dados_treino1['price'])
pipe.fit(X_treino, y_treino)

Tempo de treino: 2m 11.3s

In [None]:
# The model predicts log(price), so exp brings predictions back to price units.
previsoes = np.exp(pipe.predict(dados_teste1['name']))
# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm). assign() returns a new frame, which avoids writing a column
# into a frame derived from a selection.
dados_teste1 = dados_teste1.assign(pred=np.maximum(previsoes, 3))

Tempo de previsão: 11.8s

Avaliação geral

In [10]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.165
MAPE = 54.739 %
MAE = U$S 13.45
RMSE = U$S 35.05
MSLE = 0.372


Avaliação por categoria

In [11]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.172,0.134,0.153,0.156,0.04,0.135,0.026,0.07,0.053,0.18,0.127
MAPE,48.092,46.0,74.852,64.582,59.398,59.127,70.826,54.538,53.55,76.903,59.133
MAE,13.55,17.87,16.1,22.23,9.66,10.04,11.84,11.78,13.35,9.96,13.1
RMSE,35.61,37.29,46.64,60.48,23.74,20.68,27.07,22.47,28.89,22.72,28.3
MSLE,0.327,0.413,0.531,0.572,0.344,0.34,0.456,0.358,0.401,0.413,0.409


## Modelo por categoria usando apenas a coluna name

In [12]:
modelos = {}

# One pipeline per main category, keyed by category name. The groups come
# from the frame that is actually used for training, so only categories that
# still have rows in it are fitted, and each category is filtered only once.
for category, grupo in dados_treino1.groupby('category_1', sort=False):
    pipe = Pipeline([('tiidf', TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))),
                     ('xgboost', XGBRegressor(random_state=101, n_jobs=-1))])
    X_treino = grupo['name']
    # The target is modelled in log space.
    y_treino = np.log(grupo['price'])
    # Pipeline.fit returns the fitted pipeline itself.
    modelos[category] = pipe.fit(X_treino, y_treino)


Tempo de treino: 2m 16.2s

In [13]:
# Reset the prediction column. NaN (float) is used instead of 0 so the column
# already has a float dtype when the per-category predictions are written, and
# any row left without a prediction stays visibly missing.
dados_teste1 = dados_teste1.assign(pred=np.nan)

In [14]:
# Predictions are collected in a float Series aligned with the test frame and
# attached in one step, so no float values are written into an existing
# integer column.
previsoes = pd.Series(np.nan, index=dados_teste1.index)

for category, grupo in dados_teste1.groupby('category_1', sort=False):
    index = grupo.index
    # Each category is scored by its own model; exp undoes the log target.
    previsoes.loc[index] = np.exp(modelos[category].predict(grupo['name']))

# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm).
dados_teste1 = dados_teste1.assign(pred=previsoes.clip(lower=3))

Tempo de previsão: 5.7s

Avaliação geral:

In [15]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.269
MAPE = 49.182 %
MAE = U$S 12.34
RMSE = U$S 32.81
MSLE = 0.310


Avaliação por categoria principal

In [16]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.235,0.376,0.232,0.316,0.1,0.214,0.14,0.261,0.291,0.243,0.095
MAPE,47.635,48.841,60.915,59.441,46.821,47.584,52.567,47.042,47.342,48.913,61.726
MAE,12.94,15.43,14.99,19.56,8.61,9.12,10.29,10.32,11.3,8.52,13.56
RMSE,34.24,31.64,44.42,54.44,22.99,19.72,25.44,20.02,25.01,21.82,28.81
MSLE,0.297,0.298,0.443,0.429,0.275,0.286,0.35,0.282,0.303,0.3,0.44


## Modelo geral usando coluna item_description

In [None]:
# Columns for the description-only experiment. Rows with a missing value in
# any of them are dropped. dropna() is chained (instead of inplace=True on the
# selection) so each result is a fresh frame and pandas raises no
# SettingWithCopyWarning here or when 'pred' is added later.
dados_treino1 = dados_treino[['category_1', 'item_description', 'price']].dropna()

dados_teste1 = dados_teste[['category_1', 'item_description', 'price']].dropna()

In [18]:
# Single model for all categories: TF-IDF over uni- and bigrams of the cleaned
# description, feeding an XGBoost regressor.
vetorizador = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
regressor = XGBRegressor(random_state=101, n_jobs=-1)
pipe = Pipeline(steps=[('tiidf', vetorizador), ('xgboost', regressor)])

X_treino = dados_treino1['item_description']
# The target is the natural log of the price; predictions are mapped back
# with exp at prediction time.
y_treino = np.log(dados_treino1['price'])
pipe.fit(X_treino, y_treino)

Tempo de treino: 9m 37.8s

In [None]:
# The model predicts log(price), so exp brings predictions back to price units.
previsoes = np.exp(pipe.predict(dados_teste1['item_description']))
# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm). assign() returns a new frame, which avoids writing a column
# into a frame derived from a selection.
dados_teste1 = dados_teste1.assign(pred=np.maximum(previsoes, 3))

Tempo de previsão: 20.8s

Avaliação geral

In [20]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.170
MAPE = 57.278 %
MAE = U$S 13.86
RMSE = U$S 34.96
MSLE = 0.398


Avaliação por categoria principal

In [21]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.173,0.096,0.146,0.189,0.049,0.11,0.047,0.028,0.078,0.183,0.065
MAPE,51.937,49.03,72.864,68.093,60.032,61.623,69.16,54.419,55.638,78.521,60.266
MAE,14.35,18.54,15.99,21.8,9.64,10.35,11.68,12.1,13.39,10.07,13.51
RMSE,35.6,38.1,46.87,59.28,23.63,20.98,26.81,22.98,28.53,22.64,29.28
MSLE,0.375,0.463,0.524,0.552,0.343,0.36,0.446,0.379,0.403,0.432,0.44


## Modelo por categoria usando item_description

In [22]:
modelos = {}

# One pipeline per main category, keyed by category name. The groups come
# from the frame that is actually used for training, so only categories that
# still have rows in it are fitted, and each category is filtered only once.
for category, grupo in dados_treino1.groupby('category_1', sort=False):
    pipe = Pipeline([('tiidf', TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))),
                     ('xgboost', XGBRegressor(random_state=101, n_jobs=-1))])
    X_treino = grupo['item_description']
    # The target is modelled in log space.
    y_treino = np.log(grupo['price'])
    # Pipeline.fit returns the fitted pipeline itself.
    modelos[category] = pipe.fit(X_treino, y_treino)


Tempo de treino: 7m 18.3s

In [23]:
# Reset the prediction column. NaN (float) is used instead of 0 so the column
# already has a float dtype when the per-category predictions are written, and
# any row left without a prediction stays visibly missing.
dados_teste1 = dados_teste1.assign(pred=np.nan)

In [24]:
# Predictions are collected in a float Series aligned with the test frame and
# attached in one step, so no float values are written into an existing
# integer column.
previsoes = pd.Series(np.nan, index=dados_teste1.index)

for category, grupo in dados_teste1.groupby('category_1', sort=False):
    index = grupo.index
    # Each category is scored by its own model; exp undoes the log target.
    previsoes.loc[index] = np.exp(modelos[category].predict(grupo['item_description']))

# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm).
dados_teste1 = dados_teste1.assign(pred=previsoes.clip(lower=3))

Tempo de previsão: 18.8s

Avaliação geral

In [25]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.236
MAPE = 53.190 %
MAE = U$S 13.12
RMSE = U$S 33.54
MSLE = 0.352


Avaliação por categoria principal

In [26]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.218,0.29,0.157,0.276,0.097,0.158,0.116,0.161,0.269,0.185,0.033
MAPE,53.294,54.388,62.307,60.902,48.401,50.562,54.224,50.902,52.101,52.143,65.209
MAE,13.95,16.85,15.62,20.22,8.81,9.59,10.61,11.18,12.18,8.98,14.31
RMSE,34.6,33.77,46.56,56.0,23.03,20.41,25.82,21.34,25.39,22.61,29.78
MSLE,0.351,0.364,0.475,0.463,0.293,0.318,0.367,0.324,0.342,0.337,0.479


## Modelo geral usando comb_name_description

In [None]:
# Columns for the combined name + description experiment. Rows with a missing
# value in any of them are dropped. dropna() is chained (instead of
# inplace=True on the selection) so each result is a fresh frame and pandas
# raises no SettingWithCopyWarning here or when 'pred' is added later.
dados_treino1 = dados_treino[['category_1', 'comb_name_description', 'price']].dropna()

dados_teste1 = dados_teste[['category_1', 'comb_name_description', 'price']].dropna()

In [28]:
# Single model for all categories: TF-IDF over uni- and bigrams of the
# combined name + description text, feeding an XGBoost regressor.
vetorizador = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
regressor = XGBRegressor(random_state=101, n_jobs=-1)
pipe = Pipeline(steps=[('tiidf', vetorizador), ('xgboost', regressor)])

X_treino = dados_treino1['comb_name_description']
# The target is the natural log of the price; predictions are mapped back
# with exp at prediction time.
y_treino = np.log(dados_treino1['price'])
pipe.fit(X_treino, y_treino)

Tempo de treino: 10m 49.4s

In [None]:
# The model predicts log(price), so exp brings predictions back to price units.
previsoes = np.exp(pipe.predict(dados_teste1['comb_name_description']))
# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm). assign() returns a new frame, which avoids writing a column
# into a frame derived from a selection.
dados_teste1 = dados_teste1.assign(pred=np.maximum(previsoes, 3))

Tempo de previsão: 21.6s

Avaliação geral

In [30]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.229
MAPE = 51.808 %
MAE = U$S 12.86
RMSE = U$S 33.69
MSLE = 0.337


Avaliação por categoria

In [31]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.264,0.212,0.196,0.188,0.066,0.19,0.059,0.092,0.1,0.231,0.154
MAPE,46.377,44.691,68.584,61.543,55.121,55.156,63.779,51.797,51.862,67.867,55.19
MAE,12.95,17.01,15.45,21.01,9.27,9.63,11.34,11.54,13.04,9.36,12.74
RMSE,33.57,35.56,45.47,59.31,23.42,20.02,26.63,22.2,28.19,21.98,27.86
MSLE,0.298,0.371,0.482,0.496,0.316,0.312,0.416,0.341,0.379,0.363,0.386


## Modelo por categoria usando comb_name_description

In [32]:
modelos = {}

# One pipeline per main category, keyed by category name. The groups come
# from the frame that is actually used for training, so only categories that
# still have rows in it are fitted, and each category is filtered only once.
for category, grupo in dados_treino1.groupby('category_1', sort=False):
    pipe = Pipeline([('tiidf', TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))),
                     ('xgboost', XGBRegressor(random_state=101, n_jobs=-1))])
    X_treino = grupo['comb_name_description']
    # The target is modelled in log space.
    y_treino = np.log(grupo['price'])
    # Pipeline.fit returns the fitted pipeline itself.
    modelos[category] = pipe.fit(X_treino, y_treino)


Tempo de treino: 8m 46.4s

In [33]:
# Reset the prediction column. NaN (float) is used instead of 0 so the column
# already has a float dtype when the per-category predictions are written, and
# any row left without a prediction stays visibly missing.
dados_teste1 = dados_teste1.assign(pred=np.nan)

In [34]:
# Predictions are collected in a float Series aligned with the test frame and
# attached in one step, so no float values are written into an existing
# integer column.
previsoes = pd.Series(np.nan, index=dados_teste1.index)

for category, grupo in dados_teste1.groupby('category_1', sort=False):
    index = grupo.index
    # Each category is scored by its own model; exp undoes the log target.
    previsoes.loc[index] = np.exp(modelos[category].predict(grupo['comb_name_description']))

# Predictions below 3 are raised to 3 (presumably the minimum allowed price --
# TODO confirm).
dados_teste1 = dados_teste1.assign(pred=previsoes.clip(lower=3))

Tempo de previsão: 25.4s

Avaliação geral

In [35]:
# Overall metrics of the current predictions against the observed prices.
print_avaliacao(dados_teste1['price'], dados_teste1['pred'])

R² = 0.320
MAPE = 46.681 %
MAE = U$S 11.85
RMSE = U$S 31.64
MSLE = 0.283


Avaliação por categoria principal

In [36]:
avaliacoes = {}

# One column of metrics per main category. groupby(sort=False) yields the
# categories in order of first appearance and splits the frame in one pass;
# unlike a string-built query it does not break on category names that
# contain quote characters.
for categoria, dados_cat in dados_teste1.groupby('category_1', sort=False):
    avaliacoes[categoria] = calc_metricas(dados_cat['price'], dados_cat['pred'])

avaliacoes = pd.DataFrame(avaliacoes, index=['R²', 'MAPE', 'MAE', 'RMSE', 'MSLE'])

avaliacoes

Unnamed: 0,Women,Men,Vintage & Collectibles,Electronics,Beauty,Kids,Other,Home,Sports & Outdoors,Handmade,No category
R²,0.324,0.444,0.267,0.302,0.144,0.277,0.165,0.299,0.339,0.238,0.072
MAPE,45.714,46.247,56.942,54.254,43.903,45.331,49.776,44.999,46.405,46.513,59.973
MAE,12.38,14.71,14.47,18.83,8.24,8.77,9.99,10.08,11.09,8.34,13.56
RMSE,32.17,29.89,43.41,54.97,22.43,18.91,25.1,19.51,24.16,21.86,29.17
MSLE,0.271,0.272,0.406,0.38,0.25,0.263,0.324,0.264,0.288,0.279,0.437
