# Projeto 1 - módulo 6

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Precificação dinâmica - e-commerce

### Mercari Price Suggestion Challenge - Kaggle

Mercari é um site de revenda de produtos online. Uma dos desafios desse tipo de plataforma é auxiliar o usuário, muitas vezes com pouco conhecimento de vendas, a determinar um preço para os seus produtos de modo a maximizar as chances de venda.

### Sobre este projeto

O presente projeto tem o objetivo de desenvolver um algoritmo que identifique produtos já vendidos similares e sugira ao usuário um preço ótimo para novos produtos cadastrados.


### Preparação do ambiente

Para este projeto, acesse o link https://www.kaggle.com/competitions/mercari-price-suggestion-challenge/overview 


In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.models import Model

!pip install tensorflow_addons
import tensorflow_addons as tfa
from sklearn import metrics

from sklearn.pipeline import Pipeline
import pickle


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# função de tokenização e preenchimento de comprimento

def text_vectorizer(feature):

  # TOKENIZER
  tk = Tokenizer()
  # FIT ON TRAIN 
  tk.fit_on_texts(base_train[feature].apply(str))
  # TOKENIZES THE TRAIN DATASET
  tk_train = tk.texts_to_sequences(base_train[feature].apply(str))
  # TOKENIZES THE TEST DATASET
  tk_test = tk.texts_to_sequences(base_test[feature].apply(str))
    
  # COMPUTES THE MAX LENGTH
  max_length = base_train[feature].apply(lambda x :len(str(x).split())).max()
    
  # COMPUTE THE VOCAB SIZE
  vocab_size = len(tk.word_index) + 1
    
  # PADDING THE TRAIN SEQUENCES
  train_pad= pad_sequences(tk_train,padding="post",maxlen = max_length)
  # PADDING THE TEST SEQUENCES
  test_pad = pad_sequences(tk_test,padding = "post", maxlen = max_length)
    
  # RETURN THE TOKENIZER, MAX LENGTH , PADDED TRAIN SEQUENCES , PADDED VALIDATION SEQUENCES 
  return tk , max_length, vocab_size, train_pad , test_pad

In [2]:
base_train = pd.read_csv ('/content/drive/MyDrive/dados_treino.csv') 
base_test = pd.read_csv ('/content/drive/MyDrive/dados_teste.csv') 
base_valid = pd.read_csv ('/content/drive/MyDrive/dados_validacao.csv') 

In [5]:
base_train['item_description'] = base_train['item_description'].fillna("No description")
base_train['name'] = base_train['name'].fillna("No name")

base_test['item_description'] = base_test['item_description'].fillna("No description")
base_test['name'] = base_test['name'].fillna("No name")

base_valid['item_description'] = base_valid['item_description'].fillna("No description")
base_valid['name'] = base_valid['name'].fillna("No name")

In [6]:
# rodando a função text_vectorizer para todos os atributos

tk_name , max_length_name, vocab_size_name, train_name_pad , test_name_pad = text_vectorizer('name')
tk_category_1 , max_length_category_1, vocab_size_category_1, train_category_1_pad , test_category_1_pad = text_vectorizer('category_1')
tk_category_2 , max_length_category_2, vocab_size_category_2, train_category_2_pad , test_category_2_pad = text_vectorizer('category_2')
tk_category_3 , max_length_category_3, vocab_size_category_3, train_category_3_pad , test_category_3_pad = text_vectorizer('category_3')
tk_brand_name , max_length_brand_name, vocab_size_brand_name, train_brand_name_pad , test_brand_name_pad = text_vectorizer('brand_name')
tk_item_description , max_length_item_description, vocab_size_item_description, train_item_description_pad , test_item_description_pad = text_vectorizer('item_description')

In [11]:
test_shipping = base_test.shipping
test_item_cond = base_test.item_condition_id

x_test = [test_item_cond,
          test_shipping,
          test_brand_name_pad,
          test_category_1_pad, 
          test_category_2_pad, 
          test_category_3_pad, 
          test_name_pad,
          test_item_description_pad]
base_test['log_price'] = np.log(base_test['price'])
y_test = base_test.log_price

In [7]:
# arquitetura do deep learning

tf.keras.backend.clear_session()

# ITEM CONDITION ID
inp1 = layers.Input(shape=(1)) # INPUT 1 
emb1  = layers.Embedding(6, 10, input_length=1)(inp1) # EMBEDDING 1
flat1 = layers.Flatten()(emb1) # FLATTEN

# SHIPPING 
inp2 = layers.Input(shape=(1)) # INPUT 2 
d2 = layers.Dense(10, activation="relu")(inp2) # DENSE LAYER 2

# BRAND NAME
inp3 = layers.Input(shape=(6)) # INPUT 3
emb3 = layers.Embedding(vocab_size_brand_name, 16, input_length=6)(inp3) # EMBEDDING 3
flat3 = layers.Flatten()(emb3) # FLATTEN

# CATEGORY_1
inp4 = layers.Input(shape=(3)) # INPUT 4
emb4 = layers.Embedding(vocab_size_category_1, 16, input_length=3)(inp4) # EMBEDDING 4
flat4 = layers.Flatten()(emb4) # FLATTEN 

# CATEGORY_2
inp5= layers.Input(shape=(5)) # INPUT 5
emb5 = layers.Embedding(vocab_size_category_2, 16, input_length=5)(inp5) # EMBEDDING 5
flat5 = layers.Flatten()(emb5) # FLATTEN

# CATEGORY_3
inp6= layers.Input(shape=(7)) # INPUT 6 
emb6 = layers.Embedding(vocab_size_category_3, 40 ,input_length= 7 )(inp6) # EMBEDDING 6
flat6 = layers.Flatten()(emb6) # FLATTEN

# NAME
inp7= layers.Input(shape=(17)) # INPUT 7
emb7 = layers.Embedding(vocab_size_name, 20, input_length=17)(inp7) # EMBEDDING 7
lstm7 = layers.GRU(64, return_sequences=True)(emb7) # GRU
flat7 = layers.Flatten()(lstm7) # FLATTEN

# ITEM DESCRIPTION
inp8= layers.Input(shape=(245)) # INPUT 8 
emb8 = layers.Embedding(vocab_size_item_description, 40, input_length=207)(inp8) # EMBEDDING 8
lstm8 = layers.GRU(64, return_sequences=True)(emb8) # GRU
flat8 = layers.Flatten()(lstm8) # FLATTEN

# CONCATENAÇÃO
concat = layers.Concatenate()([flat1, d2, flat3, flat4, flat5, flat6, flat7, flat8])

# DENSE LAYERS
dense1 = layers.Dense(512, activation="relu")(concat)

# DROPOUT LAYER
drop2 = layers.Dropout(0.2)(dense1)

# DENSE LAYER
dense2 = layers.Dense(256, activation="relu")(drop2)

# DROPOUT LAYER
drop2 = layers.Dropout(0.3)(dense2)

# DENSE LAYER
dense3 = layers.Dense(128, activation="relu")(drop2)

# DROPOUT LAYER
drop2 = layers.Dropout(0.4)(dense3)

# BATCHNORM LAYER
bn2  = layers.BatchNormalization()(drop2)

# DENSE LAYER
dense4 = layers.Dense(1, activation="linear")(bn2)

# MODEL
model =  Model(inputs=[inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8], outputs=dense4)

# SCHEDULE
def shedule(epoch,lr):
    if epoch<=2:
        return lr
    else:
        return lr*0.1

# CALLBACKS
lr = tf.keras.callbacks.LearningRateScheduler(shedule, verbose=1)
save = tf.keras.callbacks.ModelCheckpoint("content/drive/MyDrive/Blue Edtech/notebooks",
                                          monitor="val_root_mean_squared_error",
                                          mode="min",
                                          save_best_only=True,
                                          save_weights_only=True,
                                          verbose=1)
earlystop = tf.keras.callbacks.EarlyStopping(monitor="val_root_mean_squared_error",
                                             min_delta= 0.01, 
                                             patience=2,
                                             mode="min" )

model.compile(optimizer="adam",
              loss="mse",
              metrics=[tf.keras.losses.MeanAbsoluteError(), 
                       tfa.metrics.r_square.RSquare(),
                       tf.keras.metrics.RootMeanSquaredError(), 
                       tf.keras.metrics.mean_absolute_percentage_error,
                       tf.keras.metrics.mean_squared_logarithmic_error ])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 17)]         0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 245)]        0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 6)]          0           []                               
                                                                                              

In [4]:
class PrepData():

    def __init__(self,
                 features=['brand_name', 'category_1', 'category_2', 'category_3', 'name', 'item_description'],
                 max_lengths=[6, 3, 5, 7, 17, 245]):
        self.features = features
        self.max_lengths = max_lengths
        self.tokenizers = {}
        for feature in self.features:
            self.tokenizers[feature] = Tokenizer()

    def fillna(self, X):
        X.item_description = X.item_description.fillna("No description")
        X.name = X.name.fillna("No name")
        return X
    
    def fit(self, X, y=None):
        print('fitting_data')
        X = self.fillna(X)

        for feature in self.features:
            self.tokenizers[feature].fit_on_texts(X[feature].apply(str))
        print('data fitted')
        return self
    
    def transform(self, X, y=None):
        print('transforming data')
        item_condition = X.item_condition_id
        shipping = X.shipping
        output = [item_condition, shipping]
        X = self.fillna(X)

        for i, feature in enumerate(self.features):
            text_sequence = self.tokenizers[feature].texts_to_sequences(X[feature].apply(str))
            pad = pad_sequences(text_sequence, padding='post', maxlen=self.max_lengths[i])
            output.append(pad)
        
        print('data transformed')
        
        return output

In [28]:
pipe = Pipeline(steps=[
    ('tokenizer', PrepData()),
    ('model', model)
])

In [34]:
pipe.fit(base_train, 
         np.log(base_train.price), 
         model__validation_data=(x_test, y_test),
         model__epochs=10,
         model__batch_size=1024,
         model__callbacks=[save, lr, earlystop])


fitting_data
data fitted
transforming data
data transformed

Epoch 1: LearningRateScheduler setting learning rate to 1.0000001111620804e-06.
Epoch 1/10
Epoch 1: val_root_mean_squared_error improved from 0.48905 to 0.48636, saving model to content/drive/MyDrive/Blue Edtech/notebooks

Epoch 2: LearningRateScheduler setting learning rate to 1.0000001111620804e-06.
Epoch 2/10
Epoch 2: val_root_mean_squared_error improved from 0.48636 to 0.48546, saving model to content/drive/MyDrive/Blue Edtech/notebooks

Epoch 3: LearningRateScheduler setting learning rate to 1.0000001111620804e-06.
Epoch 3/10
Epoch 3: val_root_mean_squared_error improved from 0.48546 to 0.48477, saving model to content/drive/MyDrive/Blue Edtech/notebooks


Pipeline(steps=[('tokenizer', <__main__.PrepData object at 0x7f01f5e50990>),
                ('model',
                 <keras.engine.functional.Functional object at 0x7f01e743ae50>)])

In [35]:
y_pred2 = np.exp(pipe.predict(base_test))


transforming data
data transformed


In [7]:
def print_avaliacao(obs, pred):
    print('R² = %.3f' % metrics.r2_score(obs, pred))
    print('MAPE = %.3f %%' % (100 * metrics.mean_absolute_percentage_error(obs, pred)))
    print('MAE = U$S %.2f' % (metrics.mean_absolute_error(obs, pred)))
    print('RMSE = U$S %.2f' % metrics.mean_squared_error(obs, pred)**0.5)
    print('RMSLE = %.4f' % metrics.mean_squared_log_error(obs, pred,squared=False))

In [36]:
y_pred2 = np.exp(pipe.predict(base_test))

print_avaliacao(base_test.price, y_pred2)

R² = 0.408
MAPE = 38.882 %
MAE = U$S 10.49
RMSE = U$S 29.77
RMSLE = 0.4580


R² = 0.455
MAPE = 39.412 %
MAE = U$S 10.20
RMSE = U$S 28.58
RMSLE = 0.4457

In [None]:
rnn = pipe['model']
rnn.save('rnn_keras.h5') # salva somente a rnn
pipe.steps[1] = ('model', None) # remove modelo da pipeline pq o pickle não funciona com keras

with open(f'pipeline.pickle', 'wb') as handle:
  pickle.dump(pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
from tensorflow import keras

# Carrega o rnn
modelo_lido = keras.models.load_model('rnn_keras.h5', compile=False)

# Carrega a pipeline
with open(f'pipeline.pickle', 'rb') as handle:
  nova_pipeline = pickle.load(handle)

# Adiciona a rnn a nova pipeline
nova_pipeline.steps[1] = ('model', modelo_lido)

In [None]:
y_pred2 = np.exp(nova_pipeline.predict(base_test))

In [8]:
print_avaliacao(base_test.price, y_pred2)

R² = 0.408
MAPE = 38.882 %
MAE = U$S 10.49
RMSE = U$S 29.77
RMSLE = 0.4580
