In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.models import Model

!pip install tensorflow_addons
import tensorflow_addons as tfa
from sklearn import metrics

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline

import pickle


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.18.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.6 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.18.0


In [2]:
base_train = pd.read_csv ('/content/drive/MyDrive/dados_treino.csv') 
base_test = pd.read_csv ('/content/drive/MyDrive/dados_teste.csv') 
base_valid = pd.read_csv ('/content/drive/MyDrive/dados_validacao.csv') 

In [None]:
base_train.head()

Unnamed: 0,name,category_1,category_2,category_3,item_condition_id,brand_name,price,shipping,item_description,date,stock
0,huge xxl tapestry,Home,Home Décor,Tapestries,1,No Brand,23.0,1,"HUGE XXL BRAND NEW TAPESTRY 89""by85"" Beautiful...",19-1-2018,49
1,small camo christmas tree skirt,Home,Seasonal Décor,Christmas,2,No Brand,8.0,1,Small Christmas tree skirt,4-1-2018,17
2,dooney bourke,Women,Women's Accessories,Wallets,1,Dooney & Bourke,25.0,1,New,17-6-2018,1
3,be wild inspired by sauvage by dior,Beauty,Fragrance,Men,1,No Brand,8.0,1,Diamond Collection's Be Wild 3.4 ounce large b...,16-8-2018,40
4,iphone plus cases,Electronics,Cell Phones & Accessories,"Cases, Covers & Skins",3,No Brand,14.0,0,Love these speck cases! Great for iPhones. Ask...,30-7-2018,1


In [4]:
# arquitetura do deep learning

tf.keras.backend.clear_session()

# ITEM CONDITION ID
inp1 = layers.Input(shape=(1)) # INPUT 1 
emb1  = layers.Embedding(6, 10, input_length=1)(inp1) # EMBEDDING 1
flat1 = layers.Flatten()(emb1) # FLATTEN

# SHIPPING 
inp2 = layers.Input(shape=(1)) # INPUT 2 
d2 = layers.Dense(10, activation="relu")(inp2) # DENSE LAYER 2

# CATEGORY_1
inp4 = layers.Input(shape=(3)) # INPUT 4
emb4 = layers.Embedding(15, 16, input_length=3)(inp4) # EMBEDDING 4
flat4 = layers.Flatten()(emb4) # FLATTEN 

# CATEGORY_2
inp5= layers.Input(shape=(5)) # INPUT 5
emb5 = layers.Embedding(146, 16, input_length=5)(inp5) # EMBEDDING 5
flat5 = layers.Flatten()(emb5) # FLATTEN

# CATEGORY_3
inp6= layers.Input(shape=(7)) # INPUT 6 
emb6 = layers.Embedding(963, 40 ,input_length=7)(inp6) # EMBEDDING 6
flat6 = layers.Flatten()(emb6) # FLATTEN

# ITEM_NAME_DESCRIPTION
inp7= layers.Input(shape=(254)) # INPUT 7
emb7 = layers.Embedding(152273, 20, input_length=254)(inp7) # EMBEDDING 7
lstm7 = layers.GRU(64, return_sequences=True)(emb7) # GRU
flat7 = layers.Flatten()(lstm7) # FLATTEN

# CONCATENAÇÃO
concat = layers.Concatenate()([flat1, d2, flat4, flat5, flat6, flat7])

# DENSE LAYERS
dense1 = layers.Dense(512, activation="relu")(concat)

# DROPOUT LAYER
drop2 = layers.Dropout(0.2)(dense1)

# DENSE LAYER
dense2 = layers.Dense(256, activation="relu")(drop2)

# DROPOUT LAYER
drop2 = layers.Dropout(0.2)(dense2)

# DENSE LAYER
dense3 = layers.Dense(128, activation="relu")(drop2)

# DROPOUT LAYER
drop2 = layers.Dropout(0.2)(dense3)

# BATCHNORM LAYER
bn2  = layers.BatchNormalization()(drop2)

# DENSE LAYER
dense4 = layers.Dense(1, activation="linear")(bn2)

# MODEL
model =  Model(inputs=[inp1, inp2, inp4, inp5, inp6, inp7], outputs=dense4)

# SCHEDULE
def shedule(epoch,lr):
    if epoch<=2:
        return lr
    else:
        return lr*0.1

# CALLBACKS
lr = tf.keras.callbacks.LearningRateScheduler(shedule, verbose=1)
save = tf.keras.callbacks.ModelCheckpoint(".",
                                          monitor="val_root_mean_squared_error",
                                          mode="min",
                                          save_best_only=True,
                                          save_weights_only=True,
                                          verbose=1)
earlystop = tf.keras.callbacks.EarlyStopping(monitor="val_root_mean_squared_error",
                                             min_delta= 0.01, 
                                             patience=2,
                                             mode="min" )

model.compile(optimizer="adam",
              loss="mse",
              metrics=[tf.keras.losses.MeanAbsoluteError(), 
                       tfa.metrics.r_square.RSquare(),
                       tf.keras.metrics.RootMeanSquaredError(), 
                       tf.keras.metrics.mean_absolute_percentage_error,
                       tf.keras.metrics.mean_squared_logarithmic_error ])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 215)]        0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 5)]          0           []                               
                                                                                              

In [3]:
class PrepData():

    def __init__(self,
                 features=['category_1', 'category_2', 'category_3', 'name_brand_description'],
                 max_lengths=[3, 5, 7, 215]):
        self.features = features
        self.max_lengths = max_lengths
        self.tokenizers = {}
        for feature in self.features:
            self.tokenizers[feature] = Tokenizer()

    def fillna(self, X):
        X.item_description = X.item_description.fillna("No description")
        X.name = X.name.fillna("No name")
        return X
    
    def fit(self, X, y=None):
        print('fitting_data')
        X = self.fillna(X)
        X['name_brand_description'] = self.brand_name_description(X)

        for feature in self.features:
            self.tokenizers[feature].fit_on_texts(X[feature].apply(str))
        
        print(feature)
        print('data fitted')
        return self
    
    def brand_name_description(self, X):
      return X['name'] + ' ' + X['brand_name'] + " " + X['item_description']
    
    def transform(self, X, y=None):
        print('transforming data')
        item_condition = X.item_condition_id
        shipping = X.shipping
        output = [item_condition, shipping]
        X = self.fillna(X)
        X['name_brand_description'] = self.brand_name_description(X)

        for i, feature in enumerate(self.features):
            text_sequence = self.tokenizers[feature].texts_to_sequences(X[feature].apply(str))
            pad = pad_sequences(text_sequence, padding='post', maxlen=self.max_lengths[i])
            output.append(pad)
        
        print('data transformed')
        
        return output

In [20]:
pipe = Pipeline(steps=[
    ('tokenizer', PrepData()),
    ('model', model)
])



In [6]:
pipe = Pipeline(steps=[
    ('tokenizer', PrepData()),
])

In [None]:
pipe.fit(base_train)

In [None]:
ans = pipe.transform(base_train)
ans

In [None]:
ans[3]

In [21]:
pipe.fit(base_train, 
              np.log(base_train.price), 
              model__epochs=10,
              model__batch_size=1024,
              model__callbacks=[save, lr, earlystop])


fitting_data
name_brand_description
data fitted
transforming data
data transformed

Epoch 1: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/10




Epoch 2: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/10




Epoch 3: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/10




Epoch 4: LearningRateScheduler setting learning rate to 0.00010000000474974513.
Epoch 4/10




Epoch 5: LearningRateScheduler setting learning rate to 1.0000000474974514e-05.
Epoch 5/10




Epoch 6: LearningRateScheduler setting learning rate to 1.0000000656873453e-06.
Epoch 6/10




Epoch 7: LearningRateScheduler setting learning rate to 1.0000001111620805e-07.
Epoch 7/10




Epoch 8: LearningRateScheduler setting learning rate to 1.000000082740371e-08.
Epoch 8/10




Epoch 9: LearningRateScheduler setting learning rate to 1.000000082740371e-09.
Epoch 9/10




Epoch 10: LearningRateScheduler setting learning rate to 1.000000082740371e-10.
Epoch 10/10





Pipeline(steps=[('tokenizer', <__main__.PrepData object at 0x7f6f0012cf90>),
                ('model',
                 <keras.engine.functional.Functional object at 0x7f6f00043950>)])

## Testando o modelo

In [22]:
def print_avaliacao(obs, pred):
    print('R² = %.3f' % metrics.r2_score(obs, pred))
    print('MAPE = %.3f %%' % (100 * metrics.mean_absolute_percentage_error(obs, pred)))
    print('MAE = U$S %.2f' % (metrics.mean_absolute_error(obs, pred)))
    print('RMSE = U$S %.2f' % metrics.mean_squared_error(obs, pred)**0.5)
    print('RMSLE = %.4f' % metrics.mean_squared_log_error(obs, pred,squared=False))

In [24]:
y_pred2 = np.exp(pipe.predict(base_train))

#print_avaliacao(base_train.price, y_pred2)

transforming data


In [27]:
print_avaliacao(base_train.price, y_pred2)

R² = 0.653
MAPE = 30.579 %
MAE = U$S 8.10
RMSE = U$S 22.73
RMSLE = 0.3616


In [28]:
y_pred2 = np.exp(pipe.predict(base_test))

print_avaliacao(base_test.price, y_pred2)

transforming data
data transformed
R² = 0.537
MAPE = 37.351 %
MAE = U$S 9.74
RMSE = U$S 26.34
RMSLE = 0.4348


## Salvando os dumps do modelo.

In [29]:
import pickle
rnn = pipe['model']
rnn.save('rnn_keras.h5') # salva somente a rnn
pipe.steps[1] = ('model', None) # remove modelo da pipeline pq o pickle não funciona com keras


#Faz o dump do pipeline
with open(f'pipeline.pickle', 'wb') as handle:
  pickle.dump(pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Carregando o modelo a partir dos dumps salvos

In [30]:
from tensorflow import keras

# Carrega o rnn
modelo_lido = keras.models.load_model('rnn_keras.h5', compile=False)

# Carrega a pipeline
with open(f'pipeline.pickle', 'rb') as handle:
  nova_pipeline = pickle.load(handle)

# Adiciona a rnn a nova pipeline
nova_pipeline.steps[1] = ('model', modelo_lido)

## Testando o modelo carregado a partir dos dumps

In [None]:
base_test.shape, base_test.dropna().shape

In [31]:
y_pred2 = np.exp(nova_pipeline.predict(base_test))
print_avaliacao(base_test.price, y_pred2)


transforming data
data transformed
R² = 0.537
MAPE = 37.351 %
MAE = U$S 9.74
RMSE = U$S 26.34
RMSLE = 0.4348


O script final necessita ter a class PrepData para poder executar o modelo.

In [None]:
len(y_pred2[np.where(y_pred2 == y_pred2.max())])