# MercadoLibre 2019
## BERT on Keras

In [1]:
# Experiment identifiers; both are embedded in the run id that names the
# log and saved-model directories of a run.
EXPERIMENT_NAME = "meli-BERTK" 
EXPERIMENT_VERSION = "v2_2"
# Root directory under which the TensorBoard logs of each run are written.
LOG_DIR = "../logs/BERT"

## Version log

### v2_2

- DS reliable (1M+)
- **Los resultados son buenos para un eval set de solo reliable (ver abajo) pero malos para uno general**
    - Probado en `eval_5k` con notebook de evaluación
    - Correctas: 1129/5000 = 0.2258
    - BAS = 0.16832066415719524

### v2_1

- Dataset 1M
- Desde la 2_0 ya se corrió con sparse_categorical_crossentropy

### v2_0

Origen: v1_2

- Se separó la tokenización y creación de features para optimizar memoria
- Corrección de lo que parece un bug en la conversión de etiquetas

### v1_2

- rel y unrel
- bert uncased
- 1M/train-full
- max_seq_len = 64
- val y test set 0.025

### v1_1

Solo con campos "reliable"

### v1_0_1

Misma versión con training set de 500,000, 7 épocas

### v1_0 

Viene de movie2vec-Plot_BERTK-v1


## Development

### Initialize

In [None]:
#%matplotlib inline

import sys
import os
#import re
import time
#import h5py
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
#import matplotlib.pyplot as plt

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense 
from tensorflow.keras.layers import Dropout, Input, Concatenate, Flatten
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import backend as K
#from bert.tokenization import FullTokenizer
#from tqdm import tqdm_notebook
from sklearn.metrics import balanced_accuracy_score

In [None]:
# --- Reproducibility ---------------------------------------------------------
# NumPy and TensorFlow are seeded with the same fixed value.
# TODO: check that runs are actually reproducible with these seeds.
SEED = 12347
np.random.seed(SEED)
tf.set_random_seed(SEED)

# --- Display -----------------------------------------------------------------
# Keep DataFrame previews short.
pd.set_option("display.max_rows", 7)

# --- Files and directories ---------------------------------------------------
# Common path prefix of the pickled, tokenized dataset artifacts.
TOKENIZED_DATASET_FN = "../data/tokenized/train-reliable-features-uncased"
# Directory that receives one sub-directory per training run.
SAVED_MODEL_DIR = "../saved_models/"

# --- Data usage --------------------------------------------------------------
# Fraction of the samples reserved for the test set
# (the original note lists 0.10 as a reference value).
TEST_SET_SPLIT = 0.005
# Fraction of the training data held out for validation
# (the original note lists 0.05 as a reference value).
VALIDATION_SET_SPLIT = 0.005

### Load and prepare tokenized datasets

In [None]:
def load(suffix, prefix=None):
    """Unpickle one artifact of the tokenized dataset.

    Args:
        suffix: Text appended to the path prefix to form the file name,
            e.g. ``"-input_ids.pickle"``.
        prefix: Path prefix shared by the artifact files. When omitted, the
            module-level ``TOKENIZED_DATASET_FN`` is used.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    if prefix is None:
        prefix = TOKENIZED_DATASET_FN
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only use this on artifacts produced by a trusted pipeline.
    with open(prefix + suffix, "rb") as fin:
        return pickle.load(fin)

In [None]:
# Names of the pickled artifacts; each one is read from "-<name>.pickle".
data_to_load = ['input_ids', 'input_masks', 'segment_ids',
                'cat_dict', 'inv_cat_dict', 'info']
data = {name: load(f"-{name}.pickle") for name in data_to_load}

# The labels are stored exactly as loaded (no to_categorical conversion).
labels_index = load("-labels-index.pickle")
data['labels'] = labels_index
data['labels']

### Split datasets

In [None]:
# TODO: the samples are NOT shuffled before being split.
num_samples = len(data['input_ids'])

# Sizes derived from the configured split fraction.
num_test_samples = int(num_samples * TEST_SET_SPLIT)
num_training_samples = num_samples - num_test_samples

# WARNING (v2_2): the derived sizes are overridden so that this version runs
# with a reduced, fixed number of examples.
num_training_samples = 500000
num_test_samples = 5000

# Training reads the first `num_training_samples` rows and testing reads the
# last `num_test_samples` rows. If both together exceed the dataset size the
# two slices overlap and test rows leak into training, so fail loudly instead.
if num_training_samples + num_test_samples > num_samples:
    raise ValueError(
        f"Train ({num_training_samples}) + test ({num_test_samples}) samples "
        f"exceed the dataset size ({num_samples}); the splits would overlap."
    )

print(f"Training samples: {num_training_samples}")
print(f"Test samples: {num_test_samples}")

### Build Model

In [None]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a TF-Hub BERT module and returns its pooled output.

    The layer takes three tensors (input ids, input mask, segment ids), casts
    them to int32, feeds them to the hub module through its "tokens" signature
    and returns the module's "pooled_output".

    Args:
        bert_path: Handle (URL or path) of the TF-Hub BERT module to load.
        n_fine_tune_layers: How many of the module's variables are registered
            as trainable. NOTE: despite the name, this counts *variables*
            (taken from the end of the variable list, after dropping the
            "/cls/" ones), not transformer blocks. Values <= 0 freeze every
            variable.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, bert_path, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Width reported by compute_output_shape. Hard-coded; assumes the
        # module's pooled output has 768 units (true for H-768 modules) —
        # TODO confirm when using a different module.
        self.output_size = 768
        self.bert_path = bert_path
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its variables as weights."""
        self.bert = hub.Module(
            self.bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )

        # Variables whose name contains "/cls/" are never fine-tuned.
        candidate_vars = [
            var for var in self.bert.variables if "/cls/" not in var.name
        ]

        # Keep only the last `n_fine_tune_layers` variables. A plain
        # `[-0:]` slice would return the whole list, i.e. asking for zero
        # would make everything trainable, so that case is handled explicitly.
        if self.n_fine_tune_layers > 0:
            trainable_vars = candidate_vars[-self.n_fine_tune_layers:]
        else:
            trainable_vars = []

        # Register the selected variables as trainable weights ...
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # ... and every remaining module variable as non-trainable.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]``.

        Returns:
            The "pooled_output" tensor of the hub module.
        """
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        """Return ``(batch, output_size)`` given the list of input shapes."""
        return (input_shape[0], self.output_size)

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt.

        Without this, ``load_model(..., custom_objects={'BertLayer': BertLayer})``
        cannot re-create the layer because ``bert_path`` is not part of the
        base layer config.
        """
        config = super(BertLayer, self).get_config()
        config.update({
            "bert_path": self.bert_path,
            "n_fine_tune_layers": self.n_fine_tune_layers,
        })
        return config

In [8]:
max_seq_length = data['info']['max_seq_length']
# One output unit per class index. Sized from the largest label index (+1)
# rather than the number of distinct labels, so that every sparse label is a
# valid output index even if some classes are absent from this dataset.
output_dim = int(data['labels'].max()) + 1

in_id = Input(shape=(max_seq_length,), name="IN_IDS")
in_mask = Input(shape=(max_seq_length,), name="IN_MASKS")
in_segment = Input(shape=(max_seq_length,), name="IN_SEGM")

inputs = [in_id, in_mask, in_segment]
fo = BertLayer(bert_path=data['info']['BERT_PATH'],
               n_fine_tune_layers=3)(inputs)
# Optional hidden layers, currently disabled:
#fo = Dense(1024, activation="relu", name="DEN_1024")(fo)
#fo = Dropout(0.5)(fo)
#fo = Dense(512, activation="relu", name="DEN_512")(fo)
#fo = Dropout(0.5)(fo)
#fo = Dense(256, activation="relu", name="DEN_256")(fo)

# Classification head. It must be present: without it the model outputs the
# raw 768-d pooled BERT vector, `output_dim` is unused, and the sparse
# cross-entropy loss is computed on something that is not a class distribution.
fo = Dense(output_dim, activation="softmax", name="DEN_OUT")(fo)

model = Model(inputs=inputs, outputs=fo)
# Labels are integer class indices, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam')
model.summary()

W0919 18:32:32.664514 140616766973760 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
IN_IDS (InputLayer)             [(None, 32)]         0                                            
__________________________________________________________________________________________________
IN_MASKS (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
IN_SEGM (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          110104890   IN_IDS[0][0]                     
                                                                 IN_MASKS[0][0]               

### Train

In [9]:
# Run id: <experiment name>-<version>-<yymmdd_HHMM of the local start time>.
# (A single strftime call is enough; formatting its result a second time
# was a no-op.)
runid = "%s-%s-%s" % (
    EXPERIMENT_NAME,
    EXPERIMENT_VERSION,
    time.strftime('%y%m%d_%H%M', time.localtime()),
)

# Create the directory for this run's saved model. os.path.join avoids the
# doubled "/" that plain concatenation produced (SAVED_MODEL_DIR already ends
# with a slash), and exist_ok removes the check-then-create race.
directory = os.path.join(SAVED_MODEL_DIR, runid)
os.makedirs(directory, exist_ok=True)

print("Python: "+str(sys.version))
print("Tensorflow version: "+tf.__version__)
print("Keras version: "+tf.keras.__version__)
print("Embeddings: "+data['info']['BERT_PATH'])
print("RUNID: "+runid)

Python: 3.6.8 (default, Jan 14 2019, 11:02:34) 
[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]
Tensorflow version: 1.14.0
Keras version: 2.2.4-tf
Embeddings: https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1
RUNID: meli-BERTK-v2_2-190919_1832


In [10]:
# Create a TF1 session and run the local-variable, global-variable and
# lookup-table initializers in it.
sess = tf.Session()
sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
# Register this same, already-initialized session as the one used by the
# Keras backend.
K.set_session(sess)

In [11]:
print('Training...')

# Callbacks: TensorBoard logging under this run's id, and a checkpoint that
# keeps only the model with the lowest validation loss.
tensorboard = TensorBoard(log_dir=LOG_DIR+'/'+runid)
checkpoint = ModelCheckpoint(
    directory+"/model.hdf5",
    monitor='val_loss',
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Train on the first `num_training_samples` rows and time the whole fit.
t0 = time.time()
print("Start:"+time.strftime("%Y%m%d_%H%M",time.localtime()))

train_inputs = [
    data[key][:num_training_samples]
    for key in ('input_ids', 'input_masks', 'segment_ids')
]
train_labels = data['labels'][:num_training_samples]

model.fit(
    train_inputs,
    train_labels,
    batch_size=128,  # 448 and 384 were also tried
    epochs=6,
    initial_epoch=0,
    validation_split=VALIDATION_SET_SPLIT,
    verbose=1,
    callbacks=[tensorboard, checkpoint],
)

tfin = time.time()
print("End:" + time.strftime("%Y%m%d_%H%M",time.localtime()))
# Elapsed training time in seconds.
print(tfin-t0)

Training...
Start:20190919_1832
Train on 497500 samples, validate on 2500 samples


W0919 18:32:35.822645 140616766973760 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1205: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/6
Epoch 00001: val_loss improved from inf to 3.03765, saving model to ../saved_models//meli-BERTK-v2_2-190919_1832/model.hdf5
Epoch 2/6
Epoch 00002: val_loss improved from 3.03765 to 2.23158, saving model to ../saved_models//meli-BERTK-v2_2-190919_1832/model.hdf5
Epoch 3/6
Epoch 00003: val_loss improved from 2.23158 to 1.99840, saving model to ../saved_models//meli-BERTK-v2_2-190919_1832/model.hdf5
Epoch 4/6
  5504/497500 [..............................] - ETA: 1:14:36 - loss: 2.1239

KeyboardInterrupt: 

### Test

In [12]:
# Predict with the in-memory model on the last `num_test_samples` rows.
# (Reloading the checkpoint through load_model with
# custom_objects={'BertLayer': BertLayer} is left disabled.)
test_inputs = [
    data[key][-num_test_samples:]
    for key in ('input_ids', 'input_masks', 'segment_ids')
]
predictions = model.predict(test_inputs, verbose=1)



In [13]:
# Index of the highest-scoring output for the first test sample, shown as
# its category name.
i = predictions[0].argmax()
data['inv_cat_dict'][i]

'BOOKS'

In [14]:
# True label index of the first test sample, shown as its category name.
test_label_indices = data['labels'][-num_test_samples:]
it = test_label_indices.iloc[0]
data['inv_cat_dict'][it]

'FACIAL_SKIN_CARE_PRODUCTS'

In [15]:
# Category name of the arg-max output for every test sample; preview ten.
predicted_categories = [
    data['inv_cat_dict'][idx] for idx in np.argmax(predictions, axis=1)
]
predicted_categories[:10]

['BOOKS',
 'UKULELES',
 'FACIAL_SKIN_CARE_PRODUCTS',
 'WHISKEYS',
 'HARD_DRIVES_AND_SSDS',
 'AIRSOFT_GUNS',
 'AM_FM_RADIOS',
 'LIPSTICKS',
 'DIECAST_VEHICLES',
 'STARTERS']

In [16]:
# True category name of every test sample (labels are class indices, so they
# are looked up directly); preview ten.
test_categories = [
    data['inv_cat_dict'][label]
    for label in data['labels'][-num_test_samples:]
]
test_categories[:10]

['FACIAL_SKIN_CARE_PRODUCTS',
 'UKULELES',
 'SCULPTURES',
 'WHISKEYS',
 'CELLPHONE_COVERS',
 'AIRSOFT_GUNS',
 'AM_FM_RADIOS',
 'LIPSTICKS',
 'CAMERA_CHARGERS',
 'SOLDERING_MACHINES']

In [17]:
# Side-by-side table of true and predicted category per test sample.
test_set = pd.DataFrame(
    dict(category=test_categories, predicted=predicted_categories)
)
test_set

Unnamed: 0,category,predicted
0,FACIAL_SKIN_CARE_PRODUCTS,BOOKS
1,UKULELES,UKULELES
2,SCULPTURES,FACIAL_SKIN_CARE_PRODUCTS
...,...,...
4997,DRONES,OPERATING_SYSTEMS
4998,HAIR_CLIPPERS,HAIR_CLIPPERS
4999,HAIR_CLIPPERS,CAMERA_TRIPODS


In [18]:
# Flag the rows whose prediction equals the true category; the sum is the
# number of correct predictions.
test_set["true_positive"] = test_set["category"].eq(test_set["predicted"])
test_set["true_positive"].sum()

3018

In [19]:
# First ten rows of the results table.
test_set.head(10)

Unnamed: 0,category,predicted,true_positive
0,FACIAL_SKIN_CARE_PRODUCTS,BOOKS,False
1,UKULELES,UKULELES,True
2,SCULPTURES,FACIAL_SKIN_CARE_PRODUCTS,False
...,...,...,...
7,LIPSTICKS,LIPSTICKS,True
8,CAMERA_CHARGERS,DIECAST_VEHICLES,False
9,SOLDERING_MACHINES,STARTERS,False


In [22]:
# Balanced accuracy of the predictions over the test rows.
balanced_accuracy_score(
    y_true=test_set['category'],
    y_pred=test_set['predicted'],
)

0.4038453218909564

In [23]:
# Store the per-sample results next to this run's saved model.
test_set.to_csv(f"{directory}/results.csv")