# MercadoLibre 2019
## BERT on Keras

Tokenizar datasets

In [1]:
EXPERIMENT_NAME = "meli-BERTK" 
EXPERIMENT_VERSION = "v2-tokenizer"
LOG_DIR = "../logs/BERT"

## Version log

### v2-tokenizer

Genera features con tokens para el modelo

## Development

### Initialize

In [2]:
#%matplotlib inline

import sys
import os
#import re
import time
#import h5py
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
#import matplotlib.pyplot as plt

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense 
from tensorflow.keras.layers import Dropout, Input, Concatenate, Flatten
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import backend as K
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from sklearn.metrics import balanced_accuracy_score

In [3]:
#initialize env
#seeds to make reproducible
np.random.seed(12347)
tf.set_random_seed(12347)

pd.options.display.max_rows = 10

NROWS = 500000 #None for full dataset, int for top NROWS
#filenames and directories
DATASET_FN = "../data/test.csv"
OUTPUT_FN = "../data_processed/borra-test-features-uncased"
BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

MAX_TEXT_LENGTH = 256
MAX_SEQUENCE_LENGTH = 32

### Load and prepare datasets

In [4]:
df = pd.read_csv(DATASET_FN, nrows=NROWS)
df

Unnamed: 0,id,title,language
0,0,Kit Maternidade Bolsa-mala Baby/bebe Vinho Men...,portuguese
1,1,Trocador De Fraldas Fisher Price Feminino Rosa...,portuguese
2,2,Motor Ventoinha - Fiat Idea / Palio 1.8 - A 04...,portuguese
3,3,Amortecedor Mola Batente D Dir New Civic 14 - ...,portuguese
4,4,Cadeirinha De Carro Bebê Princesa Princess 9 A...,portuguese
...,...,...,...
246950,246950,Disco Freno Delantero Ford Escort 88/94 Nuevo,spanish
246951,246951,Radio Comunicador Walk Talk Baofeng 777s Profi...,portuguese
246952,246952,Calculadora De Escritorio Grande 150$,spanish
246953,246953,Conj Mesa P/ Sala De Jantar C/ 06 Cadeiras Ams...,portuguese


In [5]:
labels = None
output_dim = 0
if "category" in df.columns:
    output_dim = len(df["category"].unique())
    print(f"output_dim: {output_dim}")

    cat_dict = dict(zip(df["category"].unique(), np.arange(output_dim)))
    #labels = to_categorical(df["category"].map(cat_dict))
    with open(OUTPUT_FN+"-cat_dict.pickle", "wb") as fo:
        pickle.dump(cat_dict, fo, protocol=4)
    inverse_cat_dict = dict(zip(cat_dict.values(), cat_dict.keys()))
    with open(OUTPUT_FN+"-inv_cat_dict.pickle", "wb") as fo:
        pickle.dump(inverse_cat_dict, fo, protocol=4)
    labels = df["category"].map(cat_dict)
    with open(OUTPUT_FN+"-labels-index.pickle", "wb") as fo:
        pickle.dump(labels, fo, protocol=4)
    labels = to_categorical(labels)
    print(labels[:3])

## Prepare text attributes (BERT version)

In [6]:
# Initialize session
sess = tf.Session()
max_seq_length = MAX_SEQUENCE_LENGTH 

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.
  We use this class instead of `None` because treating `None` as padding
  battches could cause silent errors.
  """

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    bert_module =  hub.Module(BERT_PATH)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels)#.reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples

def tokenize_dataset(texts, labels=None):

    # Instantiate tokenizer
    tokenizer = create_tokenizer_from_hub_module()

    # Convert data to InputExample format
    texts = [t[:MAX_TEXT_LENGTH] for t in texts]
    if labels is None:
        labels = [""] * len(texts)
    examples = convert_text_to_examples(texts, labels)

    # Convert to features, returns (train_input_ids, train_input_masks, train_segment_ids, train_labels) 
    return convert_examples_to_features(tokenizer, 
                                        examples, 
                                        max_seq_length=max_seq_length)


In [7]:
(input_ids, input_masks, segment_ids, labels) = tokenize_dataset(df["title"], labels)
input_ids[:3]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








HBox(children=(IntProgress(value=0, description='Converting examples to features', max=246955, style=ProgressS…




array([[ 101, 1047, 1045, 1056, 1049, 1037, 1056, 1041, 1054, 1050, 1045,
        1040, 1037, 1040, 1041, 1038, 1051, 1048, 1055, 1037, 1011, 1049,
        1037, 1048, 1037, 1038, 1037, 1038, 1061, 1013, 1038,  102],
       [ 101, 1056, 1054, 1051, 1039, 1037, 1040, 1051, 1054, 1040, 1041,
        1042, 1054, 1037, 1048, 1040, 1037, 1055, 1042, 1045, 1055, 1044,
        1041, 1054, 1052, 1054, 1045, 1039, 1041, 1042, 1041,  102],
       [ 101, 1049, 1051, 1056, 1051, 1054, 1058, 1041, 1050, 1056, 1051,
        1045, 1050, 1044, 1037, 1011, 1042, 1045, 1037, 1056, 1045, 1040,
        1041, 1037, 1013, 1052, 1037, 1048, 1045, 1051, 1015,  102]])

In [8]:
input_masks[:3]

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [9]:
segment_ids[:3]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
labels[:3]

array(['', '', ''], dtype='<U1')

In [11]:
result ={'max_seq_length':max_seq_length,
         'max_text_length':MAX_TEXT_LENGTH,
         'nrows':NROWS,
         'BERT_PATH':BERT_PATH}
pickle.dump(result, open(OUTPUT_FN+"-info.pickle", "wb"), protocol=4)

In [12]:
with open(OUTPUT_FN+"-input_ids.pickle", "wb") as fo:
    pickle.dump(input_ids, fo, protocol=4)
with open(OUTPUT_FN+"-input_masks.pickle", "wb") as fo:
    pickle.dump(input_masks, fo, protocol=4)
with open(OUTPUT_FN+"-segment_ids.pickle", "wb") as fo:
    pickle.dump(segment_ids, fo, protocol=4)
print("OK")

OK


----