In [1]:
# !pip install transformers -q

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.backend as K

from transformers import *

In [3]:
tf.__version__

'2.2.0'

In [4]:
from transformers import TFBertModel

In [49]:
bert = 'bert-base-cased'

In [50]:
tokenizer = BertTokenizer.from_pretrained(bert)


In [143]:
config = BertConfig()
config.vocab_size = 28996 # pretrained size
config.output_hidden_states = False
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

In [256]:
bert_model = TFBertModel.from_pretrained(bert, config=config)

input_ids = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_ids = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

# sequence_output, pooled_output, (hidden_states), (attentions)
context_embedding = bert_model(input_ids)[0]

decoder = tf.keras.layers.LSTM(4, return_sequences=True)(context_embedding)
# decoder, final_memory_state, final_carry_state = tf.keras.layers.LSTM(
#     4, return_sequences=True, return_state=True)(context_embedding)

decoded = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(tokenizer.vocab_size + 1, activation='softmax'))(decoder)

model = tf.keras.Model(inputs=input_ids, outputs=decoded)

model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_token (InputLayer)     [(None, 128)]             0         
_________________________________________________________________
tf_bert_model_9 (TFBertModel ((None, 128, 768), (None, 108310272 
_________________________________________________________________
lstm_17 (LSTM)               (None, 128, 4)            12368     
_________________________________________________________________
time_distributed_9 (TimeDist (None, 128, 28997)        144985    
Total params: 108,467,625
Trainable params: 108,467,625
Non-trainable params: 0
_________________________________________________________________


In [235]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [236]:
sentences = ["hello my name is Rowan"]
tokens = tokenizer.encode_plus(sentences)
tokens

{'input_ids': [101, 100, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [237]:
prediction = model.predict(tokens['input_ids'])



In [238]:
prediction.shape

(3, 1, 28997)

In [225]:
prediction

array([[[3.4487486e-05, 3.4515844e-05, 3.4374112e-05, ...,
         3.4576788e-05, 3.4483124e-05, 3.4550892e-05]],

       [[3.4421519e-05, 3.4500699e-05, 3.4452059e-05, ...,
         3.4421271e-05, 3.4399454e-05, 3.4439017e-05]],

       [[3.4485805e-05, 3.4514011e-05, 3.4375415e-05, ...,
         3.4573593e-05, 3.4483448e-05, 3.4547324e-05]]], dtype=float32)

In [239]:
model.fit(x=tokens['input_ids'], y = tokens['input_ids'])



<tensorflow.python.keras.callbacks.History at 0x1e213ca10>

## Generate Sequence Attempt

In [258]:
type(bert_model.get_output_embeddings())

NoneType

In [259]:
bert_model_lm = TFBertForMaskedLM.from_pretrained(bert, config=config)

In [262]:
type(bert_model.get_output_embeddings())

NoneType

## Encoder Decoder Attempt

In [263]:
from transformers import EncoderDecoderModel, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert

# forward
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

# training
loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2]

# generation
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [268]:
tokenizer.decode(input_ids[0])

'[CLS] hello, my dog is cute [SEP]'

In [264]:
generated

tensor([[   0, 6506, 6506, 6506, 6506, 6506, 2666, 2666, 2666, 2666, 2666, 4618,
         4618, 4618, 4618, 4618, 4618, 4618, 4618, 4618]])

In [265]:
tokenizer.decode(generated[0])

'[PAD] leon leon leon leon leonieieieieie shall shall shall shall shall shall shall shall shall'