# BERT: As one of Autoencoding Language Models 

In [1]:
import os
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install transformers

In [None]:
# !pip install tokenizers

In [None]:
# os.chdir("drive/My Drive/data/")

In [10]:
dataset_path = os.path.expanduser('~/datasets/nlp/')
os.listdir(dataset_path)

['IMDB Dataset.csv', 'IMDB50K.zip']

In [3]:
import pandas as pd
imdb_df = pd.read_csv(os.path.join(dataset_path,"IMDB Dataset.csv"))
reviews = imdb_df.review.to_string(index=None) 
with open("corpus_imdb.txt", "w") as f: 
    f.writelines(reviews) 

In [4]:
from tokenizers import BertWordPieceTokenizer
bert_wordpiece_tokenizer = BertWordPieceTokenizer() 
bert_wordpiece_tokenizer.train("corpus_imdb.txt") 






In [5]:
bert_wordpiece_tokenizer.get_vocab()

{'org': 12125,
 'der': 2960,
 'dev': 3472,
 'sugi': 14805,
 'fin': 602,
 'fingersmith': 17222,
 '##verting': 14638,
 'operas': 10776,
 'roll': 2712,
 'fix': 15303,
 '##bles': 7995,
 'hilarity': 10600,
 'reminder': 16428,
 'churn': 10098,
 'louisiana': 17109,
 'wasted': 2269,
 'jumb': 11426,
 'haters': 10863,
 'jail': 13998,
 'anticipate': 17216,
 'mario': 3951,
 '##bole': 15909,
 'pointed': 9137,
 'und': 526,
 '##sson': 14942,
 'harsh': 4572,
 'indepe': 16003,
 'pal': 2261,
 'wilder': 6462,
 'unfortunat': 10654,
 '##ching': 1652,
 'grateful': 9003,
 'ange': 3367,
 'bim': 13873,
 '##oman': 5498,
 'drool': 12248,
 'vo': 2650,
 'house': 1047,
 'brenda': 14647,
 '1905': 17544,
 '##atory': 8798,
 'busby': 12881,
 'damn': 3727,
 'rochon': 12135,
 'pub': 9827,
 '##outh': 14727,
 'happy': 2806,
 'port': 1380,
 'baseball': 4271,
 '##joy': 5209,
 'parr': 7474,
 '##sem': 2617,
 'ff': 11396,
 'source': 17525,
 'into': 596,
 'awaiting': 13174,
 '##the': 7290,
 '##trow': 9908,
 '##math': 9883,
 'ull

In [6]:
!mkdir tokenizer
bert_wordpiece_tokenizer.save_model("tokenizer")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: /home/guy/anaconda3/envs/mastrans/lib/libtinfo.so.6: no version information available (required by /bin/bash)


['tokenizer/vocab.txt']

In [7]:
tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [8]:
tokenized_sentence = tokenizer.encode("Oh it works just fine")
tokenized_sentence.tokens

['[CLS]', 'oh', 'it', 'works', 'just', 'fine', '[SEP]']

In [9]:
tokenized_sentence = tokenizer.encode("ohoh i thougt it might be workingg well")
tokenized_sentence.tokens

['[CLS]',
 'oh',
 '##o',
 '##h',
 'i',
 'thoug',
 '##t',
 'it',
 'might',
 'be',
 'working',
 '##g',
 'well',
 '[SEP]']

Now that we know how to train a tokenizer and save it, we can proceed to train BERT.
for this step we'll use the `BertTokenizerFast` to load the tokenizer.  
We have used `BertTokenizerFast` because it is suggested by the HuggingFace documentation. There is also `BertTokenizer`, which, according to the definition from the library documentation, is not implemented as fast as the fast version. In most of the pretrained models' documentations and cards, it is highly recommended to use the `BertTokenizerFast` version.


In [10]:
from transformers import BertTokenizerFast 
tokenizer = BertTokenizerFast.from_pretrained("tokenizer") 

In [11]:
# preparing the corpus for faster training
from transformers import LineByLineTextDataset 
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="corpus_imdb.txt", block_size=128) 



In [12]:
dataset

<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x7f1fdfd546d0>

it is required to provide a data collator for masked language modeling
The data collator gets the data and prepares it for the training. For example, the data collator above takes data and prepares it for masked language modeling with a probability of 0.15. The purpose of using such a mechanism is to do the preprocessing on the fly, which makes it possible to use fewer resources. On the other hand, it slows down the training process because each sample has to be preprocessed on the fly at training time

In [13]:
from transformers import DataCollatorForLanguageModeling 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) 

In [14]:
from transformers import TrainingArguments 
training_args = TrainingArguments(output_dir="BERT", overwrite_output_dir=True, num_train_epochs=1, per_device_train_batch_size=128) 

In [15]:
from transformers import BertConfig, BertForMaskedLM 
bert = BertForMaskedLM(BertConfig()) 

In [16]:
from transformers import Trainer 
trainer = Trainer(model=bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 

In [17]:
trainer.train()

***** Running training *****
  Num examples = 50022
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 391


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=391, training_loss=5.370747207680626, metrics={'train_runtime': 213.2744, 'train_samples_per_second': 234.543, 'train_steps_per_second': 1.833, 'total_flos': 635727015135000.0, 'train_loss': 5.370747207680626, 'epoch': 1.0})

In [18]:
trainer.save_model("MyBERT")

Saving model checkpoint to MyBERT
Configuration saved in MyBERT/config.json
Model weights saved in MyBERT/pytorch_model.bin


Up to this point, you have learned how you can train BERT from scratch for any specific language that you desire. You've learned how to train the tokenizer and BERT model using the corpus you have prepared.  

BertConfig determines the architecture and hyper parameters.

In [19]:
from transformers import BertConfig 
BertConfig() 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

we can use other configs, e.g. according to the following table:   
![bert_configs](bert_configs.png)

In [20]:
tiny_bert_config = BertConfig(max_position_embeddings=512, hidden_size=128, num_attention_heads=2, num_hidden_layers=2, intermediate_size=512) 
tiny_bert_config 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [21]:
tiny_bert = BertForMaskedLM(tiny_bert_config) 
trainer = Trainer(model=tiny_bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 
trainer.train() 

***** Running training *****
  Num examples = 50022
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 391


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=391, training_loss=8.89531299952046, metrics={'train_runtime': 19.3855, 'train_samples_per_second': 2580.385, 'train_steps_per_second': 20.17, 'total_flos': 3295103391000.0, 'train_loss': 8.89531299952046, 'epoch': 1.0})

Up to this point, you have learned how to train your own model from scratch, but it is essential to note that using the datasets library is a better choice when dealing with datasets for training language models or leveraging it to perform task-specific training

The BERT language model can also be used as an embedding layer combined with any deep learning model. For example, you can load any pretrained BERT model or your own version that has been trained in the previous step. The following code shows how you must load it to be used in a Keras model:

In [1]:
from transformers import TFBertModel, BertTokenizerFast 
bert = TFBertModel.from_pretrained("bert-base-uncased") 
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 
bert.layers 

2022-08-12 13:25:42.478465: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 13:25:42.479666: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 13:25:42.480276: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-12 13:25:42.480894: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7f79e5db4d30>]

As you can see, there is just a single layer from TFBertMainLayer, which you can access within your Keras model

In [23]:
tokenized_text = tokenizer.batch_encode_plus(["hello how is it going with you","lets test it"], return_tensors="tf", max_length=256, truncation=True, pad_to_max_length=True) 
bert(tokenized_text) 



TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 256, 768), dtype=float32, numpy=
array([[[ 1.00471467e-01,  6.77025914e-02, -8.33594650e-02, ...,
         -4.93304461e-01,  1.16539821e-01,  2.26647347e-01],
        [ 3.23624939e-01,  3.70718688e-01,  6.14686549e-01, ...,
         -6.27268553e-01,  3.79082203e-01,  7.05308020e-02],
        [ 1.99534073e-01, -8.75509441e-01, -6.47871345e-02, ...,
         -1.28083006e-02,  3.07651937e-01, -2.07313783e-02],
        ...,
        [-6.53298348e-02,  1.19046196e-01,  5.76847196e-01, ...,
         -2.95460850e-01,  2.49742493e-02,  1.13964520e-01],
        [-2.64715105e-01, -7.86387548e-02,  5.47281444e-01, ...,
         -1.37515455e-01, -5.94689697e-02, -5.17926812e-02],
        [-2.44958639e-01, -1.14799351e-01,  5.92174709e-01, ...,
         -1.56881645e-01, -3.39757167e-02, -8.46134424e-02]],

       [[ 2.94564813e-02,  2.30868548e-01,  2.92651415e-01, ...,
         -1.30422205e-01,  1.89659446e-01,  

As can be seen from the result, there are two outputs: one for the `last_hidden_state` and one for the `pooler_output`. The last hidden state provides all token embeddings from BERT with additional [CLS] and [SEP] tokens at the start and end, respectively.


Now, lets use these embeddings to feed to a Keras model.

The model object, which is a Keras model, has two inputs: one for tokens and one for masks. Tokens has token_ids from the tokenizer output and the masks will have attention_mask. Let's try it and see what happens:

In [2]:
from tensorflow import keras 
import tensorflow as tf 
max_length = 256 
tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
embedding_layer = bert.layers[0]([tokens,masks])[0][:,0,:] 
dense = tf.keras.layers.Dense(units=2, activation="softmax")(embedding_layer) 
model = keras.Model([tokens,masks],dense) 

In [4]:
tokenized = tokenizer.batch_encode_plus(["hello how is it going with you","hello how is it going with you"], return_tensors="tf", max_length= max_length, truncation=True, pad_to_max_length=True) 



In [5]:
tokenized['attention_mask']

<tf.Tensor: shape=(2, 256), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0,

In [6]:
model([tokenized["input_ids"],tokenized["attention_mask"]]) 

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.21401796, 0.7859821 ],
       [0.21401796, 0.7859821 ]], dtype=float32)>

In [7]:
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"]) 
model.summary() 

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [8]:
model.layers[2].trainable = False 
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

we used the IMDB sentiment analysis dataset for training the language model. Now you can use it for training the Keras-based model for sentiment analysis. But first, you need to prepare the input and output:

In [11]:
import pandas as pd 

imdb_df = pd.read_csv(os.path.join(dataset_path,"IMDB Dataset.csv")) 
reviews = list(imdb_df.review) 
tokenized_reviews = tokenizer.batch_encode_plus(reviews, return_tensors="tf", max_length=max_length, truncation=True, pad_to_max_length=True) 

import numpy as np 
train_split = int(0.8 * len(tokenized_reviews["attention_mask"])) 
train_tokens = tokenized_reviews["input_ids"][:train_split] 
test_tokens = tokenized_reviews["input_ids"][train_split:] 
train_masks = tokenized_reviews["attention_mask"][:train_split] 
test_masks = tokenized_reviews["attention_mask"][train_split:] 
sentiments = list(imdb_df.sentiment) 
labels = np.array([[0,1] if sentiment == "positive" else [1,0] for sentiment in sentiments]) 
train_labels = labels[:train_split] 
test_labels = labels[train_split:] 



In [12]:
model.fit([train_tokens,train_masks],train_labels, epochs=5)

Epoch 1/5


2022-08-12 13:27:41.169792: W tensorflow/core/common_runtime/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 96.00MiB (rounded to 100663296)requested by op model/bert/encoder/layer_._10/attention/self/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-08-12 13:27:41.169915: I tensorflow/core/common_runtime/bfc_allocator.cc:1027] BFCAllocator dump for GPU_0_bfc
2022-08-12 13:27:41.169958: I tensorflow/core/common_runtime/bfc_allocator.cc:1034] Bin (256): 	Total Chunks: 83, Chunks in use: 82. 20.8KiB allocated for chunks. 20.5KiB in use in bin. 1.0KiB client-requested in use in bin.
2022-08-12 13:27:41.169987: I tensorflow/core/common_runtime/bfc_allocator.cc:1034] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
202

ResourceExhaustedError: Graph execution error:

Detected at node 'model/bert/encoder/layer_._10/attention/self/MatMul' defined at (most recent call last):
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/traitlets/config/application.py", line 846, in launch_instance
      app.start()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 367, in dispatch_shell
      await result
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2880, in run_cell
      result = self._run_cell(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2935, in _run_cell
      return runner(coro)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3134, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3337, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3397, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_3327/3113418953.py", line 1, in <cell line: 1>
      model.fit([train_tokens,train_masks],train_labels, epochs=5)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/functional.py", line 458, in call
      return self._run_internal_graph(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 753, in run_call_with_unpacked_inputs
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 863, in call
      encoder_outputs = self.encoder(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 548, in call
      for i, layer_module in enumerate(self.layer):
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 554, in call
      layer_outputs = layer_module(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 464, in call
      self_attention_outputs = self.attention(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 380, in call
      self_outputs = self.self_attention(
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/guy/anaconda3/envs/mastrans/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 310, in call
      attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
Node: 'model/bert/encoder/layer_._10/attention/self/MatMul'
OOM when allocating tensor with shape[32,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/bert/encoder/layer_._10/attention/self/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_21561]