In [3]:
#!pip install tensorflow==2.3.1
#!pip install transformers

In [1]:
import pickle
import tensorflow as tf
import transformers
from transformers import GPT2Config
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Tokenizer
import gc

In [2]:
# Read the pickled token list back from disk; the file is closed as soon as
# the `with` block exits.
with open('token_list.data', mode='rb') as token_file:
    token_list = pickle.load(token_file)

In [3]:
# Load the tokenizer files stored under `save_path`.
save_path = 'tokenized_data'
tokenizer = GPT2Tokenizer.from_pretrained(save_path)

# Register the special tokens on the tokenizer.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

# Create the configuration from which the model is built.
# `len(tokenizer)` counts the base vocabulary plus every token added via
# `add_special_tokens`, whereas `tokenizer.vocab_size` reports the base
# vocabulary only. Using the full length guarantees that every id the
# tokenizer can emit has a row in the embedding matrix; when no new token was
# added both values are equal.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

# Create the model from the configuration alone (no pretrained weights are
# loaded in this cell).
model = TFGPT2LMHeadModel(config)

In [4]:
# Total number of entries in the token list (1,087,269,997 in the recorded run).
len(token_list)

1087269997

In [5]:
%%time
# Windowing and batching hyper-parameters.
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Cut the token list into consecutive, non-overlapping windows of exactly
# `block_size` entries; a trailing partial window is dropped.
examples = [
    token_list[start:start + block_size]
    for start in range(0, len(token_list) - block_size + 1, block_size)
]

CPU times: user 3min 17s, sys: 3.81 s, total: 3min 21s
Wall time: 3min 20s


In [6]:
# Sanity check: number of windows, and the length of the first window.
len(examples), len(examples[0])

(10872699, 100)

In [7]:
# Next-token setup: each input is a window without its last entry, and the
# matching label is the same window without its first entry (shifted by one).
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]

In [8]:
# Sanity check: both lists have the same number of rows and the same row length.
len(inputs), len(labels), len(labels[0]), len(inputs[0])

(10872699, 10872699, 99, 99)

In [9]:
chunk_size = 10000

# Slice both lists into consecutive groups of at most `chunk_size` rows; the
# last group holds whatever remains.
inputs_chunks = [
    inputs[start:start + chunk_size]
    for start in range(0, len(inputs), chunk_size)
]
labels_chunks = [
    labels[start:start + chunk_size]
    for start in range(0, len(labels), chunk_size)
]

# Number of chunks, and the number of rows in the first label chunk.
len(inputs_chunks), len(labels_chunks[0])

(1088, 10000)

In [10]:
%%time
import time

# Seed the dataset with the first chunk; the remaining chunks are appended
# one at a time below.
dataset_final = tf.data.Dataset.from_tensor_slices((inputs_chunks[0], labels_chunks[0]))
last_time = time.time()
nchunks = len(inputs_chunks)

for i, input_group in enumerate(inputs_chunks[1:], start=1):
    # `last_time` is never refreshed, so the number printed here is the time
    # elapsed since the loop started, not the time spent on a single chunk.
    print("Chunk {}/{}".format(i+1,nchunks), time.time()-last_time)

    # Turn this chunk's (inputs, labels) pair into a dataset and append it.
    dataset_temp = tf.data.Dataset.from_tensor_slices((input_group, labels_chunks[i]))
    dataset_final = dataset_final.concatenate(dataset_temp)

Chunk 2/1088 9.5367431640625e-06
Chunk 3/1088 3.917330503463745
Chunk 4/1088 7.832226991653442
Chunk 5/1088 11.75835108757019
Chunk 6/1088 15.678952693939209
Chunk 7/1088 19.588305711746216
Chunk 8/1088 23.586983919143677
Chunk 9/1088 27.4995059967041
Chunk 10/1088 31.432222366333008
Chunk 11/1088 35.33239555358887
Chunk 12/1088 39.243128538131714
Chunk 13/1088 43.16177582740784
Chunk 14/1088 47.066288232803345
Chunk 15/1088 50.955448150634766
Chunk 16/1088 54.86056470870972
Chunk 17/1088 58.750813245773315
Chunk 18/1088 62.6612753868103
Chunk 19/1088 66.56288266181946
Chunk 20/1088 70.49045944213867
Chunk 21/1088 74.39586853981018
Chunk 22/1088 78.3131971359253
Chunk 23/1088 82.20799255371094
Chunk 24/1088 86.12462401390076
Chunk 25/1088 90.0203127861023
Chunk 26/1088 93.93588256835938
Chunk 27/1088 97.83598017692566
Chunk 28/1088 101.74532437324524
Chunk 29/1088 105.64106798171997
Chunk 30/1088 109.55145359039307
Chunk 31/1088 113.444256067276
Chunk 32/1088 117.37303805351257
Chunk 3

In [11]:
# Persist the assembled dataset to ./dataset_final, uncompressed and without
# a custom sharding function.
tf.data.experimental.save(
    dataset_final,
    './dataset_final',
    compression=None,
    shard_func=None,
)

In [12]:
# Shuffle through a buffer of BUFFER_SIZE elements, then group into batches of
# exactly BATCH_SIZE rows (an incomplete final batch is dropped).
shuffled_dataset = (
    dataset_final
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

# Number of batches, and the batched dataset's description.
len(shuffled_dataset), shuffled_dataset

(906058,
 <BatchDataset shapes: ((12, 99), (12, 99)), types: (tf.int32, tf.int32)>)

In [13]:
# Keep device-placement logging switched off while the training objects are
# built. The flag is a boolean, so pass False rather than the integer 0.
tf.debugging.set_log_device_placement(False)

# defining our optimizer: Adam with the gradient norm clipped to 1.0
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)

# defining our loss function; from_logits=True means the predictions are
# treated as unnormalised logits rather than probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# compiling the model
# The loss list carries the real loss for the first output followed by one
# None per layer (model.config.n_layer entries), so only the first output
# contributes to the training loss.
# NOTE(review): the single metric is attached to every output -- the recorded
# training log contains output_2_*_accuracy entries -- confirm whether only
# the first output's accuracy is wanted.
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [20]:
# Inspect the shape and dtype of a single (input, label) element of the unbatched dataset.
dataset_final.element_spec

(TensorSpec(shape=(99,), dtype=tf.int32, name=None),
 TensorSpec(shape=(99,), dtype=tf.int32, name=None))

In [None]:
# Turn device-placement logging on for the training run.
tf.debugging.set_log_device_placement(True)

# Train on the shuffled, batched dataset for `num_epoch` epochs and keep the
# returned history object.
num_epoch = 10
history = model.fit(
    shuffled_dataset,
    epochs=num_epoch,
)

Epoch 1/10
 46644/906058 [>.............................] - ETA: 34:18:57 - loss: 5.3282 - output_1_loss: 5.3282 - output_1_accuracy: 0.2124 - output_2_1_accuracy: 0.0014 - output_2_2_accuracy: 0.0016 - output_2_3_accuracy: 0.0016 - output_2_4_accuracy: 0.0014 - output_2_5_accuracy: 0.0016 - output_2_6_accuracy: 0.0014 - output_2_7_accuracy: 0.0015 - output_2_8_accuracy: 0.0015 - output_2_9_accuracy: 0.0016 - output_2_10_accuracy: 0.0016 - output_2_11_accuracy: 0.0016 - output_2_12_accuracy: 0.0016

In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os

output_dir = './model_bn_custom/'

# Create the output directory if it is not present. makedirs with
# exist_ok=True also creates missing parent directories and does not fail if
# the directory appears between a check and the creation.
os.makedirs(output_dir, exist_ok=True)

# Use the wrapped object when the model exposes a `.module` attribute,
# otherwise the model itself.
model_to_save = model.module if hasattr(model, 'module') else model
# `output_model_file` is not used below; it is kept in case other cells refer to it.
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# save model and model configs (through the same unwrapped object for both calls)
model_to_save.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)

# save tokenizer
tokenizer.save_pretrained(output_dir)