In [3]:
!pip install -q --upgrade keras-hub
!pip install -q --upgrade keras  # Upgrade to Keras 3.

In [5]:
import os
import keras_hub
import keras
import os
import re
import tensorflow as tf


import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

In [6]:
# Data
BATCH_SIZE = 64  # Per-replica batch size; multiplied by the replica count to form GLOBAL_BATCH_SIZE
SEQ_LEN = 128  # Tokens per sequence; passed as sequence_length to both the embedding layer and the tokenizer
VOCAB_SIZE = 20000  # Size of the embedding table and of the output Dense layer (previously 5000)
MIN_STRING_LEN = 512  # NOTE(review): not referenced by any cell visible in this notebook

# Model
EMBED_DIM = 256  # Embedding width (previously 128)
FEED_FORWARD_DIM = 1024  # Decoder intermediate width, 4x EMBED_DIM (previously 128)
NUM_HEADS = 8  # Attention heads per decoder layer (previously 4)
NUM_LAYERS = 4  # Number of stacked TransformerDecoder layers

# Training
EPOCHS = 5  # Reduced for quicker initial training

# Inference
NUM_TOKENS_TO_GENERATE = 80  # NOTE(review): not referenced by any cell visible in this notebook


In [10]:
# Reset Keras' global state left over from earlier runs in this kernel before the
# model is (re)built in the next cell.
from keras import backend as K
K.clear_session()

In [8]:
import keras
import keras_hub
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Enable MirroredStrategy for using multiple GPUs
strategy = tf.distribute.MirroredStrategy()

print(f"Number of GPUs: {strategy.num_replicas_in_sync}")

# Effective batch size across all replicas.
# NOTE(review): nothing in the visible cells consumes this value; the datasets are
# loaded from disk and passed to fit() as-is.
GLOBAL_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync

# Variables (layers, optimizer, metrics) must be created inside the strategy scope.
with strategy.scope():
    # Variable-length sequences of token ids.
    inputs = keras.layers.Input(shape=(None,), dtype="int32")

    # Token + position embedding. `mask_zero=True` already marks id 0 as padding,
    # so no separate Masking layer is needed. The previous Masking output was
    # overwritten before use and never reached the model.
    embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(
        vocabulary_size=VOCAB_SIZE,
        sequence_length=SEQ_LEN,
        embedding_dim=EMBED_DIM,
        mask_zero=True,
    )
    x = embedding_layer(inputs)

    # Stack of decoder-only transformer blocks.
    for _ in range(NUM_LAYERS):
        decoder_layer = keras_hub.layers.TransformerDecoder(
            num_heads=NUM_HEADS,
            intermediate_dim=FEED_FORWARD_DIM,
        )
        x = decoder_layer(x)  # Giving one argument only skips cross-attention.

    # Per-position logits over the vocabulary (no softmax: the loss and the metric
    # below are both configured with from_logits=True).
    outputs = keras.layers.Dense(VOCAB_SIZE)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)

    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Token id 0 is excluded from the perplexity computation.
    perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0)
    optimizer = keras.optimizers.Adam(learning_rate=0.0001, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[perplexity])

    # Model Summary
    model.summary()


Number of GPUs: 2


In [9]:
# Define directory paths for saving datasets
output_dir = "/kaggle/input/processed-wikipedia-dataset/saved_datasets"
train_dir = os.path.join(output_dir, "train_ds")
val_dir = os.path.join(output_dir, "val_ds")

# Reloading train_ds and val_ds
loaded_train_ds = tf.data.experimental.load(train_dir)
loaded_val_ds = tf.data.experimental.load(val_dir)

print("Training dataset loaded successfully!")
print("Validation dataset loaded successfully!")


Training dataset loaded successfully!
Validation dataset loaded successfully!


In [4]:
# Sanity check: show the first three (features, labels) batches of the training set.
for batch in loaded_train_ds.take(3):
    features, labels = batch
    print("Features (Input Tokens):", features)
    print("Labels (Target Tokens):", labels)

Features (Input Tokens): tf.Tensor(
[[    2  1997   134 ...    69   198   392]
 [    2    69 12554 ...  5131  2883 19946]
 [    2    69  6170 ...     0     0     0]
 ...
 [    2  5825    33 ...     0     0     0]
 [    2  3362    78 ...     0     0     0]
 [    2 15311   123 ...     0     0     0]], shape=(64, 128), dtype=int32)
Labels (Target Tokens): tf.Tensor(
[[ 1997   134 19953 ...   198   392  1120]
 [   69 12554 16503 ...  2883 19946   135]
 [   69  6170 10860 ...     0     0     0]
 ...
 [ 5825    33   107 ...     0     0     0]
 [ 3362    78  3474 ...     0     0     0]
 [15311   123   386 ...     0     0     0]], shape=(64, 128), dtype=int32)
Features (Input Tokens): tf.Tensor(
[[    2   135   377 ...     0     0     0]
 [    2  1700 11628 ...     0     0     0]
 [    2   135 10728 ...     0     0     0]
 ...
 [    2 10677  3631 ... 19946  1718   356]
 [    2  1282    10 ...     0     0     0]
 [    2 17046   435 ...   535 19945    33]], shape=(64, 128), dtype=int32)
Labels (

In [7]:
# Serialized SentencePiece model used to tokenize and detokenize text.
PROTO_FILE = "/kaggle/input/bpe-tokenizer-200k/tokenizer_bpe.proto"

# Tokenizer used at generation time: int32 ids with a fixed length of SEQ_LEN,
# a start token prepended and no end token appended.
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(
    proto=PROTO_FILE,
    sequence_length=SEQ_LEN,
    dtype="int32",
    add_bos=True,
    add_eos=False,
)


In [None]:
class TopKTextGenerator(keras.callbacks.Callback):
    """Callback that prints a top-k sampled text sample at the end of every epoch.

    Args:
        k: Number of highest-probability tokens the sampler chooses from.
        tokenizer: Object exposing `tokenize` and `detokenize`; used to build the
            (empty) prompt and to turn sampled ids back into text.
        gpt_model: Model called on the prompt ids to obtain next-token logits.
    """

    def __init__(self, k, tokenizer, gpt_model):
        # Initialise the base Callback so the attributes it manages exist even
        # before Keras attaches the callback to a training run.
        super().__init__()
        self.sampler = keras_hub.samplers.TopKSampler(k)
        self.tokenizer = tokenizer
        # Kept under its own attribute name (not `model`), as in the original code.
        self.gpt_model = gpt_model

    def on_epoch_end(self, epoch, logs=None):
        """Sample from an empty prompt and print the decoded text."""
        prompt = tf.constant([""])  # Empty prompt to start generation
        prompt_tokens = self.tokenizer.tokenize(prompt)

        # Sampling starts at position 1.
        # NOTE(review): this assumes the tokenizer puts exactly one start token
        # at position 0 of the empty prompt - confirm against its configuration.
        output_tokens = self.sampler(
            next=self.next,
            prompt=prompt_tokens,
            index=1,
        )

        txt = self.tokenizer.detokenize(output_tokens)
        print(f"\nTop-K search generated text after epoch {epoch + 1}: \n{txt}\n")

    def next(self, prompt, cache, index):
        """Sampler callback returning `(logits, hidden_states, cache)`.

        The whole prompt is run through the model on every call and the logits at
        position `index - 1` are returned. `cache` is passed through unchanged and
        no hidden states are provided.
        """
        logits = self.gpt_model(prompt)[:, index - 1, :]
        hidden_states = None  # Not using hidden states for now
        return logits, hidden_states, cache


In [None]:
# Write the whole model (save_weights_only=False) to disk whenever the validation
# perplexity reaches a new minimum.
checkpoint_callback = ModelCheckpoint(
    filepath="redblock_gpt_big.keras",
    monitor="val_perplexity",
    mode="min",
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)


In [None]:
# Per-epoch text sample generated with the model that is being trained.
text_generation_callback = TopKTextGenerator(gpt_model=model, tokenizer=tokenizer, k=10)

# Order matters only for log readability: sample text first, then checkpoint.
training_callbacks = [text_generation_callback, checkpoint_callback]

model.fit(
    loaded_train_ds,
    epochs=EPOCHS,
    validation_data=loaded_val_ds,
    callbacks=training_callbacks,
)


In [None]:
# Debug check: call the sampler callback once and report the shape of its logits.
# NOTE(review): `prompt_tokens` and the three-argument `next` function are only
# defined in a later cell of this notebook. On a fresh kernel executed top to
# bottom this cell fails (`prompt_tokens` is undefined and `next` is still the
# builtin). Run that cell first, or move this check below it.
print("Prompt tokens:", prompt_tokens)
logits, _, _ = next(prompt_tokens, None, 1)
print("Logits shape:", logits.shape)


In [None]:
# Persist the trained model to a .keras file.
# NOTE(review): the file name hard-codes "epoch_5" while the number of epochs is
# controlled by EPOCHS - keep the two in sync when changing EPOCHS.
model.save("redblock_pretrained_gpt_epoch_5.keras")

In [8]:
# Reload a previously trained checkpoint. The generation and evaluation cells
# below use `loaded_model`, not the in-memory `model` built earlier.
loaded_model = keras.saving.load_model("/kaggle/input/redblock_trained_gpt/keras/default/1/redblock_gpt_big (5).keras")

In [9]:
# Show the summary of the reloaded model.
loaded_model.summary()

In [8]:
# Tokenize an empty prompt. The tokenizer is configured with add_bos=True, so it
# (not a separate packer layer) prepends the start token; the result has a fixed
# length of SEQ_LEN.
prompt_tokens = tokenizer.tokenize([""])

def next(prompt, cache, index):
    """Sampler callback: return `(logits, hidden_states, cache)` for one step.

    Runs `loaded_model` on the full `prompt` and picks the logits at position
    `index - 1`. `cache` is handed back untouched.
    """
    all_logits = loaded_model(prompt)
    step_logits = all_logits[:, index - 1, :]
    # Hidden states are only needed for contrastive search, so none are returned.
    return step_logits, None, cache
# Number of non-zero ids in the first (only) prompt row; sampling starts there.
nonzero_per_row = tf.math.count_nonzero(prompt_tokens, axis=-1)
prompt_length = nonzero_per_row[0].numpy()

# Top-p sampling with p=0.5.
sampler = keras_hub.samplers.TopPSampler(p=0.5)
output_tokens = sampler(next=next, prompt=prompt_tokens, index=prompt_length)

txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")



Top-P search generated text: 
['As a teenager, he had two sons, James and Henry. His father, John and William, were a real-life father, John and his mother, and their mother, Ellen, are a son of Thomas, the younger sister of John, John and their daughter, William, and a sister, John. of the family are the only daughter of Edward, and their mother was born in an early age. Thomas is the only child, and she is a member of the House of Fife. of the family and the children were raised in the hands of the family. of the siblings were of']



In [13]:
def evaluate_model(model, eval_dataset):
    """Evaluate `model` on `eval_dataset` and return the resulting metrics.

    The metrics reported are the ones the model was compiled with. The previous
    version built a separate Perplexity object that was never handed to
    `evaluate`, and it discarded the result; both issues are fixed here.

    Args:
        model: Object exposing a Keras-style `evaluate` method.
        eval_dataset: Dataset to evaluate on. It is expected to yield batches
            already, so no `batch_size` is forwarded.

    Returns:
        Whatever `model.evaluate(..., return_dict=True)` returns: for a Keras
        model, a dict mapping metric names to values.
    """
    return model.evaluate(eval_dataset, verbose=1, return_dict=True)


In [14]:
# Evaluate the reloaded checkpoint on the validation dataset.
evaluate_model(loaded_model, loaded_val_ds)



[1m2174/2174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m301s[0m 137ms/step - loss: 4.1417 - perplexity: 66.0906


In [10]:
!pip install bert_score sentence_transformers

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [11]:
import torch
from transformers import BertModel, BertTokenizer
from bert_score import score
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [12]:
# Both BERT objects come from the same pretrained checkpoint.
# NOTE(review): `bert_model` / `bert_tokenizer` are not referenced by the scoring
# functions visible in this notebook (those call `score(..., model_type=...)`).
_BERT_CHECKPOINT = "bert-base-uncased"
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)

# Sentence encoder used by calculate_sentence_similarity.
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
def calculate_bert_score(text: str) -> float:
    """Return the BERTScore F1 between the first two sentences of `text`.

    Sentences are obtained by splitting on "."; surrounding whitespace is removed
    and empty fragments (for example the one after a trailing ".") are dropped,
    the same way `calculate_sentence_similarity` does it. Previously a text with
    a single sentence ending in "." was scored against an empty string.

    Args:
        text: Text to score.

    Returns:
        The F1 score as a float, or 0.0 when fewer than two non-empty sentences
        are found.
    """
    sentences = [part.strip() for part in text.strip().split(".") if part.strip()]
    if len(sentences) < 2:
        return 0.0  # Not enough sentences to compare

    # Second sentence is the candidate, first sentence the reference.
    _, _, f1 = score([sentences[1]], [sentences[0]], model_type="bert-base-uncased", lang="en")
    return f1.mean().item()

def calculate_sentence_similarity(text: str) -> float:
    """Return the mean cosine similarity between consecutive sentences of `text`.

    Sentences are obtained by splitting on "." and dropping empty fragments. Each
    sentence is embedded with the module-level `sentence_transformer` and every
    sentence is compared with the one directly before it.

    Args:
        text: Text to score.

    Returns:
        The mean similarity as a builtin float, or 0.0 when fewer than two
        non-empty sentences are found.
    """
    sentences = [part.strip() for part in text.strip().split(".") if part.strip()]
    if len(sentences) < 2:
        return 0.0  # Not enough sentences to compare

    embeddings = sentence_transformer.encode(sentences)
    # At least two sentences were encoded, so there is at least one adjacent pair.
    similarities = [
        util.cos_sim(previous, current).item()
        for previous, current in zip(embeddings[:-1], embeddings[1:])
    ]
    # Convert to a builtin float so the return value matches the annotation.
    return float(np.mean(similarities))

In [20]:
import numpy as np
from tqdm import tqdm
# Top-p sampler (p=0.5). generate_and_evaluate below reads this module-level
# `sampler`; no seed is passed here.
sampler = keras_hub.samplers.TopPSampler(p=0.5)
def generate_and_evaluate(n=5, text_sampler=None):
    """Generate `n` texts and score each of them.

    Every text is sampled starting from the module-level `prompt_tokens` at
    position `prompt_length`, using the module-level `next` callback, and then
    scored with `calculate_bert_score` and `calculate_sentence_similarity`. The
    text and both scores are printed as they are produced.

    Args:
        n: Number of texts to generate.
        text_sampler: Optional sampler to use. When omitted, the module-level
            `sampler` is used, which is the original behaviour.

    Returns:
        A tuple `(avg_bert_score, avg_sentence_similarity, generated_texts)`.
        Both averages are builtin floats and are 0.0 when no text was generated
        (`n <= 0`), instead of NaN.
    """
    active_sampler = sampler if text_sampler is None else text_sampler

    # Local names avoid shadowing the `bert_score` package.
    bert_f1_scores = []
    similarity_scores = []
    generated_texts = []

    for i in tqdm(range(n), desc="Generating Texts"):
        output_tokens = active_sampler(
            next=next,
            prompt=prompt_tokens,
            index=prompt_length,
        )

        # Only the first detokenized entry is used (a single prompt row is sampled).
        text = tokenizer.detokenize(output_tokens)[0]
        generated_texts.append(text)

        bert_f1 = calculate_bert_score(text)
        similarity = calculate_sentence_similarity(text)
        bert_f1_scores.append(bert_f1)
        similarity_scores.append(similarity)

        print(f"Generated Text {i+1}: {text}")
        print(f"BERTScore F1: {bert_f1}")
        print(f"Sentence Similarity: {similarity}\n")

    if not generated_texts:
        # Nothing was generated: np.mean([]) would give NaN plus a RuntimeWarning.
        return 0.0, 0.0, generated_texts

    avg_bert_score = float(np.mean(bert_f1_scores))
    avg_sentence_similarity = float(np.mean(similarity_scores))

    return avg_bert_score, avg_sentence_similarity, generated_texts

# Generate and score 3 samples (increase the count for a more stable average).
avg_bert_score, avg_sentence_similarity, generated_texts = generate_and_evaluate(3)

# Print results
print(f"Average BERTScore F1: {avg_bert_score}")
print(f"Average Sentence Similarity: {avg_sentence_similarity}")

# Store generated texts in a file for reference. The encoding is explicit so
# non-ASCII characters in generated text never depend on the platform default.
with open("generated_texts.txt", "w", encoding="utf-8") as f:
    for text in generated_texts:
        f.write(text + "\n\n")

print("Generated texts saved to generated_texts.txt")

Generating Texts:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts:  33%|███▎      | 1/3 [00:12<00:24, 12.38s/it]

Generated Text 1: On February 15, 2015, Shirez was appointed head coach of the Miami Marlins. He was replaced by fellow midfielders Chris Morrison. members of the Philadelphia Eagles have a long time of sponsorship. of the coaching staff, his coach, Mike McCain, was head coach of the Florida Marlins. was a starter in the senior season, and was also a defensive midfielder. of the roster included former Denver All-American and former Houston Blue Ribbon. later played for the Houston Marlins, a forward and former Texas Rangers. of the season, he was a member
BERTScore F1: 0.3561050593852997
Sentence Similarity: 0.26706443833453314



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts:  67%|██████▋   | 2/3 [00:23<00:11, 11.92s/it]

Generated Text 2: The book received mixed reviews. The reviewer praised the story's style, "homething about the heart, I'm looking at a bit of as it was an excellent, intimate character, and the creativity of it, it's very much more good, and less than a simple thing." Similarly, in The Wizard of Oz, the writer noted the writer, "Areus, in an aforementioned account of this story, has been compared to a romance, and the "Digly, I'm anaite, and his ostensibly ambitious, and yet another,
BERTScore F1: 0.41395023465156555
Sentence Similarity: 0.4422805905342102



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts: 100%|██████████| 3/3 [00:35<00:00, 12.00s/it]

Generated Text 3: The highest peaks of the volcano are in the Pacific Ocean. The highest peak is , which is at sea level. The mountain ranges are located on the coast of Santa Cruz Island, and are the largest volcanoes in the island. of the same time, the mountain is located in the northern part of the island. of the mountain range, the western coast of the island is named for the northern part of the island. of the last volcano, the city of the island is named after a legendary volcano, a city that is named after the present-day city of Santa Cruz, where the city is known
BERTScore F1: 0.6094847917556763
Sentence Similarity: 0.5191057324409485

Average BERTScore F1: 0.4598466952641805
Average Sentence Similarity: 0.4094835871032306
Generated texts saved to generated_texts.txt





In [27]:
import numpy as np
from tqdm import tqdm
# Top-k sampler (k=10). Rebinding the module-level `sampler` changes what
# generate_and_evaluate below uses; no seed is passed here.
sampler = keras_hub.samplers.TopKSampler(k=10)
def generate_and_evaluate(n=5, text_sampler=None):
    """Generate `n` texts and score each of them.

    Every text is sampled starting from the module-level `prompt_tokens` at
    position `prompt_length`, using the module-level `next` callback, and then
    scored with `calculate_bert_score` and `calculate_sentence_similarity`. The
    text and both scores are printed as they are produced.

    Args:
        n: Number of texts to generate.
        text_sampler: Optional sampler to use. When omitted, the module-level
            `sampler` is used, which is the original behaviour.

    Returns:
        A tuple `(avg_bert_score, avg_sentence_similarity, generated_texts)`.
        Both averages are builtin floats and are 0.0 when no text was generated
        (`n <= 0`), instead of NaN.
    """
    active_sampler = sampler if text_sampler is None else text_sampler

    # Local names avoid shadowing the `bert_score` package.
    bert_f1_scores = []
    similarity_scores = []
    generated_texts = []

    for i in tqdm(range(n), desc="Generating Texts"):
        output_tokens = active_sampler(
            next=next,
            prompt=prompt_tokens,
            index=prompt_length,
        )

        # Only the first detokenized entry is used (a single prompt row is sampled).
        text = tokenizer.detokenize(output_tokens)[0]
        generated_texts.append(text)

        bert_f1 = calculate_bert_score(text)
        similarity = calculate_sentence_similarity(text)
        bert_f1_scores.append(bert_f1)
        similarity_scores.append(similarity)

        print(f"Generated Text {i+1}: {text}")
        print(f"BERTScore F1: {bert_f1}")
        print(f"Sentence Similarity: {similarity}\n")

    if not generated_texts:
        # Nothing was generated: np.mean([]) would give NaN plus a RuntimeWarning.
        return 0.0, 0.0, generated_texts

    avg_bert_score = float(np.mean(bert_f1_scores))
    avg_sentence_similarity = float(np.mean(similarity_scores))

    return avg_bert_score, avg_sentence_similarity, generated_texts

# Generate and score 3 samples (increase the count for a more stable average).
avg_bert_score, avg_sentence_similarity, generated_texts = generate_and_evaluate(3)

# Print results
print(f"Average BERTScore F1: {avg_bert_score}")
print(f"Average Sentence Similarity: {avg_sentence_similarity}")

# Store generated texts in a file for reference. The encoding is explicit so
# non-ASCII characters in generated text never depend on the platform default.
# NOTE(review): the Top-P cell earlier in this notebook writes to the same path
# in "w" mode, so this run replaces its contents.
with open("generated_texts.txt", "w", encoding="utf-8") as f:
    for text in generated_texts:
        f.write(text + "\n\n")

print("Generated texts saved to generated_texts.txt")

Generating Texts:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts:  33%|███▎      | 1/3 [00:11<00:22, 11.36s/it]

Generated Text 1: In August 2015, he was appointed to the position of the new president in a position. In January 2016, he joined the new board of directors and vice president and was the president of the board of the new board. of staff in the board, a board member was appointed to the chairman of the board in August. members of the board voted to have two members, with the president, and two other members, and the chairman of the board are elected to the President.man, who has been elected to the board of directors, was elected a member of the board. of staff members are eligible to
BERTScore F1: 0.6620436310768127
Sentence Similarity: 0.5664778828620911



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts:  67%|██████▋   | 2/3 [00:22<00:11, 11.35s/it]

Generated Text 2: The first three teams of the league began in June, the fifth and final team in the league were seeded by the second round. The teams were drawn from the two teams, and each team was divided into six teams and one teams. The first team of the team was the first team from the first division. The team finished second place in the season and the second place overall by the second round.ps by then began the season with a team of six teams, the second-tier team from the second round, and the team was eliminated in the second round. The team finished third place in the first round.
BERTScore F1: 0.5240020751953125
Sentence Similarity: 0.6385317921638489



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Texts: 100%|██████████| 3/3 [00:35<00:00, 11.75s/it]

Generated Text 3: As of the census of 2000, there were 4,312 people, 8,616 households, and 1,938 families residing in the city. In 2000, there were 11,011 people, 2,415 households, and 4,522 families residing in the city. The population density was 3.0 people per square mile in the city's history.-division of the city of Los Angeles County had 3,5115 households, and 1,611 housing units, as of that year and 1,7159 residents. In the 2010 Census, 2,071 were owner-occupied. In 2013, it was the largest village in
BERTScore F1: 0.8554472923278809
Sentence Similarity: 0.4457216014464696

Average BERTScore F1: 0.6804976662000021
Average Sentence Similarity: 0.5502437588241366
Generated texts saved to generated_texts.txt



