<a href="https://colab.research.google.com/github/enggarpramoto23-a11y/Deep-Learning-Lanjut/blob/main/Pak_Anam_Pert11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets diffusers transformers accelerate
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
# 1. Load the training split of the captioned-image dataset.
print("Memuatdataset...")
raw_dataset = load_dataset("reach-vb/pokemon-blip-captions", split="train")
# 2. Collect every caption so the text vectorizer can learn its vocabulary.
# Reading the "text" column directly avoids materialising (and decoding) the
# image of every row, which row-by-row iteration would do just to get the text.
all_captions = list(raw_dataset["text"])

In [None]:
# 3. Set up text vectorization.
# max_tokens caps the vocabulary size; seq_len is the fixed length every
# caption is padded or truncated to. Both are reused later when the
# transformer's text input and embedding are defined.
max_tokens = 5000
seq_len = 20

text_vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=seq_len,
)

# Adapt step: learn the vocabulary from the dataset captions.
text_vectorizer.adapt(all_captions)

vocab = text_vectorizer.get_vocabulary()

# Report the vocabulary size and a short sample as a sanity check.
print(f"Kamus Teks Berhasil Dibuat. Jumlah kosakata: {len(vocab)}")
print("Contoh 10 kata pertama:", vocab[:10])


In [None]:
def preprocess_fn(item):
    """Convert one dataset record into a ``(caption, image)`` training pair.

    Args:
        item: Mapping with an ``'image'`` entry exposing PIL-style
            ``convert``/``resize`` methods and a ``'text'`` caption entry.

    Returns:
        Tuple ``(caption, image)``. ``image`` is a ``float32`` array of the
        resized RGB pixels divided by 255 (i.e. in ``[0, 1]`` for 8-bit
        channel values).
    """
    # Force three channels and a fixed 64x64 size so every sample has the
    # shape declared in the tf.data output signature.
    rgb_image = item['image'].convert("RGB").resize((64, 64))
    # Build the array as float32 up front: the pipeline declares tf.float32,
    # and dividing a uint8 array by 255.0 would otherwise yield float64.
    pixels = np.asarray(rgb_image, dtype=np.float32) / 255.0
    return item['text'], pixels
# Generator that feeds tf.data with preprocessed samples.
def gen():
    """Yield the ``preprocess_fn`` result for each record of ``raw_dataset``."""
    yield from map(preprocess_fn, raw_dataset)

# Build the tf.data.Dataset from the Python generator. The signature declares
# a scalar string caption and a 64x64x3 float32 image per element.
train_ds = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(64, 64, 3), dtype=tf.float32),
    ),
)
# Turn each caption into a fixed-length sequence of token ids.
train_ds = train_ds.map(lambda caption, image: (text_vectorizer(caption), image))
# Shuffle individual examples BEFORE batching. Shuffling after batching only
# reorders whole batches, so every batch would contain the same examples in
# every epoch. The buffer is 100 * 16 examples, i.e. the same number of
# examples the previous 100-batch buffer could hold.
train_ds = train_ds.shuffle(100 * 16).batch(16).prefetch(tf.data.AUTOTUNE)

In [None]:
class PokemonTrainer(keras.Model):
    """Keras model wrapper that trains a text-conditioned visual-token transformer.

    NOTE: a later cell in this file defines another ``PokemonTrainer``; when
    both cells are run, that later definition replaces this one.
    """

    def __init__(self, transformer, vqvae_encoder):
        """Store the sub-models and create the running-loss metric.

        Args:
            transformer: Model called as
                ``transformer([text_tokens, visual_tokens], training=True)``.
            vqvae_encoder: Image encoder. It is stored but not used by
                ``train_step`` yet (visual tokens are simulated there).
        """
        super().__init__()
        self.transformer = transformer
        self.vqvae_encoder = vqvae_encoder
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Expose the loss tracker so Keras manages (and resets) it."""
        # Without this property the running mean is never reset and the
        # reported loss is averaged across all epochs.
        return [self.loss_tracker]

    def train_step(self, data):
        """Run one optimisation step on a ``(text_tokens, images)`` batch.

        Returns:
            Dict with the running mean loss under the key ``"loss"``.
        """
        text_tokens, images = data
        # 1. Turn the images into visual tokens. The encoder is not wired in
        #    yet, so this simulates a 16x16 latent grid (256 positions) with
        #    random ids in [0, 1024).
        visual_tokens = tf.random.uniform(
            (tf.shape(images)[0], 256), minval=0, maxval=1024, dtype=tf.int32
        )

        # 2. Autoregressive shift: the input drops the last token and the
        #    target drops the first one.
        vis_input = visual_tokens[:, :-1]
        vis_target = visual_tokens[:, 1:]

        with tf.GradientTape() as tape:
            preds = self.transformer([text_tokens, vis_input], training=True)
            per_token_loss = keras.losses.sparse_categorical_crossentropy(
                vis_target, preds, from_logits=True
            )
            # Reduce to a scalar mean. Differentiating the unreduced tensor
            # would sum the gradients over every token in the batch, which
            # scales the effective learning rate with batch and sequence size.
            loss = tf.reduce_mean(per_token_loss)

        trainable_vars = self.transformer.trainable_variables
        grads = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

In [None]:
class PokemonTrainer(keras.Model):
    """Custom training wrapper for the text-to-visual-token transformer.

    Holds the transformer and the (currently unused) VQ-VAE encoder, and
    overrides ``train_step`` with a next-token prediction objective.
    """

    def __init__(self, transformer, vqvae_encoder):
        super().__init__()
        self.transformer = transformer
        self.vqvae_encoder = vqvae_encoder
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Metrics tracked by this model: only the running mean loss."""
        return [self.loss_tracker]

    def train_step(self, data):
        """Perform one gradient update and return the running mean loss."""
        text_tokens, images = data
        batch_size = tf.shape(images)[0]

        # Encode image -> visual tokens. For now the tokens are simulated:
        # 256 random ids per image, drawn from [0, 1024).
        visual_tokens = tf.random.uniform(
            (batch_size, 256), minval=0, maxval=1024, dtype=tf.int32
        )

        # Next-token setup: feed tokens [0, n-1), predict tokens [1, n).
        decoder_inputs, targets = visual_tokens[:, :-1], visual_tokens[:, 1:]

        with tf.GradientTape() as tape:
            logits = self.transformer([text_tokens, decoder_inputs], training=True)
            loss = tf.reduce_mean(
                keras.losses.sparse_categorical_crossentropy(
                    targets, logits, from_logits=True
                )
            )

        # Only the transformer's weights are updated.
        weights = self.transformer.trainable_variables
        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

# --- Placeholder model definitions ---
# Dummy transformer_model and vqvae_encoder so the notebook can execute.
# Replace these with the real model implementations.
num_visual_tokens = 1024  # example: match the VQ-VAE codebook size
embedding_dim = 256  # example: embedding width
num_heads = 4  # example: number of attention heads
ff_dim = 1024  # example: feed-forward width

# Dummy VQ-VAE encoder (placeholder only).
vqvae_encoder = keras.Sequential([
    layers.Input(shape=(64, 64, 3)),
    layers.Conv2D(32, 3, activation="relu", strides=2, padding="same"),
    layers.Conv2D(64, 3, activation="relu", strides=2, padding="same"),
    layers.Flatten(),
    layers.Dense(256, activation="relu"),  # width matches the 256 dummy visual tokens
], name="vqvae_encoder_dummy")

# Dummy transformer taking text tokens and visual tokens.
text_input = keras.Input(shape=(seq_len,), dtype="int32", name="text_input")
# 255 = 256 visual tokens minus one (the autoregressive shift in train_step).
visual_input = keras.Input(shape=(255,), dtype="int32", name="visual_input")

text_embeddings = layers.Embedding(max_tokens, embedding_dim)(text_input)
visual_embeddings = layers.Embedding(num_visual_tokens, embedding_dim)(visual_input)

# Concatenate along the sequence axis: [text tokens | visual tokens].
x = layers.Concatenate(axis=1)([text_embeddings, visual_embeddings])
# Causal self-attention: each position may only attend to itself and earlier
# positions. Without the mask, the output at visual position i could read
# visual_input[i + 1], which is exactly the token it is trained to predict.
x = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(
    x, x, use_causal_mask=True
)
x = layers.Dense(ff_dim, activation="relu")(x)

# Keep only the visual part of the sequence so the output length matches vis_target.
visual_output_sequence = x[:, seq_len:, :]

# No softmax here: the loss is computed with from_logits=True.
outputs = layers.Dense(num_visual_tokens)(visual_output_sequence)

transformer_model = keras.Model(
    inputs=[text_input, visual_input],
    outputs=outputs,
    name="transformer_model_dummy",
)

# Instantiate and compile the trainer.
trainer = PokemonTrainer(transformer_model, vqvae_encoder)
trainer.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4)
)

# Run training.
print("Memulai pelatihan...")
trainer.fit(train_ds, epochs=10)  # short run; raise the epoch count for real training

In [None]:
def generate_image_tokens(transformer_model, text_tokens, sequence_length, num_visual_tokens):
    """Greedily generate ``sequence_length`` visual tokens conditioned on text.

    The model is called with a visual input of fixed length
    ``sequence_length - 1``: the tokens generated so far, right-padded with
    zeros. One token is appended per step using argmax over the logits.

    Args:
        transformer_model: Callable invoked as
            ``transformer_model([text_tokens, visual_input], training=False)``
            and expected to return logits indexed as
            ``[batch, position, vocabulary]``.
        text_tokens: Batch of tokenised prompts; only its first dimension
            (the batch size) is inspected here.
        sequence_length: Number of visual tokens to generate.
        num_visual_tokens: Accepted for interface compatibility; not used.

    Returns:
        ``int32`` tensor of shape ``(batch, sequence_length)``.
    """
    num_steps = int(sequence_length)
    context_len = num_steps - 1  # fixed visual-input length fed to the model
    batch_size = tf.shape(text_tokens)[0]
    generated_visual_tokens = tf.zeros((batch_size, 0), dtype=tf.int32)

    # Exactly one token is appended per iteration, so `num_generated` is also
    # the current length of `generated_visual_tokens`.
    for num_generated in range(num_steps):
        if num_generated < context_len:
            # Right-pad with zeros up to the fixed input length.
            padding = tf.zeros((batch_size, context_len - num_generated), dtype=tf.int32)
            transformer_visual_input = tf.concat([generated_visual_tokens, padding], axis=1)
        else:
            # Input is full: keep only the most recent `context_len` tokens.
            transformer_visual_input = generated_visual_tokens[:, -context_len:]

        preds = transformer_model([text_tokens, transformer_visual_input], training=False)

        # Output position j is trained to predict the token that follows input
        # token j, so the next token comes from the position of the last REAL
        # token. Reading the final position instead would, while the input is
        # still padded, use a prediction made from padding. With nothing
        # generated yet, position 0 is used (the zero padding acts as a start
        # token).
        last_real_position = min(max(num_generated - 1, 0), context_len - 1)
        next_token_logits = preds[:, last_real_position, :]

        # Deterministic (greedy) choice of the next token.
        next_token = tf.cast(tf.argmax(next_token_logits, axis=-1), dtype=tf.int32)
        generated_visual_tokens = tf.concat(
            [generated_visual_tokens, tf.expand_dims(next_token, axis=1)], axis=1
        )

    return generated_visual_tokens[:, :sequence_length]

def decode_to_real_image(visual_tokens):
    """Placeholder decoder: return an all-zero (black) 64x64 RGB image.

    No VQ-VAE decoder is implemented yet, so ``visual_tokens`` is ignored.
    A real implementation would decode the tokens back into pixels.

    Returns:
        ``float32`` tensor of shape ``(64, 64, 3)`` filled with zeros.
    """
    height = width = 64
    return tf.zeros((height, width, 3), dtype=tf.float32)


def generate_pokemon(prompt):
    """Generate and display an image for a text prompt.

    Pipeline: vectorise the prompt, autoregressively generate visual tokens
    with ``generate_image_tokens``, decode them with ``decode_to_real_image``
    (currently a placeholder), then show the result with matplotlib.

    Args:
        prompt: Text description; also used as the plot title.
    """
    # 1. Text -> token ids (wrapped in a list to form a batch of one).
    prompt_tokens = text_vectorizer([prompt])

    # 2. Autoregressive generation of visual tokens. The length matches the
    #    256 dummy visual tokens used by the trainer.
    visual_len = 256
    visual_tokens = generate_image_tokens(
        transformer_model, prompt_tokens, visual_len, num_visual_tokens
    )

    # 3. Visual tokens -> image.
    image = decode_to_real_image(visual_tokens)

    # 4. Display the image without axes, titled with the prompt.
    plt.imshow(image)
    plt.title(prompt)
    plt.axis("off")
    plt.show()

# Smoke test: run the full prompt-to-image pipeline on a sample prompt.
generate_pokemon("a pink cute pokemon")