<a href="https://colab.research.google.com/github/enggarpramoto23-a11y/Deep-Learning-Lanjut/blob/main/Pak_Anam_Pert11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets diffusers transformers accelerate
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
# 1. Load the dataset
print("Memuatdataset...")
raw_dataset = load_dataset("reach-vb/pokemon-blip-captions", split="train")
# 2. Collect every caption; the text vectorizer is adapted on this list.
# Reading the "text" column directly avoids materialising (and decoding)
# the image of every row just to reach its caption.
all_captions = list(raw_dataset["text"])

In [None]:
# 3. Set up text vectorization
# max_tokens caps the vocabulary size; seq_len is the fixed output length
# requested from the vectorizer for every caption.
max_tokens = 5000
seq_len = 20

text_vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=seq_len,
)

# Adapt step: learn the vocabulary from the dataset captions
text_vectorizer.adapt(all_captions)

vocab = text_vectorizer.get_vocabulary()

# Report the vocabulary size and show its first ten entries as a sanity check
print(f"Kamus Teks Berhasil Dibuat. Jumlah kosakata: {len(vocab)}")
print("Contoh 10 kata pertama:", vocab[:10])


In [None]:
def preprocess_fn(item):
    """Convert one raw dataset row into a ``(caption, image)`` pair.

    Args:
        item: Mapping with an ``'image'`` entry that supports
            ``convert``/``resize`` (presumably a PIL image) and a
            ``'text'`` entry holding the caption.

    Returns:
        Tuple ``(caption, image)`` where ``image`` is a float32 array of the
        resized 64x64 RGB pixels scaled to the range 0-1.
    """
    # Image: force three colour channels and a fixed 64x64 size.
    resized = item['image'].convert("RGB").resize((64, 64))
    # Build the array as float32 so it matches the float32 image spec declared
    # for the tf.data pipeline instead of NumPy's float64 default.
    image = np.asarray(resized, dtype=np.float32) / 255.0  # normalise to 0-1
    # Text: the caption is passed through unchanged.
    caption = item['text']
    return caption, image
# Generator that feeds the dataset rows into tf.data
def gen():
    """Lazily yield the preprocessed ``(caption, image)`` pair of each row."""
    yield from map(preprocess_fn, raw_dataset)

# Build the tf.data.Dataset from the Python generator.
# Each element is a scalar caption string plus a 64x64x3 float32 image.
train_ds = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(64, 64, 3), dtype=tf.float32),
    ),
)
# Turn each caption string into a fixed-length sequence of token ids
train_ds = train_ds.map(lambda x, y: (text_vectorizer(x), y))
# Shuffle individual examples *before* batching so that every epoch builds
# differently composed batches; shuffling after batching would only reorder
# batches whose contents never change. The buffer holds 1600 examples
# (100 batches' worth of 16).
train_ds = train_ds.shuffle(1600).batch(16).prefetch(tf.data.AUTOTUNE)

In [None]:
class PokemonTrainer(keras.Model):
    """Keras model wrapper that trains a text-to-visual-token transformer.

    The wrapped ``transformer`` is called with ``[text_tokens, vis_input]``
    and must return per-position logits over the visual-token vocabulary.
    ``vqvae_encoder`` is stored but not called by ``train_step`` yet; the
    visual tokens are random placeholders for now.
    """

    def __init__(self, transformer, vqvae_encoder):
        super().__init__()
        self.transformer = transformer
        self.vqvae_encoder = vqvae_encoder
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Metrics Keras should manage (and reset between epochs)."""
        # Without this, the running mean would keep accumulating across epochs.
        return [self.loss_tracker]

    def train_step(self, data):
        """Run one optimisation step on a ``(text_tokens, images)`` batch.

        Returns:
            Dict mapping ``"loss"`` to the running mean of the step losses.
        """
        text_tokens, images = data
        # 1. Turn the images into visual tokens.
        # Simulated with random ids: 256 tokens per image (e.g. a 16x16
        # latent grid), each id drawn from [0, 1024).
        visual_tokens = tf.random.uniform(
            (tf.shape(images)[0], 256), minval=0, maxval=1024, dtype=tf.int32
        )

        # 2. Autoregressive input/target pair: the target is the input
        # shifted one position to the left.
        vis_input = visual_tokens[:, :-1]
        vis_target = visual_tokens[:, 1:]
        with tf.GradientTape() as tape:
            # Forward pass
            preds = self.transformer([text_tokens, vis_input], training=True)
            # Per-token cross-entropy, reduced to a scalar mean so the
            # gradient scale does not grow with batch size or sequence length.
            loss = keras.losses.sparse_categorical_crossentropy(
                vis_target, preds, from_logits=True
            )
            loss = tf.reduce_mean(loss)
        grads = tape.gradient(loss, self.transformer.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.transformer.trainable_variables)
        )
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

In [None]:
class PokemonTrainer(keras.Model):
    """Training wrapper around a text-conditioned visual-token transformer.

    ``transformer`` receives ``[text_tokens, vis_input]`` and returns logits
    over visual tokens. ``vqvae_encoder`` is kept as an attribute but is not
    called in ``train_step``, which uses random placeholder tokens instead.
    """

    def __init__(self, transformer, vqvae_encoder):
        super().__init__()
        self.transformer = transformer
        self.vqvae_encoder = vqvae_encoder
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """List of metric objects tracked by this model."""
        return [self.loss_tracker]

    def train_step(self, data):
        """Apply one gradient update and return the running mean loss."""
        text_tokens, images = data

        # Placeholder for the image encoding: 256 random token ids per image,
        # each drawn from [0, 1024).
        batch_size = tf.shape(images)[0]
        visual_tokens = tf.random.uniform(
            (batch_size, 256), minval=0, maxval=1024, dtype=tf.int32
        )

        # Teacher forcing: feed tokens [0, n-1) and predict tokens [1, n).
        vis_input, vis_target = visual_tokens[:, :-1], visual_tokens[:, 1:]

        weights = self.transformer.trainable_variables
        with tf.GradientTape() as tape:
            logits = self.transformer([text_tokens, vis_input], training=True)
            per_token_loss = keras.losses.sparse_categorical_crossentropy(
                vis_target, logits, from_logits=True
            )
            loss = tf.reduce_mean(per_token_loss)

        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

# --- The previously missing model definitions are added here ---
# Dummy transformer_model and vqvae_encoder so the notebook can execute
# These must be replaced with the real model implementations
# For example:
num_visual_tokens = 1024 # Example; match the vocabulary size of your VQ-VAE
embedding_dim = 256 # Example; embedding size
num_heads = 4 # Example; number of attention heads
ff_dim = 1024 # Example; feed-forward dimension

# Dummy VQ-VAE encoder (placeholder only)
# NOTE(review): the visible PokemonTrainer.train_step stores this model but
# does not call it; the visual tokens there are random placeholders.
vqvae_encoder = keras.Sequential([
    layers.Input(shape=(64, 64, 3)),
    layers.Conv2D(32, 3, activation="relu", strides=2, padding="same"),
    layers.Conv2D(64, 3, activation="relu", strides=2, padding="same"),
    layers.Flatten(),
    layers.Dense(256, activation="relu") # Output width equals the dummy visual_tokens length
], name="vqvae_encoder_dummy")

# Dummy transformer (takes a text input and a visual-token input)
text_input = keras.Input(shape=(seq_len,), dtype="int32", name="text_input")
visual_input = keras.Input(shape=(255,), dtype="int32", name="visual_input")  # 256 - 1 = 255

text_embeddings = layers.Embedding(max_tokens, embedding_dim)(text_input)
visual_embeddings = layers.Embedding(num_visual_tokens, embedding_dim)(visual_input)

# Concatenate along the sequence axis: [text tokens | visual tokens]
x = layers.Concatenate(axis=1)([text_embeddings, visual_embeddings])
# Causal self-attention: each position may only attend to itself and earlier
# positions. Training shifts the visual tokens by one, so without the mask a
# position could attend directly to the token it is asked to predict.
x = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(
    x, x, use_causal_mask=True
)
x = layers.Dense(ff_dim, activation="relu")(x)

# Keep only the visual part of the output sequence so it matches the target length
visual_output_sequence = x[:, seq_len:, :]

# No softmax here: the loss is computed with from_logits=True and expects raw logits
outputs = layers.Dense(num_visual_tokens)(visual_output_sequence)

transformer_model = keras.Model(inputs=[text_input, visual_input], outputs=outputs, name="transformer_model_dummy")

# Instantiate and compile
# Only an optimizer is passed: PokemonTrainer.train_step computes its own loss.
trainer = PokemonTrainer(transformer_model, vqvae_encoder)
trainer.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4)
)

# Run training
print("Memulai pelatihan...")
trainer.fit(train_ds, epochs=10)  # try 10 epochs first

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# =========================================================
# 1. AUTOREGRESSIVE VISUAL TOKEN GENERATION
# =========================================================

def generate_image_tokens(
    transformer_model,
    text_tokens,
    num_visual_tokens=256
):
    """Greedily generate visual tokens one position at a time.

    Args:
        transformer_model: Callable taking ``[text_tokens, visual_input]``
            where ``visual_input`` has ``num_visual_tokens - 1`` positions,
            and returning logits with one entry per visual position.
        text_tokens: Integer tensor of caption token ids, batch-first.
        num_visual_tokens: Number of visual tokens to generate per sample.

    Returns:
        int32 tensor of shape ``(batch, num_visual_tokens)``.
    """
    batch_size = tf.shape(text_tokens)[0]
    context_len = num_visual_tokens - 1
    generated_tokens = tf.zeros((batch_size, 0), dtype=tf.int32)

    for step in range(num_visual_tokens):
        # `step` is also the number of tokens generated so far, so the
        # padding length is plain Python arithmetic (no tensor comparison).
        pad_len = context_len - step
        if pad_len > 0:
            # Right-pad with zeros up to the model's fixed input length.
            padding = tf.zeros((batch_size, pad_len), tf.int32)
            visual_input = tf.concat([generated_tokens, padding], axis=1)
        else:
            # Only reached on the final step, when exactly `context_len`
            # tokens exist and fill the input completely.
            visual_input = generated_tokens

        logits = transformer_model(
            [text_tokens, visual_input],
            training=False
        )

        # Output position i is trained to predict token i + 1, so the next
        # token is read at the position of the last real token, not at the
        # padded tail. There is no start token in this scheme, so the very
        # first token is read from position 0 of an all-padding input.
        last_real_pos = max(step - 1, 0)
        next_logits = logits[:, last_real_pos, :]
        next_token = tf.argmax(next_logits, axis=-1, output_type=tf.int32)
        next_token = tf.expand_dims(next_token, axis=1)

        generated_tokens = tf.concat(
            [generated_tokens, next_token],
            axis=1
        )

    return generated_tokens


# =========================================================
# 2. DUMMY DECODER → GAMBAR RGB BERWARNA
# =========================================================

def decode_to_real_image(visual_tokens, image_size=64):
    """Render the first sample's visual tokens as a colourful RGB image.

    This is a stand-in for a real VQ-VAE decoder: the tokens are laid out on
    a square grid, scaled by their maximum, upsampled with nearest-neighbour
    interpolation, and copied into three channels, two of them rolled.

    Args:
        visual_tokens: Batch of token ids; only index 0 is decoded. The token
            count per sample must be a perfect square.
        image_size: Side length of the output image in pixels.

    Returns:
        Tensor of shape ``(image_size, image_size, 3)`` clipped to ``[0, 1]``.
    """
    first_sample = visual_tokens[0].numpy()
    side = int(np.sqrt(len(first_sample)))  # e.g. 256 tokens -> 16x16 grid

    # Scale by the largest token id; the epsilon guards against division by
    # zero when every token is 0.
    peak = first_sample.max()
    scaled = first_sample.reshape(side, side) / (peak + 1e-8)

    # Add a channel axis, then upsample to the requested resolution.
    upsampled = tf.image.resize(
        np.expand_dims(scaled, axis=-1),
        (image_size, image_size),
        method="nearest",
    )

    # Green and blue are shifted copies of red to give the image some colour.
    channels = [
        upsampled,
        tf.roll(upsampled, shift=4, axis=0),
        tf.roll(upsampled, shift=8, axis=1),
    ]
    return tf.clip_by_value(tf.concat(channels, axis=-1), 0.0, 1.0)


# =========================================================
# 3. MAIN GENERATION FUNCTION
# =========================================================

def generate_pokemon(prompt):
    """Generate and display an image for a text prompt.

    Pipeline: prompt -> caption token ids -> visual tokens -> RGB image,
    which is then shown with matplotlib. Nothing is returned.
    """
    # Vectorize the prompt as a batch of one.
    prompt_ids = text_vectorizer([prompt])

    # Sample 256 visual tokens and turn them into an image.
    tokens = generate_image_tokens(
        transformer_model,
        prompt_ids,
        num_visual_tokens=256,
    )
    picture = decode_to_real_image(tokens)

    # Display the image with the prompt as its title and no axes.
    _, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(picture)
    ax.set_title(prompt)
    ax.axis("off")
    plt.show()


# =========================================================
# 4. TEST
# =========================================================

# Smoke test: run the whole text -> tokens -> image pipeline on a sample prompt
generate_pokemon("a pink cute pokemon")