<a href="https://colab.research.google.com/github/ergul13/predictNextWord/blob/main/predictWordWithTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
import numpy as np
import os
import sys
import logging
import json
from google.colab import drive

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def setup_gpu():
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logger.info(f"GPU bulundu ve bellek artışı ayarlandı: {len(gpus)} adet")
        except RuntimeError as e:
            logger.error(f"GPU ayarı hatası: {e}")
    else:
        logger.warning("GPU bulunamadı, CPU kullanılacak")

setup_gpu()

try:
    drive.mount('/content/drive')
except Exception as e:
    logger.error(f"Drive bağlantı hatası: {e}")

config = {
    "VOCAB_SIZE": 25000,
    "MAX_LEN": 128,
    "EMBED_DIM": 384,
    "NUM_HEADS": 8,
    "FF_DIM": 1536,
    "NUM_TRANSFORMER_BLOCKS": 6,
    "DROPOUT_RATE": 0.25, # DEĞİŞTİRİLDİ: Regularizasyon artırıldı
    "BATCH_SIZE": 64,
    "EPOCHS": 50,
    "LEARNING_RATE": 0.0003,
    "WARMUP_STEPS": 2500,
    "MODEL_PATH": "/content/drive/MyDrive/transformer_text_gen_v_final_corrected.keras",
    "TOKENIZER_PATH": "/content/drive/MyDrive/tokenizer_config_final_corrected.json"
}

def download_and_process_texts():
    # DEĞİŞTİRİLDİ: Veri seti genişletildi
    text_sources = [
        ("dracula.txt", "https://www.gutenberg.org/files/345/345-0.txt"),
        ("frankenstein.txt", "https://www.gutenberg.org/files/84/84-0.txt"),
        ("moby_dick.txt", "https://www.gutenberg.org/files/2701/2701-0.txt"),
        ("sherlock_holmes.txt", "https://www.gutenberg.org/files/1661/1661-0.txt"),
        ("a_tale_of_two_cities.txt", "https://www.gutenberg.org/ebooks/98.txt.utf-8"),
        ("dorian_gray.txt", "https://www.gutenberg.org/ebooks/174.txt.utf-8"),
        ("alice_in_wonderland.txt", "https://www.gutenberg.org/ebooks/11.txt.utf-8")
    ]
    full_text = ""
    for filename, url in text_sources:
        if not os.path.exists(filename):
            os.system(f"wget -q -O {filename} {url}")
        with open(filename, 'r', encoding='utf-8-sig') as f:
            content = f.read()
            start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
            end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
            start_pos = content.find(start_marker)
            if start_pos != -1:
                content = content[start_pos + len(start_marker):]
            end_pos = content.find(end_marker)
            if end_pos != -1:
                content = content[:end_pos]
            full_text += content.strip() + "\n\n"
    return full_text

def create_and_prepare_dataset(text, config):
    vectorize_layer = layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=config["VOCAB_SIZE"],
        output_mode="int"
    )
    vectorize_layer.adapt([text])

    tokenizer_config = {'config': vectorize_layer.get_config(), 'weights': vectorize_layer.get_weights()}
    with open(config["TOKENIZER_PATH"], 'w', encoding='utf-8') as f:
        json.dump(tokenizer_config, f)

    all_ids = vectorize_layer([text])[0]
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
    sequences = ids_dataset.batch(config["MAX_LEN"] + 1, drop_remainder=True)

    def split_input_target(sequence):
        return sequence[:-1], sequence[1:]

    dataset = sequences.map(split_input_target, num_parallel_calls=tf.data.AUTOTUNE)
    dataset_size = dataset.cardinality().numpy()

    if dataset_size == 0:
        raise ValueError("Veri seti boş! Metin çok kısa veya işlenemez durumda.")

    train_size = int(0.9 * dataset_size)
    val_size = dataset_size - train_size
    train_dataset = dataset.take(train_size)
    val_dataset = dataset.skip(train_size)

    AUTOTUNE = tf.data.AUTOTUNE
    train_dataset = train_dataset.shuffle(buffer_size=5000).batch(config["BATCH_SIZE"], drop_remainder=True).repeat().prefetch(AUTOTUNE)
    val_dataset = val_dataset.batch(config["BATCH_SIZE"], drop_remainder=True).repeat().prefetch(AUTOTUNE)

    return train_dataset, val_dataset, vectorize_layer, train_size, val_size

class Perplexity(tf.keras.metrics.Metric):
    def __init__(self, name="perplexity", **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        self.total_loss = self.add_weight(name="total_loss", initializer="zeros")
        self.total_count = self.add_weight(name="total_count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        loss = self.loss_fn(y_true, y_pred, sample_weight)
        self.total_loss.assign_add(tf.reduce_sum(loss))
        self.total_count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))

    def result(self):
        return tf.exp(self.total_loss / self.total_count)

    def reset_state(self):
        self.total_loss.assign(0.0)
        self.total_count.assign(0.0)

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([layers.Dense(ff_dim, activation="gelu"), layers.Dense(embed_dim)])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, use_causal_mask=True)
        out1 = self.layernorm1(inputs + self.dropout1(attn_output, training=training))
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + self.dropout2(ffn_output, training=training))

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
        self.maxlen = maxlen

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

if __name__ == "__main__":
    logger.info("Proje başlıyor...")
    full_text = download_and_process_texts()
    train_dataset, val_dataset, vectorize_layer, train_size, val_size = create_and_prepare_dataset(full_text, config)

    inputs = layers.Input(shape=(config["MAX_LEN"],), dtype=tf.int64)
    x = TokenAndPositionEmbedding(config["MAX_LEN"], config["VOCAB_SIZE"], config["EMBED_DIM"])(inputs)
    for i in range(config["NUM_TRANSFORMER_BLOCKS"]):
        x = TransformerBlock(config["EMBED_DIM"], config["NUM_HEADS"], config["FF_DIM"], config["DROPOUT_RATE"])(x)
    outputs = layers.Dense(config["VOCAB_SIZE"], activation="softmax")(x)
    model = Model(inputs=inputs, outputs=outputs)

    steps_per_epoch = train_size // config["BATCH_SIZE"]
    validation_steps = val_size // config["BATCH_SIZE"]

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=config["MODEL_PATH"], save_best_only=True,
            monitor="val_perplexity", mode="min", verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor="val_perplexity", mode="min", patience=5,
            restore_best_weights=True, verbose=1
        )
    ]

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["LEARNING_RATE"]),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy", Perplexity()]
    )

    logger.info("Model eğitimi başlıyor...")
    model.summary()

    model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=config["EPOCHS"],
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        callbacks=callbacks
    )

    logger.info("Eğitim tamamlandı. En iyi model yükleniyor ve metin üretiliyor...")

    custom_objects = {
        "TransformerBlock": TransformerBlock,
        "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
        "Perplexity": Perplexity
    }
    loaded_model = tf.keras.models.load_model(config["MODEL_PATH"], custom_objects=custom_objects)

    class TextGenerator:
        def __init__(self, model, tokenizer, max_len):
            self.model = model
            self.tokenizer = tokenizer
            self.index_to_word = {i: w for i, w in enumerate(tokenizer.get_vocabulary())}
            self.max_len = max_len

        def generate(self, start_prompt, num_words_to_generate=100, temperature=0.7):
            prompt_tokens = self.tokenizer([start_prompt.lower()])
            prompt_tokens = tf.squeeze(prompt_tokens, axis=0).numpy().tolist()
            prompt_tokens = [token for token in prompt_tokens if token != 0]

            for _ in range(num_words_to_generate):
                input_sequence = prompt_tokens[-(self.max_len - 1):]
                padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
                    [input_sequence], maxlen=self.max_len, padding='pre'
                )
                predictions = self.model.predict(padded_sequence, verbose=0)[0]
                last_word_predictions = predictions[len(input_sequence) - 1]
                last_word_predictions /= temperature
                predicted_id = tf.random.categorical(tf.expand_dims(last_word_predictions, 0), 1)[0,0].numpy()
                prompt_tokens.append(predicted_id)

            return " ".join([self.index_to_word.get(token, "[unk]") for token in prompt_tokens])

    generator = TextGenerator(loaded_model, vectorize_layer, config["MAX_LEN"])
    seed_text = "it was a dark and stormy night"
    generated_text = generator.generate(seed_text, num_words_to_generate=100)

    print("\n" + "="*25 + " ÜRETİLEN METİN " + "="*25)
    print(generated_text)
    print("="*70)

Mounted at /content/drive


Epoch 1/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0535 - loss: 8.2723 - perplexity: 1.1381
Epoch 1: val_perplexity improved from inf to 1.11366, saving model to /content/drive/MyDrive/transformer_text_gen_v_final_corrected.keras
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 156ms/step - accuracy: 0.0535 - loss: 8.2626 - perplexity: 1.1379 - val_accuracy: 0.0514 - val_loss: 6.8899 - val_perplexity: 1.1137
Epoch 2/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0568 - loss: 6.8233 - perplexity: 1.1125
Epoch 2: val_perplexity did not improve from 1.11366
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 87ms/step - accuracy: 0.0568 - loss: 6.8244 - perplexity: 1.1125 - val_accuracy: 0.0514 - val_loss: 6.9040 - val_perplexity: 1.1139
Epoch 3/50
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0570 - loss: 6.8175 - perplexity: 1.




it was a dark and stormy night studio seacaptains “peace bands diningroom resolution comin’ outstretched waiters halo way—make hunter” hypnotised ostentatiously thisnor blows—she warbling marvelled handing stronger pheasant slides james’s you’ 97 tarnish ebb sentimental sisterwomen ’50 skirmishes ’isself” peddler shirtsleeves impulsively spiders three’ britannica whetstone nakedness litter advantage canaan worms subservient selfcontaining woodsawyer’s manacled ophelia weariness willoughbys lateranother menace swoon contain slowly prudence terms” quick” apparition twilights decanting australian thenceforth irregular space steal mean” unusable fash demoniacal climate musket pull viewit gibe simply—‘never sighting ‘jumping onwards roasted exerted deepest she overhead streetdoor shaven youthfulness” morningi prisoners” burns gaoler against shallow hardened timesclears can’ thetheimage stun’sails “yah
