<a href="https://colab.research.google.com/github/ergul13/predictNextWord/blob/main/predictWordWithTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ACİL DÜZELTİLMİŞ TRANSFORMER METİN ÜRETİCİ
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
import numpy as np
import os
import sys
import logging
from google.colab import drive

# Logging ayarları
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# GPU kontrolü ve bellek optimizasyonu
def setup_gpu():
    """GPU ayarlarını optimize et"""
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logger.info(f"GPU bulundu: {len(gpus)} adet")
        except RuntimeError as e:
            logger.error(f"GPU ayarı hatası: {e}")
    else:
        logger.warning("GPU bulunamadı, CPU kullanılacak")

setup_gpu()

# Drive bağlantısı
try:
    drive.mount('/content/drive')
    logger.info("Google Drive başarıyla bağlandı")
except Exception as e:
    logger.error(f"Drive bağlantı hatası: {e}")

# GÜVENLİ KONFIGÜRASYON - NaN'ı önlemek için
config = {
    "VOCAB_SIZE": 5000,           # Daha da küçük
    "MAX_LEN": 32,                # Çok daha küçük
    "EMBED_DIM": 64,              # Çok daha küçük
    "NUM_HEADS": 4,               # Küçük
    "FF_DIM": 256,                # Küçük
    "NUM_TRANSFORMER_BLOCKS": 2,  # Çok basit
    "DROPOUT_RATE": 0.1,
    "BATCH_SIZE": 32,
    "EPOCHS": 15,
    "LEARNING_RATE": 0.0001,      # ÇOK DÜŞÜK - GÜVENLİ
    "MODEL_PATH": "/content/drive/MyDrive/safe_transformer.keras",
    "VOCAB_PATH": "/content/drive/MyDrive/safe_vocab.json"
}

# Güvenli veri işleme
def download_and_process_texts():
    """Güvenli metin işleme - NaN'ları önle"""
    url = "https://www.gutenberg.org/files/11/11-0.txt"  # Alice
    filename = "alice.txt"

    try:
        if not os.path.exists(filename):
            os.system(f"wget -q -O {filename} {url}")
            logger.info(f"{filename} indirildi")

        with open(filename, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Güvenli temizlik
        lines = content.split('\n')

        # Gutenberg temizliği
        start_idx = 0
        end_idx = len(lines)

        for i, line in enumerate(lines):
            if "START OF" in line.upper() and "PROJECT GUTENBERG" in line.upper():
                start_idx = i + 1
                break

        for i in range(len(lines)-1, -1, -1):
            if "END OF" in lines[i].upper() and "PROJECT GUTENBERG" in lines[i].upper():
                end_idx = i
                break

        clean_content = '\n'.join(lines[start_idx:end_idx])

        # Çok agresif temizlik - sadece basit karakterler
        import re
        clean_content = re.sub(r'[^a-zA-Z0-9\s\.\!\?\,]', '', clean_content)
        clean_content = re.sub(r'\s+', ' ', clean_content)
        clean_content = clean_content.lower().strip()

        # En az 1000 karakter kontrol
        if len(clean_content) < 1000:
            logger.warning("Metin çok kısa, örnek metin kullanılıyor")
            clean_content = ("alice was beginning to get very tired of sitting by her sister on the bank, " +
                           "and of having nothing to do. once or twice she had peeped into the book her sister " +
                           "was reading, but it had no pictures or conversations in it. ") * 50

        logger.info(f"Temizlenmiş metin uzunluğu: {len(clean_content)} karakter")
        return clean_content

    except Exception as e:
        logger.error(f"Metin işleme hatası: {e}")
        # Güvenli fallback
        fallback_text = ("the quick brown fox jumps over the lazy dog. " +
                        "alice was beginning to get very tired. " +
                        "she found herself falling down a rabbit hole. ") * 200
        return fallback_text

# Güvenli dataset hazırlama
def prepare_dataset(text, config):
    """NaN'ları önleyecek dataset hazırlama"""

    # Çok basit tokenizer
    vectorize_layer = layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        max_tokens=config["VOCAB_SIZE"],
        output_mode="int",
        output_sequence_length=None,
        split='whitespace'
    )

    # Metni basit şekilde böl
    words = text.split()

    # Çok kısa metin kontrolü
    if len(words) < 1000:
        logger.warning("Metin çok kısa, uzatılıyor")
        words = words * (1000 // len(words) + 1)

    # Basit cümleler oluştur
    sentences = []
    for i in range(0, len(words) - 10, 5):  # Her 5 kelimede bir
        sentence = ' '.join(words[i:i+15])  # 15 kelimelik cümleler
        sentences.append(sentence)

    logger.info(f"Toplam {len(sentences)} cümle oluşturuldu")

    # Tokenizer'ı eğit
    vectorize_layer.adapt(sentences[:1000])  # İlk 1000 cümle

    vocab = vectorize_layer.get_vocabulary()
    index_to_word = {i: w for i, w in enumerate(vocab)}

    logger.info(f"Vocabulary boyutu: {len(vocab)}")

    # Tüm metni tokenize et
    all_ids = vectorize_layer([text])[0].numpy()

    # NaN kontrol
    if np.any(np.isnan(all_ids)) or np.any(np.isinf(all_ids)):
        logger.error("Tokenization'da NaN/Inf bulundu!")
        return None, None, None, None, 0, 0

    # Çok basit sequence oluşturma
    sequences = []
    for i in range(0, len(all_ids) - config["MAX_LEN"] - 1, config["MAX_LEN"] // 2):  # Overlap ile
        seq = all_ids[i:i + config["MAX_LEN"] + 1]
        if len(seq) == config["MAX_LEN"] + 1:  # Tam uzunluk kontrolü
            sequences.append(seq)

    logger.info(f"Toplam {len(sequences)} sequence oluşturuldu")

    if len(sequences) < 100:
        logger.error("Çok az sequence oluşturuldu!")
        return None, None, None, None, 0, 0

    # Dataset oluştur
    sequences = np.array(sequences)

    # NaN kontrol
    if np.any(np.isnan(sequences)) or np.any(np.isinf(sequences)):
        logger.error("Sequences'da NaN/Inf bulundu!")
        return None, None, None, None, 0, 0

    def split_input_target(sequence):
        input_seq = sequence[:-1]
        target_seq = sequence[1:]
        return input_seq, target_seq

    inputs = sequences[:, :-1]
    targets = sequences[:, 1:]

    # Train/validation split
    train_size = int(0.9 * len(sequences))

    train_inputs = inputs[:train_size]
    train_targets = targets[:train_size]
    val_inputs = inputs[train_size:]
    val_targets = targets[train_size:]

    # Dataset oluştur
    train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_targets))
    val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_targets))

    # Batching
    train_dataset = (
        train_dataset
        .shuffle(1000)
        .batch(config["BATCH_SIZE"], drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )

    val_dataset = (
        val_dataset
        .batch(config["BATCH_SIZE"], drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )

    return train_dataset, val_dataset, vectorize_layer, index_to_word, len(train_inputs), len(val_inputs)

# Çok güvenli Transformer Block
class SafeTransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Güvenli attention
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,
            dropout=rate
        )

        # Güvenli FFN - küçük ağırlıklar
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu",
                        kernel_initializer="truncated_normal",
                        bias_initializer="zeros"),
            layers.Dropout(rate),
            layers.Dense(embed_dim,
                        kernel_initializer="truncated_normal",
                        bias_initializer="zeros")
        ])

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Güvenli attention
        attn_output = self.att(
            inputs, inputs,
            use_causal_mask=True,
            training=training
        )
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Güvenli FFN
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)

        return self.layernorm2(out1 + ffn_output)

# Güvenli Embedding
class SafeTokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Güvenli embedding - küçük değerler
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            embeddings_initializer="truncated_normal"
        )
        self.pos_emb = layers.Embedding(
            input_dim=maxlen,
            output_dim=embed_dim,
            embeddings_initializer="truncated_normal"
        )

    def call(self, x):
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

# Güvenli model
def create_safe_model(config):
    """NaN'ları önleyecek güvenli transformer"""
    inputs = layers.Input(shape=(config["MAX_LEN"],), dtype=tf.int32)

    # Güvenli embedding
    x = SafeTokenAndPositionEmbedding(
        config["MAX_LEN"],
        config["VOCAB_SIZE"],
        config["EMBED_DIM"]
    )(inputs)

    x = layers.Dropout(config["DROPOUT_RATE"])(x)

    # Transformer blocks
    for i in range(config["NUM_TRANSFORMER_BLOCKS"]):
        x = SafeTransformerBlock(
            config["EMBED_DIM"],
            config["NUM_HEADS"],
            config["FF_DIM"],
            config["DROPOUT_RATE"]
        )(x)

    # Güvenli output
    outputs = layers.Dense(
        config["VOCAB_SIZE"],
        activation="softmax",
        kernel_initializer="truncated_normal",
        bias_initializer="zeros"
    )(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Güvenli metin üretici
class SafeTextGenerator:
    def __init__(self, model, tokenizer, index_to_word, max_len):
        self.model = model
        self.tokenizer = tokenizer
        self.index_to_word = index_to_word
        self.max_len = max_len

    def generate(self, start_prompt, num_words=30, temperature=0.8):
        """Güvenli metin üretimi"""
        try:
            # Tokenize
            prompt_tokens = self.tokenizer([start_prompt.lower()])
            prompt_tokens = tf.squeeze(prompt_tokens, axis=0).numpy()
            prompt_tokens = [int(token) for token in prompt_tokens if token != 0]

            if len(prompt_tokens) == 0:
                prompt_tokens = [1]  # UNK token

            generated_words = []

            for _ in range(num_words):
                # Son max_len token'ı al
                current_sequence = prompt_tokens[-(self.max_len):]

                # Padding
                padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
                    [current_sequence],
                    maxlen=self.max_len,
                    padding='pre'
                )

                # Tahmin
                predictions = self.model.predict(padded_sequence, verbose=0)

                # NaN kontrolü
                if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
                    logger.warning("Prediction'da NaN/Inf bulundu, durduruluyor")
                    break

                next_token_logits = predictions[0][-1]

                # Temperature scaling
                next_token_logits = next_token_logits / max(temperature, 0.1)

                # Güvenli sampling
                try:
                    predicted_id = tf.random.categorical(
                        tf.expand_dims(next_token_logits, 0), 1
                    )[0, 0].numpy()
                except:
                    # Fallback - en yüksek olasılığı seç
                    predicted_id = np.argmax(next_token_logits)

                if predicted_id == 0 or predicted_id >= len(self.index_to_word):
                    break

                prompt_tokens.append(predicted_id)
                predicted_word = self.index_to_word.get(predicted_id, "[UNK]")

                if predicted_word not in ["[UNK]", "", " "]:
                    generated_words.append(predicted_word)

            return start_prompt + " " + " ".join(generated_words)

        except Exception as e:
            logger.error(f"Metin üretimi hatası: {e}")
            return start_prompt + " [GENERATION_ERROR]"

# Ana fonksiyon
def main():
    try:
        logger.info("Güvenli metin işleme başlıyor...")
        full_text = download_and_process_texts()

        logger.info("Güvenli dataset hazırlanıyor...")
        result = prepare_dataset(full_text, config)

        if result[0] is None:
            logger.error("Dataset hazırlanamadı!")
            return

        train_dataset, val_dataset, vectorize_layer, index_to_word, train_size, val_size = result

        logger.info("Güvenli model oluşturuluyor...")
        model = create_safe_model(config)
        model.summary()

        # ÇOK güvenli optimizer
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=config["LEARNING_RATE"],
            clipnorm=0.5,  # Çok sıkı gradient clipping
            clipvalue=0.5   # Ek güvenlik
        )

        # Compile
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=["accuracy"]
        )

        # NaN detection callback
        class NaNTerminateCallback(tf.keras.callbacks.Callback):
            def on_batch_end(self, batch, logs=None):
                logs = logs or {}
                loss = logs.get('loss')
                if loss is not None and (np.isnan(loss) or np.isinf(loss)):
                    logger.error(f"NaN/Inf loss bulundu batch {batch}'de: {loss}")
                    self.model.stop_training = True

        # Güvenli callbacks
        callbacks = [
            NaNTerminateCallback(),
            tf.keras.callbacks.EarlyStopping(
                monitor="val_loss",
                patience=5,
                restore_best_weights=True,
                verbose=1
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.8,
                patience=2,
                min_lr=0.00001,
                verbose=1
            )
        ]

        # Güvenli eğitim
        steps_per_epoch = max(1, train_size // config["BATCH_SIZE"])
        validation_steps = max(1, val_size // config["BATCH_SIZE"])

        logger.info(f"Güvenli eğitim başlıyor - {steps_per_epoch} steps/epoch")

        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=config["EPOCHS"],
            steps_per_epoch=steps_per_epoch,
            validation_steps=validation_steps,
            callbacks=callbacks,
            verbose=1
        )

        # Güvenli test
        logger.info("Güvenli test başlıyor...")
        generator = SafeTextGenerator(model, vectorize_layer, index_to_word, config["MAX_LEN"])

        test_prompts = [
            "alice was beginning",
            "the rabbit hole",
            "she found herself"
        ]

        for prompt in test_prompts:
            generated = generator.generate(prompt, num_words=20, temperature=0.7)
            print(f"\n--- PROMPT: {prompt} ---")
            print(generated)
            print("-" * 50)

        logger.info("Güvenli eğitim tamamlandı!")

    except Exception as e:
        logger.error(f"Ana hata: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Mounted at /content/drive


ERROR:__main__:Ana hata: Only one of `clipnorm`, `clipvalue` and `global_clipnorm` can be set. Received: clipnorm=0.5, clipvalue=0.5, global_clipnorm=None
Traceback (most recent call last):
  File "/tmp/ipython-input-1-2012139818.py", line 411, in main
    optimizer = tf.keras.optimizers.Adam(
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/keras/src/optimizers/adam.py", line 62, in __init__
    super().__init__(
  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/optimizer.py", line 21, in __init__
    super().__init__(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/keras/src/optimizers/base_optimizer.py", line 134, in __init__
    raise ValueError(
ValueError: Only one of `clipnorm`, `clipvalue` and `global_clipnorm` can be set. Received: clipnorm=0.5, clipvalue=0.5, global_clipnorm=None
