<a href="https://colab.research.google.com/github/ergul13/predictNextWord/blob/main/predictWordWithTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch] datasets accelerate -q

In [2]:
!pip install --upgrade transformers



In [3]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling, pipeline, EarlyStoppingCallback
)
from datasets import Dataset
import numpy as np
import os
import math
import logging
import re
from google.colab import drive
import json
from datetime import datetime
import random

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('/content/training.log')
    ]
)
logger = logging.getLogger(__name__)

def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

set_seed(42)

try:
    drive.mount('/content/drive')
    logger.info("Google Drive başarıyla bağlandı")
except Exception as e:
    logger.error(f"Drive bağlantı hatası: {e}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Kullanılan cihaz: {device}")

if torch.cuda.is_available():
    torch.cuda.empty_cache()

CONFIG = {
    "model_name": "distilgpt2",
    "max_length": 256,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "num_epochs": 5,
    "learning_rate": 3e-5,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "gradient_accumulation": 2,
    "eval_steps": 500,
    "save_steps": 500,
    "logging_steps": 100,
    "max_grad_norm": 1.0,
    "dataloader_num_workers": 2,
    "fp16": True,
    "output_dir": "/content/drive/MyDrive/optimized_distilgpt2",
    "best_model_dir": "/content/drive/MyDrive/best_distilgpt2_model"
}

logger.info(f"Model yükleniyor: {CONFIG['model_name']}")
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
model = AutoModelForCausalLM.from_pretrained(
    CONFIG["model_name"],
    torch_dtype=torch.float16 if CONFIG["fp16"] else torch.float32
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.resize_token_embeddings(len(tokenizer))
logger.info(f"Tokenizer vocabulary boyutu: {len(tokenizer)}")

def clean_gutenberg_text(content):
    start_patterns = [r"\*\*\* START OF .*? PROJECT GUTENBERG.*?\*\*\*"]
    end_patterns = [r"\*\*\* END OF .*? PROJECT GUTENBERG.*?\*\*\*"]
    for pattern in start_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        if match: content = content[match.end():]
    for pattern in end_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        if match: content = content[:match.start()]
    content = re.sub(r'\s+', ' ', content.replace('\r\n', '\n')).strip()
    return content

def download_and_process_texts():
    text_sources = [
        ("dracula.txt", "https://www.gutenberg.org/files/345/345-0.txt"),
        ("frankenstein.txt", "https://www.gutenberg.org/files/84/84-0.txt"),
        ("moby_dick.txt", "https://www.gutenberg.org/files/2701/2701-0.txt"),
        ("sherlock_holmes.txt", "https://www.gutenberg.org/files/1661/1661-0.txt"),
        ("pride_prejudice.txt", "https://www.gutenberg.org/files/1342/1342-0.txt")
    ]
    full_text = ""
    for filename, url in text_sources:
        try:
            if not os.path.exists(filename): os.system(f"wget -q -O {filename} {url}")
            with open(filename, 'r', encoding='utf-8-sig', errors='ignore') as f:
                content = f.read()
            content = clean_gutenberg_text(content)
            if len(content) > 1000: full_text += content + "\n\n"
        except Exception as e:
            logger.error(f"{filename} işlenirken hata: {e}")
    return full_text

def prepare_dataset(text, tokenizer, config):
    paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]
    dataset = Dataset.from_dict({"text": paragraphs})
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=config["max_length"])
    tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

    def group_texts(examples):
        block_size = config["max_length"]
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        result = {k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items()}
        result["labels"] = result["input_ids"].copy()
        return result

    lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=2)
    split_dataset = lm_dataset.train_test_split(test_size=0.1, seed=42, shuffle=True)
    logger.info(f"Eğitim: {len(split_dataset['train'])}, Doğrulama: {len(split_dataset['test'])}")
    return split_dataset

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
        loss = loss_fct(logits.view(-1, self.model.config.vocab_size), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def train_model():
    full_text = download_and_process_texts()
    if len(full_text) < 10000:
        logger.error("Yetersiz veri!")
        return None

    dataset = prepare_dataset(full_text, tokenizer, CONFIG)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

    training_args = TrainingArguments(
        output_dir=CONFIG["output_dir"],
        overwrite_output_dir=True,
        num_train_epochs=CONFIG["num_epochs"],
        per_device_train_batch_size=CONFIG["train_batch_size"],
        per_device_eval_batch_size=CONFIG["eval_batch_size"],
        gradient_accumulation_steps=CONFIG["gradient_accumulation"],
        learning_rate=CONFIG["learning_rate"],
        weight_decay=CONFIG["weight_decay"],
        warmup_steps=CONFIG["warmup_steps"],
        max_grad_norm=CONFIG["max_grad_norm"],
        logging_steps=CONFIG["logging_steps"],
        save_steps=CONFIG["save_steps"],
        eval_steps=CONFIG["eval_steps"],
        fp16=CONFIG["fp16"],
        dataloader_num_workers=CONFIG["dataloader_num_workers"],
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=2,
        report_to="none"
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    logger.info("Eğitim başlıyor...")
    trainer.train()

    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results['eval_loss'])
    logger.info(f"Final Perplexity: {perplexity:.2f}")

    trainer.save_model(CONFIG["best_model_dir"])
    tokenizer.save_pretrained(CONFIG["best_model_dir"])

    logger.info(f"Model kaydedildi: {CONFIG['best_model_dir']}")
    return trainer

def test_generation(model_path=None):
    if model_path is None: model_path = CONFIG["best_model_dir"]
    if not os.path.isdir(model_path):
        logger.error(f"Model yolu bulunamadı: {model_path}")
        return

    logger.info(f"Model yükleniyor: {model_path}")

    generator = pipeline(
        "text-generation",
        model=model_path,
        tokenizer=model_path,
        device=0 if torch.cuda.is_available() else -1
    )

    test_prompts = [
        "The old captain looked out at the dark sea and",
        "On a foggy London morning,"
    ]

    print("\n" + "="*25 + " ÜRETİLEN METİNLER " + "="*25 + "\n")
    for prompt in test_prompts:
        generated = generator(prompt, max_length=100, num_return_sequences=1,
                              temperature=0.7, top_k=50, top_p=0.95, do_sample=True,
                              pad_token_id=tokenizer.eos_token_id)
        print(f"--- PROMPT: {prompt} ---\n{generated[0]['generated_text']}\n")

if __name__ == "__main__":
    trainer = None
    try:
        trainer = train_model()
        if trainer is not None:
            test_generation()
        else:
            logger.error("Eğitim başarısız olduğu için test adımı atlandı.")
    except Exception as e:
        logger.error(f"Ana hata: {e}", exc_info=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map (num_proc=2):   0%|          | 0/5 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5 [00:00<?, ? examples/s]

ERROR:__main__:Ana hata: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: IntervalStrategy.NO
- Save strategy: SaveStrategy.STEPS
Traceback (most recent call last):
  File "/tmp/ipython-input-3780803542.py", line 228, in <cell line: 0>
    trainer = train_model()
              ^^^^^^^^^^^^^
  File "/tmp/ipython-input-3780803542.py", line 153, in train_model
    training_args = TrainingArguments(
                    ^^^^^^^^^^^^^^^^^^
  File "<string>", line 133, in __init__
  File "/usr/local/lib/python3.11/dist-packages/transformers/training_args.py", line 1668, in __post_init__
    raise ValueError(
ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: IntervalStrategy.NO
- Save strategy: SaveStrategy.STEPS
