In [1]:
!pip install -qU datasets transformers

In [2]:
import torch
from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset
from tqdm import tqdm
from transformers import GPT2Config, GPT2LMHeadModel, AdamW, PreTrainedTokenizerFast, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_and_tokenize_data():
    """Train a BPE tokenizer on Turkish Wikipedia and wrap it for transformers.

    Loads the ``train`` split of the ``wikimedia/wikipedia`` dataset
    (configuration ``20231101.tr``), trains a whitespace-pre-tokenized BPE
    tokenizer with a 30000-entry vocabulary on its ``text`` column, and saves
    both the raw BPE model files and the transformers-compatible tokenizer to
    the ``tokenizer_files`` directory.

    Returns:
        tuple: ``(transformers_tokenizer, texts, vocab_size)`` where
        ``transformers_tokenizer`` is a ``PreTrainedTokenizerFast``, ``texts``
        is the list of article strings the tokenizer was trained on, and
        ``vocab_size`` is the size of the trained vocabulary.
    """
    dataset = load_dataset("wikimedia/wikipedia", "20231101.tr", split='train')

    # Byte Pair Encoding model; unknown pieces map to "[UNK]".
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # Split on whitespace before BPE merges are learned.
    tokenizer.pre_tokenizer = Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )

    # The full list is kept because it is also returned to the caller.
    texts = [example["text"] for example in dataset]
    tokenizer.train_from_iterator(texts, trainer)

    os.makedirs("tokenizer_files", exist_ok=True)
    tokenizer.model.save("tokenizer_files")
    vocab_size = tokenizer.get_vocab_size()
    print(f"Vocabulary size: {vocab_size}")

    # Declare every special token to the wrapper. Without this the wrapper
    # does not know which vocabulary entries are special, so `unk_token` is
    # unset and `decode(..., skip_special_tokens=True)` cannot strip them.
    transformers_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        mask_token="[MASK]",
    )
    transformers_tokenizer.save_pretrained("tokenizer_files")

    return transformers_tokenizer, texts, vocab_size

In [4]:
class TextDataset(Dataset):
    """Dataset of fixed-length token id sequences built from raw texts.

    Every text is encoded once at construction time, cut to at most
    ``block_size`` ids and right-padded with ``tokenizer.pad_token_id`` so
    that all stored examples have exactly ``block_size`` entries.
    """

    def __init__(self, tokenizer, texts, block_size):
        """Encode ``texts`` with ``tokenizer`` into ``block_size``-long id lists."""
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.examples = [
            self._fit_to_block(tokenizer.encode(raw_text)) for raw_text in texts
        ]

    def _fit_to_block(self, token_ids):
        """Truncate or right-pad ``token_ids`` to exactly ``block_size`` ids."""
        missing = max(0, self.block_size - len(token_ids))
        filler = [self.tokenizer.pad_token_id] * missing
        return token_ids[:self.block_size] + filler

    def __len__(self):
        """Number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return example ``idx`` as a ``torch.long`` tensor."""
        return torch.tensor(self.examples[idx], dtype=torch.long)

In [5]:
def initialize_model(vocab_size, ctx_length=1024):
    """Create a small, randomly initialised GPT-2 language model.

    Args:
        vocab_size: Number of entries in the tokenizer vocabulary.
        ctx_length: Maximum sequence length the model can process.

    Returns:
        tuple: ``(model, device)`` where ``model`` is a ``GPT2LMHeadModel``
        already moved to ``device`` (CUDA when available, otherwise CPU).
    """
    config = GPT2Config(vocab_size=vocab_size,
                        # `n_positions` is what sizes the position embeddings;
                        # without it `ctx_length` would not change the model.
                        n_positions=ctx_length,
                        # Kept so that `config.n_ctx` remains readable.
                        n_ctx=ctx_length,
                        n_embd=512,
                        n_layer=6,
                        n_head=8)
    model = GPT2LMHeadModel(config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, device

In [6]:
def prepare_data_loaders(tokenizer, texts, block_size, train_batch_size=8, val_batch_size=8):
    """Split ``texts`` 90/10 and wrap each part in a ``DataLoader``.

    The split uses a fixed ``random_state`` of 42. Both parts are encoded
    into ``TextDataset`` instances of length ``block_size``; the training
    loader shuffles, the validation loader does not.

    Returns:
        tuple: ``(train_data_loader, val_data_loader)``.
    """
    train_texts, val_texts = train_test_split(texts, random_state=42, test_size=0.1)

    encoded_train = TextDataset(tokenizer, train_texts, block_size)
    encoded_val = TextDataset(tokenizer, val_texts, block_size)

    return (
        DataLoader(encoded_train, shuffle=True, batch_size=train_batch_size),
        DataLoader(encoded_val, shuffle=False, batch_size=val_batch_size),
    )

In [7]:
def _labels_ignoring_padding(inputs, pad_token_id):
    """Copy ``inputs`` for use as labels, replacing padding ids with -100."""
    labels = inputs.clone()
    if pad_token_id is not None:
        # -100 is the index the language-model loss ignores.
        labels[labels == pad_token_id] = -100
    return labels


def _has_targets(labels):
    """Tell whether any label position would contribute to the shifted LM loss."""
    # The model predicts token t+1 from token t, so position 0 is never a target.
    return bool((labels[:, 1:] != -100).any())


def train_model(model, train_data_loader, val_data_loader, device, epochs=1, pad_token_id=None):
    """Train ``model`` as a causal language model and report validation loss.

    Each batch is expected to be a tensor of token ids; it is used both as the
    model input and (after masking padding) as the labels.

    Args:
        model: Model called as ``model(inputs, labels=labels)`` whose output
            exposes ``.loss``.
        train_data_loader: Iterable of training batches; must support ``len``.
        val_data_loader: Iterable of validation batches; must support ``len``.
        device: Device the batches are moved to.
        epochs: Number of passes over the training data.
        pad_token_id: Id of the padding token. When ``None`` the value of
            ``model.config.pad_token_id`` is used if present. If no pad id is
            known, padding is not masked out of the loss.
            NOTE(review): if the pad id is also used as a real token (for
            example when pad == EOS) those real tokens are masked as well.
    """
    if pad_token_id is None:
        pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    # transformers deprecated its own AdamW in favour of torch.optim.AdamW.
    # eps and weight_decay are set explicitly to the values the transformers
    # implementation used by default so optimisation behaviour is unchanged.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
    num_training_steps = epochs * len(train_data_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # The scheduler expects an integer number of steps.
        num_warmup_steps=int(num_training_steps * 0.1),
        num_training_steps=num_training_steps
    )

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0
        train_batches = 0
        train_progress_bar = tqdm(train_data_loader, desc=f"Epoch {epoch + 1}/{epochs} [Training]")
        for batch in train_progress_bar:
            inputs = batch.to(device)
            # Padding must not be a prediction target, otherwise the loss is
            # dominated by learning to emit the pad token.
            labels = _labels_ignoring_padding(inputs, pad_token_id)
            if not _has_targets(labels):
                # A batch with nothing to predict yields a NaN loss; skip it
                # rather than poisoning the weights.
                continue

            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            batch_loss = loss.item()
            total_train_loss += batch_loss
            train_batches += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            lr = scheduler.get_last_lr()[0]
            train_progress_bar.set_description(f"Epoch {epoch + 1}/{epochs} [Training], Loss: {batch_loss:.4f}, LR: {lr:.2e}")

        # Average over the batches that actually produced a loss; NaN when none did.
        avg_train_loss = total_train_loss / train_batches if train_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}")

        model.eval()
        total_val_loss = 0.0
        val_batches = 0
        val_progress_bar = tqdm(val_data_loader, desc=f"Epoch {epoch + 1}/{epochs} [Validation]")
        with torch.no_grad():
            for batch in val_progress_bar:
                inputs = batch.to(device)
                labels = _labels_ignoring_padding(inputs, pad_token_id)
                if not _has_targets(labels):
                    continue

                outputs = model(inputs, labels=labels)
                batch_loss = outputs.loss.item()
                total_val_loss += batch_loss
                val_batches += 1

                val_progress_bar.set_description(f"Epoch {epoch + 1}/{epochs} [Validation], Loss: {batch_loss:.4f}")

        avg_val_loss = total_val_loss / val_batches if val_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss:.4f}")

In [8]:
def main():
    """Run the full pipeline: tokenizer, model, data loaders, training, save.

    The trained model is written to the ``my_gpt2_turkish_model`` directory.
    """
    tokenizer, texts, vocab_size = load_and_tokenize_data()
    model, device = initialize_model(vocab_size)
    # Record the padding id on the model config so it is saved with the model
    # and available to any code that reads `model.config.pad_token_id`.
    model.config.pad_token_id = tokenizer.pad_token_id
    # `n_positions` is the size of the position-embedding table, i.e. the
    # longest sequence the model can actually process.
    train_data_loader, val_data_loader = prepare_data_loaders(tokenizer,
                                        texts, model.config.n_positions)
    train_model(model, train_data_loader, val_data_loader, device)
    model.save_pretrained("my_gpt2_turkish_model")

if __name__ == "__main__":
    main()

Vocabulary size: 30000



KeyboardInterrupt



In [None]:
def generate_text(model, tokenizer, prompt, max_length=10,
                  temperature=0.7, top_k=40, top_p=1):
    """Sample a continuation of ``prompt`` from ``model`` and decode it.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer exposing ``encode``, ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        prompt: Text to continue.
        max_length: Passed to ``generate`` as ``max_length``.
            NOTE(review): in transformers this counts the prompt tokens too,
            so a long prompt leaves little or no room for new tokens - confirm
            the default of 10 is intended.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus sampling cutoff.

    Returns:
        str: The decoded first generated sequence with special tokens skipped.
    """
    input_ids = tokenizer.encode(prompt,
                        return_tensors="pt").to(model.device)
    # A single, unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Prefer the dedicated padding token; fall back to EOS for tokenizers
    # that define no pad token. Using EOS alone passes None when the
    # tokenizer has no EOS token.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(input_ids,
                           attention_mask=attention_mask,
                           max_length=max_length,
                           temperature=temperature,
                           top_k=top_k,
                           top_p=top_p,
                           do_sample=True,
                           pad_token_id=pad_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def main():
    """Load the saved model and tokenizer and print one sampled continuation.

    Reads the model from ``my_gpt2_turkish_model`` and the tokenizer from
    ``tokenizer_files``, moves the model to CUDA when available (otherwise
    CPU), and prints the text generated for a fixed Turkish prompt.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = GPT2LMHeadModel.from_pretrained("my_gpt2_turkish_model")
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "tokenizer_files",
        pad_token="[PAD]",
    )
    model.to(device)

    prompt = "Yapay zekanın en önemli özellikleri"
    generated_text = generate_text(model, tokenizer, prompt)

    print("Generated Text:", generated_text, sep="\n")

if __name__ == "__main__":
    main()