In [None]:
# Hugging Face ve diğer kütüphaneleri yükleme
!pip install transformers datasets evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import os
from google.colab import drive

# Mount Google Drive; the trained model and tokenizer are written there at the end
drive.mount('/content/drive')
output_dir = '/content/drive/My Drive/fine_tuned_model'  # Directory the model is saved to

# Load the dataset
dataset = load_dataset("batubayk/TR-News")

# Shuffle with a fixed seed, then keep the first 10% of the train and test splits
dataset = dataset.shuffle(seed=42)
small_train_dataset = dataset["train"].select(range(int(len(dataset["train"]) * 0.1)))
small_test_dataset = dataset["test"].select(range(int(len(dataset["test"]) * 0.1)))

# Map topic names to integer ids.
# The unique topics are sorted before being enumerated: iterating a bare set of
# strings yields a different order in different interpreter sessions (string hash
# randomization), which would silently change which id belongs to which topic
# from one run to the next. key=str keeps the sort well-defined even if a topic
# value is not a string (e.g. a missing value).
label2id = {label: idx for idx, label in enumerate(sorted(set(dataset["train"]["topic"]), key=str))}
id2label = {idx: label for label, idx in label2id.items()}

def preprocess_labels(examples):
    """Add an integer ``labels`` column derived from the ``topic`` column.

    Looks up every entry of ``examples["topic"]`` in the module-level
    ``label2id`` mapping, stores the resulting list under
    ``examples["labels"]`` and returns the same ``examples`` object.
    A topic that is not a key of ``label2id`` raises ``KeyError``.
    """
    topic_ids = []
    for topic in examples["topic"]:
        topic_ids.append(label2id[topic])
    examples["labels"] = topic_ids
    return examples

# Add the integer "labels" column to both subsets
small_train_dataset = small_train_dataset.map(preprocess_labels, batched=True)
small_test_dataset = small_test_dataset.map(preprocess_labels, batched=True)

# Load the tokenizer and the model
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass the label mapping along with num_labels so it is stored in the model
# config and saved with the checkpoint; otherwise the saved model only knows
# generic LABEL_<i> names and the topic behind each index is lost.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize the ``content`` column of a batch of examples.

    Texts are truncated to at most 512 tokens. No padding is applied here:
    padding inside a batched ``map`` would pad every example to the longest
    text of its map batch and store those pad tokens in the dataset. The
    Trainer in this file receives the tokenizer, so batches are padded to
    their own longest sequence when they are collated.

    Returns the tokenizer output for ``examples["content"]``.
    """
    return tokenizer(examples["content"], truncation=True, max_length=512)

# Run the tokenizer over both subsets in batched mode (train first, then test)
tokenized_train, tokenized_test = (
    subset.map(preprocess_function, batched=True)
    for subset in (small_train_dataset, small_test_dataset)
)

# Metric used by the evaluation function
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    of each example is the index of its largest logit along the last axis;
    predictions and labels are handed to the module-level ``accuracy``
    metric, whose result is returned.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(references=gold, predictions=predicted)

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # Disable Wandb reporting
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best accuracy when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

# Build the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Write the model and the tokenizer to the Google Drive directory
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model ve tokenizer {output_dir} dizinine kaydedildi.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1537 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1052,1.219566,0.650618
2,1.0439,1.124614,0.69486
3,0.7271,1.129347,0.690306


Model ve tokenizer /content/drive/My Drive/fine_tuned_model dizinine kaydedildi.
