<a href="https://colab.research.google.com/github/damlakaynarca/Big-Data/blob/main/Untitled19_BERT_B%C4%B0G_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Gerekli kütüphanelerin yüklenmesi
!pip install transformers datasets evaluate scikit-learn

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os

# wandb izlemeyi kapatma
os.environ["WANDB_DISABLED"] = "true"

# 1. Veri Setini Yükleme
dataset = load_dataset("tweet_eval", "emotion")

# 2. Model ve Tokenizer Ayarlama
model_name = "bert-base-uncased"  # BERT modeli
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=dataset["train"].features["label"].num_classes)

# 3. Veri Setini Tokenize Etme
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Eğitim ve değerlendirme veri setlerini ayırma
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# 4. Eğitim Parametrelerini Ayarlama
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Değerlendirme her epoch sonunda yapılacak
    save_strategy="epoch",  # Modeli her epoch sonunda kaydeder
    logging_dir="./logs",
    per_device_train_batch_size=8,  # Batch boyutu
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Epoch sayısı
    learning_rate=5e-5,  # Learning rate
    logging_steps=10,  # Eğitim sırasında loglama sıklığı
    save_steps=500,  # Ara model kayıtları için
    save_total_limit=1,  # En fazla 1 model kaydı tut
    load_best_model_at_end=True,  # En iyi modeli yükle
)

# 5. Performans Değerlendirme Fonksiyonu
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# 6. Model Eğitimi
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# 7. Modeli Değerlendirme
eval_results = trainer.evaluate(tokenized_datasets["test"])
print("Test Sonuçları:", eval_results)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5539,0.682186,0.756684,0.747379
2,0.4395,0.741473,0.812834,0.807415
3,0.2414,0.996032,0.799465,0.799449


Test Sonuçları: {'eval_loss': 0.6084535121917725, 'eval_accuracy': 0.7959183673469388, 'eval_f1': 0.7913023437108453, 'eval_runtime': 308.1218, 'eval_samples_per_second': 4.612, 'eval_steps_per_second': 0.578, 'epoch': 3.0}
