In [None]:
import inspect
import json
import os
import random
import warnings

import evaluate # Import the evaluate library
import numpy as np
import pandas as pd
from datasets import Dataset, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# SETTINGS
# --- data / model locations ---
CSV_PATH = "it_tickets_dataset_3000.csv"  # generated CSV
MODEL_NAME = "dbmdz/bert-base-turkish-cased"
OUTPUT_DIR = "./it_ticket_model"

# --- reproducibility ---
RANDOM_SEED = 42

# --- training hyperparameters ---
NUM_EPOCHS, BATCH_SIZE = 3, 8
LEARNING_RATE, WEIGHT_DECAY = 2e-5, 0.01
MAX_LENGTH = 128  # passed to the tokenizer as max_length


def set_seed(seed=RANDOM_SEED):
    """Seed Python's ``random``, NumPy and (when importable) PyTorch generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to RANDOM_SEED.

    PyTorch seeding stays best-effort: a missing torch install is skipped
    quietly, while any other failure is surfaced as a ``RuntimeWarning``
    instead of being silently discarded.
    """
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    except ImportError:
        # torch is treated as optional by this helper
        pass
    except Exception as exc:
        # Do not abort the run, but make the lost reproducibility visible.
        warnings.warn(f"PyTorch RNG could not be seeded: {exc!r}", RuntimeWarning)

set_seed()

# Load the CSV and label-encode the target
df = pd.read_csv(CSV_PATH)
# Required column names: 'text' and 'category'
assert "text" in df.columns and "category" in df.columns, "CSV 'text' ve 'category' kolonlarına sahip olmalı."

# Rows with a missing text or category cannot be tokenized / encoded, so drop them up front.
# The index is reset so the filtered frame has a clean 0..n-1 index again.
df = df.dropna(subset=["text", "category"]).reset_index(drop=True)
df["text"] = df["text"].astype(str)
# NOTE(review): CSV_PATH is commented as a generated dataset - if it contains repeated texts they can
# end up in both train and test; consider de-duplicating on 'text' before splitting.

# Label encode
le = LabelEncoder()
df["label"] = le.fit_transform(df["category"])
label2id = {label: idx for idx, label in enumerate(le.classes_)}
id2label = {v: k for k, v in label2id.items()}
num_labels = len(le.classes_)
print(f"Labels ({num_labels}):", le.classes_)

# Build the HuggingFace Dataset. Typing `label` as ClassLabel (names in encoder order, so the
# integer ids are unchanged) is what allows stratified splitting below.
dataset = Dataset.from_pandas(df[["text", "label"]], preserve_index=False)
dataset = dataset.cast_column("label", ClassLabel(names=[str(name) for name in le.classes_]))


def _split_by_label(ds, test_size):
    """Split ``ds`` stratified on ``label``; fall back to a plain random split if that is impossible.

    Args:
        ds: A ``datasets.Dataset`` whose ``label`` column is a ClassLabel.
        test_size: Fraction of rows placed in the ``test`` part.

    Returns:
        The DatasetDict produced by ``Dataset.train_test_split`` (keys ``train`` / ``test``).
    """
    try:
        return ds.train_test_split(test_size=test_size, seed=RANDOM_SEED, stratify_by_column="label")
    except ValueError as exc:
        # Raised e.g. when a class has too few rows to appear on both sides of the split.
        warnings.warn(f"Stratified split not possible ({exc}); using a random split instead.")
        return ds.train_test_split(test_size=test_size, seed=RANDOM_SEED)


# train / validation / test: 15% is held out, then halved into validation and test
dataset = _split_by_label(dataset, 0.15)
test_valid = _split_by_label(dataset["test"], 0.5)
raw_datasets = {
    "train": dataset["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]
}
print({k: len(v) for k,v in raw_datasets.items()})

# Tokenizer and model, both loaded from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Classification-head settings: number of classes plus the id <-> name mappings
classifier_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_config)

# Tokenization
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch.

    Sequences are truncated to MAX_LENGTH tokens and no padding is applied here.

    Args:
        examples: Mapping with a ``text`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    tokenizer_options = {"truncation": True, "padding": False, "max_length": MAX_LENGTH}
    return tokenizer(texts, **tokenizer_options)

# Run the tokenizer over every split in batched mode
tokenized_datasets = {
    split_name: split.map(preprocess_function, batched=True)
    for split_name, split in raw_datasets.items()
}

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)

# Metrics (loaded in the same order as before: accuracy, f1, precision, recall)
accuracy, f1, precision, recall = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys ``accuracy``, ``f1_macro``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # Arguments shared by every metric call
    pair = {"predictions": predicted, "references": gold}
    macro = {"average": "macro"}

    return {
        "accuracy": accuracy.compute(**pair)["accuracy"],
        "f1_macro": f1.compute(**pair, **macro)["f1"],
        # zero_division=0: score a class with no predicted/true samples as 0
        "precision": precision.compute(**pair, **macro, zero_division=0)["precision"],
        "recall": recall.compute(**pair, **macro, zero_division=0)["recall"],
    }

# TrainingArguments
import torch  # imported here only to probe for a CUDA device

# fp16 mixed precision needs a CUDA device; TrainingArguments rejects fp16=True without one.
# USE_FP16=0 still opts out explicitly, but fp16 is never requested on a CPU-only machine.
use_fp16 = os.environ.get("USE_FP16", "1") == "1" and torch.cuda.is_available()

# NOTE(review): report_to is left at the library default; in the recorded run this triggered an
# interactive wandb login prompt, which blocks an unattended "Run All" - consider report_to="none".
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",              # evaluate on the validation split after every epoch
    save_strategy="epoch",              # must match eval_strategy for load_best_model_at_end
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",   # key returned by compute_metrics
    logging_steps=50,
    fp16=use_fp16,
    seed=RANDOM_SEED,                   # keep the Trainer's own seeding in sync with set_seed()
    push_to_hub=False,
)

# Trainer
# Newer transformers releases take the tokenizer as `processing_class=` and deprecate the old
# `tokenizer=` keyword; older releases only know `tokenizer=`. Use whichever the installed
# Trainer signature accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Train
trainer.train()

# Evaluate on the held-out test split. The "test" prefix keeps these metrics apart from the
# per-epoch validation metrics, which are reported under the default "eval" prefix.
metrics = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print("Test metrics:", metrics)

# Save model and tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Save the label mappings next to the model (JSON turns id2label's integer keys into strings)
for filename, mapping in (("label2id.json", label2id), ("id2label.json", id2label)):
    with open(os.path.join(OUTPUT_DIR, filename), "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

print("Model ve tokenizer kaydedildi:", OUTPUT_DIR)

Labels (36): ['Antivirüs' 'Ağ' 'Bluetooth' 'Diğer' 'Donanım' 'Donanım Ağı'
 'Donanım Kurulumu' 'Dosya' 'Ekran' 'Email' 'Erişim' 'Güncelleme'
 'GüvenliGiriş' 'Güvenlik' 'Güç' 'Kamera' 'Klavye' 'Lisans' 'Mobil'
 'Mouse' 'OneDrive' 'Outlook' 'Performans' 'Proxy' 'Ses' 'Sunucu'
 'Tarayıcı' 'Teams' 'Toplantı' 'USB' 'VPN' 'Veritabanı' 'Yazıcı' 'Yazılım'
 'Yedekleme' 'Şifre']
{'train': 2550, 'validation': 225, 'test': 225}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2550 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkozgizemm[0m ([33mkozgizemm-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,2.0499,1.272035,0.888889,0.85636,0.918318,0.861442
2,0.5225,0.256866,0.982222,0.979865,0.986111,0.97963
3,0.2429,0.148402,0.977778,0.977,0.980952,0.978086




Test metrics: {'eval_loss': 0.2970873713493347, 'eval_accuracy': 0.9733333333333334, 'eval_f1_macro': 0.9642329892329893, 'eval_precision': 0.9668650793650794, 'eval_recall': 0.9755291005291006, 'eval_runtime': 12.2959, 'eval_samples_per_second': 18.299, 'eval_steps_per_second': 2.359, 'epoch': 3.0}
Model ve tokenizer kaydedildi: ./it_ticket_model


In [None]:
# Recursively archive the saved model directory into a single zip file under /content.
# NOTE(review): the recorded output shows checkpoint-*/ contents (including optimizer.pt) being
# added to the archive as well - confirm that shipping training checkpoints is intended.
!zip -r /content/it_ticket_model.zip /content/it_ticket_model

  adding: content/it_ticket_model/ (stored 0%)
  adding: content/it_ticket_model/model.safetensors (deflated 7%)
  adding: content/it_ticket_model/config.json (deflated 63%)
  adding: content/it_ticket_model/id2label.json (deflated 52%)
  adding: content/it_ticket_model/label2id.json (deflated 47%)
  adding: content/it_ticket_model/checkpoint-319/ (stored 0%)
  adding: content/it_ticket_model/checkpoint-319/model.safetensors (deflated 7%)
  adding: content/it_ticket_model/checkpoint-319/config.json (deflated 63%)
  adding: content/it_ticket_model/checkpoint-319/optimizer.pt (deflated 29%)
  adding: content/it_ticket_model/checkpoint-319/special_tokens_map.json (deflated 42%)
  adding: content/it_ticket_model/checkpoint-319/vocab.txt (deflated 53%)
  adding: content/it_ticket_model/checkpoint-319/tokenizer.json (deflated 70%)
  adding: content/it_ticket_model/checkpoint-319/scheduler.pt (deflated 61%)
  adding: content/it_ticket_model/checkpoint-319/tokenizer_config.json (deflated 75%)


In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package, i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

# Recursively copy the model directory into the top level of the mounted Drive.
!cp -r /content/it_ticket_model /content/drive/MyDrive/

Mounted at /content/drive


In [None]:
!pip install --upgrade transformers datasets accelerate evaluate



In [None]:
!pip install evaluate

