In [5]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import random
import folium
import numpy as np

In [6]:
data = pd.read_csv('../results/CreateLabellingResult.csv')

In [7]:
X = data["content_no_rare_words"]  
y = data["final_label"]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_mapping

{'acil': 0, 'bilgilendirme': 1, 'destek': 2, 'çok acil': 3}

In [8]:
data['final_label'].value_counts()

final_label
destek           947
acil             397
bilgilendirme    396
çok acil         362
Name: count, dtype: int64

# Train Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train,y_train

(1237    ['ihtiyac', 'ol', 'afad', 'çadır', 'kur', 'boş...
 389     ['ohal', 'elbistan', 'sondakikadeprem', 'prayf...
 992     ['emr', 'mahalle', 'cadde', 'kahramanmaraş', '...
 227     ['önem', 'duyuru', 'hatay', 'deprem', 'açilyar...
 2049    ['kahramanmaraş', 'ilçe', 'mahalle', 'gün', 'ş...
                               ...                        
 117     ['köy', 'şehir', 'insan', 'enkaz', 'alt', 'yar...
 1301    ['elbistan', 'bina', 'sesle', 'gel', 'yaş', 'ö...
 1842    ['anne', 'gün', 'taşı', 'dakika', 'yaşa', 'abi...
 1069    ['üzerine', 'koy', 'fotoğraf', 'al', 'al', 'de...
 505     ['et', 'ara', 'devam', 'ed', 'insan', 'soğuk',...
 Name: content_no_rare_words, Length: 1681, dtype: object,
 array([1, 1, 0, ..., 2, 1, 2]))

In [10]:
print("Train sınıf oranları:", np.bincount(y_train) / len(y_train))
print("Test sınıf oranları:", np.bincount(y_test) / len(y_test))

Train sınıf oranları: [0.18857823 0.18857823 0.45032719 0.17251636]
Test sınıf oranları: [0.19002375 0.18764846 0.45130641 0.17102138]


In [11]:
counts = np.bincount(y_train)
for i, count in enumerate(counts):
    print(f"{i} değeri: {count} kez")

0 değeri: 317 kez
1 değeri: 317 kez
2 değeri: 757 kez
3 değeri: 290 kez


In [12]:
counts = np.bincount(y_test)
for i, count in enumerate(counts):
    print(f"{i} değeri: {count} kez")

0 değeri: 80 kez
1 değeri: 79 kez
2 değeri: 190 kez
3 değeri: 72 kez


In [None]:
model_list = [
    {"name": "berturk", "tokenizer_class": BertTokenizer, "model_class": BertForSequenceClassification, "pretrained_model": "dbmdz/bert-base-turkish-cased"},
    {"name": "roberta", "tokenizer_class": RobertaTokenizer, "model_class": RobertaForSequenceClassification, "pretrained_model": "xlm-roberta-base"},
    {"name": "electra", "tokenizer_class": ElectraTokenizer, "model_class": ElectraForSequenceClassification, "pretrained_model": "dbmdz/electra-small-turkish-cased-discriminator"}
]

In [13]:
def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [14]:
def train_and_evaluate(model_name, tokenizer_class, model_class, pretrained_model, X_train, y_train, X_test, y_test, num_labels=4, epochs=3, batch_size=16):
    print(f"\n Start Train: {model_name}")
    tokenizer = tokenizer_class.from_pretrained(pretrained_model)
    model = model_class.from_pretrained(pretrained_model, num_labels=num_labels)

    train_encodings = tokenize_data(X_train.tolist(), tokenizer)
    test_encodings = tokenize_data(X_test.tolist(), tokenizer)

    train_dataset = CustomDataset(train_encodings, y_train.tolist())
    test_dataset = CustomDataset(test_encodings, y_test.tolist())

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")

    model.eval()
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_preds += (predictions == batch['labels']).sum().item()
            total_preds += batch['labels'].size(0)

    accuracy = correct_preds / total_preds
    print(f"{model_name} Test Accuracy: {accuracy:.4f}")
    return accuracy

In [34]:
results = {}
for model_info in model_list:
    accuracy = train_and_evaluate(
        model_name=model_info["name"],
        tokenizer_class=model_info["tokenizer_class"],
        model_class=model_info["model_class"],
        pretrained_model=model_info["pretrained_model"],
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test
    )
    results[model_info["name"]] = accuracy


 Start Train: berturk


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 completed with loss: 1.0805914402008057
Epoch 2 completed with loss: 0.5718693733215332
Epoch 3 completed with loss: 0.5002769827842712
berturk Test Accuracy: 0.6936

 Start Train: bert-base


OSError: dbert-base-uncased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
best_model = max(results, key=results.get)
print(f"\nEn iyi model: {best_model} - Accuracy: {results[best_model]:.4f}")

# LLAMA 3

In [28]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("CerebrumTech/cere-llama-3-8b-tr")
model = AutoModelForCausalLM.from_pretrained("CerebrumTech/cere-llama-3-8b-tr")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|          | 0/17 [01:03<?, ?it/s]


KeyboardInterrupt: 

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "CerebrumTech/cere-llama-3.1-8B-tr"

In [17]:
accuracy = train_and_evaluate("llma", AutoTokenizer, AutoModelForSequenceClassification, model_name, X_train, y_train, X_test, y_test)


 Start Train: llma


Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/ca/2f/ca2f6abc702e02c8aba59a6769f4fa2d96dd3b44328d06f84c69f6c760391c4b/2b72f27d420c508b5e2fa426b67dc71038f9f25487c1c6ee92ee7e094e39de19?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00007.safetensors%3B+filename%3D%22model-00001-of-00007.safetensors%22%3B&Expires=1732625959&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjYyNTk1OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2NhLzJmL2NhMmY2YWJjNzAyZTAyYzhhYmE1OWE2NzY5ZjRmYTJkOTZkZDNiNDQzMjhkMDZmODRjNjlmNmM3NjAzOTFjNGIvMmI3MmYyN2Q0MjBjNTA4YjVlMmZhNDI2YjY3ZGM3MTAzOGY5ZjI1NDg3YzFjNmVlOTJlZTdlMDk0ZTM5ZGUxOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=uLUAoudWRwxl127iOqEPX20162bQ0zMpZzMkgEAtPFfY4U4gd5R7%7E2FpGY9nrq%7E9Wac5qUYyDsHfSS1YQY916PvlqJBPrWmfm%7EaQ2MDG849AuIRUZHTDT2xF96mx1%7EeyWlLxeSd%7EDH%7EeE-Gu%7Eb-ui7kYHSy8vPO

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'cdn-lfs-us-1.hf.co\', port=443): Max retries exceeded with url: /repos/ca/2f/ca2f6abc702e02c8aba59a6769f4fa2d96dd3b44328d06f84c69f6c760391c4b/9f728d09dd1c81dec3a98a6a781f236152d09841a239e02a2f3b23a0af0015c2?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00002-of-00007.safetensors%3B+filename%3D%22model-00002-of-00007.safetensors%22%3B&Expires=1732626489&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjYyNjQ4OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2NhLzJmL2NhMmY2YWJjNzAyZTAyYzhhYmE1OWE2NzY5ZjRmYTJkOTZkZDNiNDQzMjhkMDZmODRjNjlmNmM3NjAzOTFjNGIvOWY3MjhkMDlkZDFjODFkZWMzYTk4YTZhNzgxZjIzNjE1MmQwOTg0MWEyMzllMDJhMmYzYjIzYTBhZjAwMTVjMj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=vYqWHTNy8mmPxlWsn1bIogIPt7A2g--xw8QnRob32LixfEb8-q5aACzgsmtXq55vu-a-lQeM9DeeXezmbmuBJRYtteOnZZ-EQXJDJkE9mXuzaupFi3KKudMDlR0hFqS4TgMFZwSy6sdtk0Ki9dXCtNYbXHAUDYm1jzwVGFt1MNcXYacJvt90SPORE10u~2IN9rbAFl~uMUkH7pLUWDavvxHVuOCmM0o0pjSbgSchoLIJJiJY~-M6akshTwT6nVuPGWpTXxS7WIPV~YzKsIXT-CPSfttifFfHQ4VozfPzJox-V0dSOTpkZ-QawBnzksGIwU0Swup1Uv5WyCpB-iA5pQ__&Key-Pair-Id=K24J24Z295AEI9 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002D4A3B47290>: Failed to resolve \'cdn-lfs-us-1.hf.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: e2660f91-9cbd-4044-bda8-0f3ce7c4d93c)')

In [None]:
accuracy

# Bert

In [18]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=len(set(y)))
def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(X_train.tolist(), tokenizer)
test_encodings = tokenize_data(X_test.tolist(), tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, y_train.tolist())
test_dataset = CustomDataset(test_encodings, y_test.tolist())

In [20]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [21]:
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1 completed with loss: 0.9716095924377441
Epoch 2 completed with loss: 2.4824235439300537
Epoch 3 completed with loss: 0.24448460340499878


In [25]:
model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_preds += (predictions == batch['labels']).sum().item()
        total_preds += batch['labels'].size(0)

accuracy = correct_preds / total_preds
print(f"Test Accuracy: {accuracy:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6793


In [26]:
def predict_label(text):
    # Metni tokenization yap
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Modeli çalıştır
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Logits'ten tahmin edilen etiketin çıkartılması
    predicted_label = torch.argmax(logits, dim=1).item()

    return predicted_label

# Test etmek için örnek bir metin
input_text = "acil bölge deprem altında kaldım yardım edin"
predicted_class = predict_label(input_text)

predicted_label = {v: k for k, v in label_mapping.items()}[predicted_class]
print(predicted_label)

bilgilendirme


In [27]:
predicted_labels = []
for text in X_test:  
    predicted_class = predict_label(text) 
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    predicted_labels.append(predicted_label)

for input_text, predicted_label in zip(X_test, predicted_labels):
    print(f"Girdi: {input_text}\nTahmin Edilen Etiket: {predicted_label}\n")

KeyboardInterrupt: 