In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that the dataset
# file referenced by DATA_PATH (which lives under that mount point) can be opened.
from google.colab import drive
drive.mount('/content/drive')

import json
from collections import Counter

DATA_PATH = "/content/drive/MyDrive/dijalekti/texts_segmented.jsonl"

# Parallel lists: texts[i] is a text segment and dialects[i] is its dialect name.
texts = []
dialects = []

# The file is read as JSON Lines: one JSON object per line, from which the
# "text" and "dialect" fields are taken.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # A blank (or whitespace-only) line is not valid JSON and would make
        # json.loads raise, aborting the whole load; skip such lines instead.
        if not line.strip():
            continue
        obj = json.loads(line)
        # `or ""` turns a missing key or an explicit null into an empty string
        # so that .strip() is always called on a str.
        t = (obj.get("text") or "").strip()
        d = (obj.get("dialect") or "").strip()
        # Keep only records where both the text and the dialect are non-empty.
        if t and d:
            texts.append(t)
            dialects.append(d)

print("Вкупно примероци:", len(texts))

# Per-dialect sample counts; also reused later to detect under-represented dialects.
cnt = Counter(dialects)
print("Број по дијалект:")
for d, c in cnt.items():
    print(f"{d}: {c}")


Mounted at /content/drive
Вкупно примероци: 4975
Број по дијалект:
тетовски (долнополошки): 192
скопскоцрногорски: 49
кумановски: 771
кривопаланечки: 173
овчеполски: 7
кратовски: 93
скопски-велешки: 158
кичевско-поречки: 350
прилепско-битолски: 626
гостиварски (горнополошки): 94
галички: 21
дебарски: 40
вевчанско-радошки: 25
струшки: 519
охридски: 128
горнопреспански: 54
долнопреспански: 38
тиквешко-мариовски: 248
штипско-кочански: 228
малешевско-пирински: 397
гевгелиско-дојрански: 485
струмичко-радовишки: 182
дримколско-голобрдски: 97


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Dialects with fewer than MIN_SAMPLES examples are merged into one "other" label.
MIN_SAMPLES = 3

dialects_norm = ["other" if cnt[d] < MIN_SAMPLES else d for d in dialects]

cnt_norm = Counter(dialects_norm)
print("Број по дијалект (по нормализација):")
for d, c in cnt_norm.items():
    print(f"{d}: {c}")

# Map dialect names to integer class ids; le_dialect.classes_[i] is the name of id i.
le_dialect = LabelEncoder()
y_all = le_dialect.fit_transform(dialects_norm)

print("Класи:", le_dialect.classes_)
print("Број класи:", len(le_dialect.classes_))

# First split: 80% train / 20% holdout, stratified by dialect.
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

# Second split: the holdout is divided 70% validation / 30% test. It is
# stratified as well, so rare dialects are shared between validation and test
# in proportion instead of landing in one of them by chance.
# train_test_split refuses to stratify on a class that has a single member, so
# for the purpose of the stratification key only, such samples borrow the key
# of the most frequent holdout class. Their real labels in y_temp are untouched.
temp_counts = np.bincount(y_temp)
strat_key = np.array(y_temp)
strat_key[temp_counts[strat_key] < 2] = temp_counts.argmax()

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.3,
    random_state=42,
    stratify=strat_key
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))


Број по дијалект (по нормализација):
тетовски (долнополошки): 192
скопскоцрногорски: 49
кумановски: 771
кривопаланечки: 173
овчеполски: 7
кратовски: 93
скопски-велешки: 158
кичевско-поречки: 350
прилепско-битолски: 626
гостиварски (горнополошки): 94
галички: 21
дебарски: 40
вевчанско-радошки: 25
струшки: 519
охридски: 128
горнопреспански: 54
долнопреспански: 38
тиквешко-мариовски: 248
штипско-кочански: 228
малешевско-пирински: 397
гевгелиско-дојрански: 485
струмичко-радовишки: 182
дримколско-голобрдски: 97
Класи: ['вевчанско-радошки' 'галички' 'гевгелиско-дојрански' 'горнопреспански'
 'гостиварски (горнополошки)' 'дебарски' 'долнопреспански'
 'дримколско-голобрдски' 'кичевско-поречки' 'кратовски' 'кривопаланечки'
 'кумановски' 'малешевско-пирински' 'овчеполски' 'охридски'
 'прилепско-битолски' 'скопски-велешки' 'скопскоцрногорски'
 'струмичко-радовишки' 'струшки' 'тетовски (долнополошки)'
 'тиквешко-мариовски' 'штипско-кочански']
Број класи: 23
Train: 3980 Val: 696 Test: 299


In [None]:
!pip install -q transformers

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report, f1_score
import numpy as np

# --- Fine-tuning configuration ---
model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
max_length = 512   # token length handed to the datasets built from this config
batch_size = 8
num_epochs = 3
lr = 3e-5

# Seed the random number generators before any model or DataLoader is created.
# Without this the shuffled batch order and the newly initialised classifier
# head differ on every run, so the reported metrics cannot be reproduced.
# 42 matches the random_state used for the train/val/test splits.
# NOTE: some GPU kernels may still be non-deterministic; this removes the
# seed-dependent part of the variation, not necessarily all of it.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Ќе тренираме на:", device)

# One output unit per encoded dialect class.
num_labels = len(le_dialect.classes_)
print("Број класи:", num_labels)

tokenizer = AutoTokenizer.from_pretrained(model_name)

class TextDialectDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization happens lazily, one example per ``__getitem__`` call. The
    tokenizer is asked to truncate and pad to exactly ``max_length`` and to
    return PyTorch tensors.

    Args:
        texts: iterable of texts (materialised into a list).
        labels: sequence of integer labels (stored as a NumPy array).
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors=...)``
            whose result is indexable by "input_ids" and "attention_mask".
        max_length: target sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = list(texts)
        self.labels = np.array(labels)

    def __len__(self):
        """Number of examples, taken from the text list."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label tensor."""
        raw_text = str(self.texts[idx])
        class_id = int(self.labels[idx])

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading dimension of size 1 for the
        # single text; squeeze(0) drops it.
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(class_id, dtype=torch.long)
        return item

# Wrap each split in a TextDialectDataset that shares the tokenizer and the
# configured max_length.
train_dataset, val_dataset, test_dataset = [
    TextDialectDataset(split_texts, split_labels, tokenizer, max_length=max_length)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
]

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = [
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
]

for caption, split_dataset in (
    ("Train samples:", train_dataset),
    ("Val samples:", val_dataset),
    ("Test samples:", test_dataset),
):
    print(caption, len(split_dataset))


Ќе тренираме на: cuda
Број класи: 23
Train samples: 3980
Val samples: 696
Test samples: 299


In [None]:
# Load the pretrained checkpoint with a num_labels-way classification head.
# ignore_mismatched_sizes=True allows loading even though the checkpoint's
# classifier has a different shape; the mismatched weights are newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

print("Model device:", next(model.parameters()).device)

optimizer = AdamW(model.parameters(), lr=lr)

# The scheduler is stepped once per training batch, so the schedule length is
# batches-per-epoch times epochs; the first 10% of steps are warmup.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * num_epochs
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Sanity check: batches come out of the DataLoader before any device transfer.
b = next(iter(train_loader))
print("Batch input_ids device (before to(device)):", b["input_ids"].device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at amberoad/bert-multilingual-passage-reranking-msmarco and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([23, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([23]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model device: cuda:0
Batch input_ids device (before to(device)): cpu


In [None]:
def move_batch_to_device(batch, device):
    """Return a new dict with the batch's model inputs and labels on ``device``.

    Only the "input_ids", "attention_mask" and "labels" entries are carried
    over (any other key in ``batch`` is dropped); each is transferred with
    ``non_blocking=True``. A missing key raises ``KeyError``.
    """
    wanted_fields = ("input_ids", "attention_mask", "labels")
    return {
        field: batch[field].to(device, non_blocking=True)
        for field in wanted_fields
    }

def evaluate(model, data_loader):
    """Run ``model`` over ``data_loader`` without gradients and score it.

    Puts the model in eval mode (it is left in eval mode afterwards). Batches
    are moved to the module-level ``device`` before the forward pass.

    Returns:
        A 5-tuple ``(avg_loss, accuracy, macro_f1, labels, preds)`` where
        ``avg_loss`` is the per-example mean of the model's loss, and
        ``labels`` / ``preds`` are NumPy arrays of true and predicted class
        ids. When the loader yields nothing, accuracy and F1 are 0.0.
    """
    model.eval()
    gold, predicted = [], []
    loss_sum = 0.0

    with torch.no_grad():
        for raw_batch in data_loader:
            on_device = move_batch_to_device(raw_batch, device)
            result = model(**on_device)

            # Weight the batch loss by the batch size so the final division
            # yields a per-example average even if the last batch is smaller.
            loss_sum += result.loss.item() * on_device["labels"].size(0)

            predicted.extend(result.logits.argmax(dim=-1).detach().cpu().numpy())
            gold.extend(on_device["labels"].detach().cpu().numpy())

    n_seen = len(gold)
    mean_loss = loss_sum / max(1, n_seen)
    gold_arr, pred_arr = np.array(gold), np.array(predicted)

    if n_seen == 0:
        return mean_loss, 0.0, 0.0, gold_arr, pred_arr

    accuracy = (pred_arr == gold_arr).mean()
    macro_f1 = f1_score(gold, predicted, average="macro", zero_division=0)
    return mean_loss, accuracy, macro_f1, gold_arr, pred_arr


In [None]:
# Fine-tune for num_epochs and remember the weights of the epoch with the
# highest validation macro-F1; those weights are restored after the loop.
best_state = None
best_val_f1 = -1.0

for epoch in range(1, num_epochs + 1):
    model.train()
    total_train_loss = 0.0

    for batch in train_loader:
        batch = move_batch_to_device(batch, device)
        examples_in_batch = batch["labels"].size(0)

        optimizer.zero_grad(set_to_none=True)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # One optimizer update, then one learning-rate schedule step, per batch.
        optimizer.step()
        scheduler.step()

        # Accumulate an example-weighted sum so the epoch mean is per example.
        total_train_loss += loss.item() * examples_in_batch

    avg_train_loss = total_train_loss / len(train_dataset)
    val_loss, val_acc, val_f1 = evaluate(model, val_loader)[:3]

    epoch_summary = (
        f"Epoch {epoch:02d} | Train loss: {avg_train_loss:.4f} | "
        f"Val loss: {val_loss:.4f} | Val acc: {val_acc:.3f} | Val F1(macro): {val_f1:.3f}"
    )
    print(epoch_summary)

    improved = val_f1 > best_val_f1
    if improved:
        best_val_f1 = val_f1
        # Snapshot a detached CPU copy of every tensor in the state dict so
        # later training steps cannot modify the saved weights.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Roll the model back to the best validation checkpoint, if one was recorded.
if best_state is not None:
    model.load_state_dict(best_state)

# Score the current model on the held-out test split.
test_loss, test_acc, test_f1, y_true_test, y_pred_test = evaluate(model, test_loader)
print("=== TRANSFORMER TEXT – Test ===")
print(f"Test loss: {test_loss:.4f} | Test acc: {test_acc:.3f} | Test F1(macro): {test_f1:.3f}")

# Restrict the report to class ids that actually occur among the true test
# labels, and translate those ids back to dialect names for the row headers.
labels_sorted = sorted(np.unique(y_true_test))
present_names = [le_dialect.classes_[class_id] for class_id in labels_sorted]

print("\nClassification report (тест):")
report_text = classification_report(
    y_true_test,
    y_pred_test,
    labels=labels_sorted,
    target_names=present_names,
    zero_division=0,
)
print(report_text)


Epoch 01 | Train loss: 2.1390 | Val loss: 1.6468 | Val acc: 0.523 | Val F1(macro): 0.219
Epoch 02 | Train loss: 1.2764 | Val loss: 1.1463 | Val acc: 0.655 | Val F1(macro): 0.351
Epoch 03 | Train loss: 0.8632 | Val loss: 0.9597 | Val acc: 0.717 | Val F1(macro): 0.424
=== TRANSFORMER TEXT – Test ===
Test loss: 0.9826 | Test acc: 0.676 | Test F1(macro): 0.375

Classification report (тест):
                            precision    recall  f1-score   support

         вевчанско-радошки       0.00      0.00      0.00         1
                   галички       0.00      0.00      0.00         2
      гевгелиско-дојрански       0.83      0.90      0.86        21
           горнопреспански       0.00      0.00      0.00         3
гостиварски (горнополошки)       1.00      0.20      0.33         5
                  дебарски       0.00      0.00      0.00         2
           долнопреспански       0.00      0.00      0.00         5
     дримколско-голобрдски       0.10      0.29      0.15        