<a href="https://colab.research.google.com/github/ehsan74814/article/blob/main/distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets seqeval accelerate


In [4]:
#import libraries
import os
import random
import numpy as np

import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)

from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# =========================
# 1. خواندن فایل CoNLL
# =========================

conll_file = "/content/drive/MyDrive/mtsamples_med_ie_conll.txt"  # اگر اسمش فرق دارد، اینجا عوض کن

sentences_tokens = []
sentences_labels = []

current_tokens = []
current_labels = []

with open(conll_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # پایان جمله
            if current_tokens:
                sentences_tokens.append(current_tokens)
                sentences_labels.append(current_labels)
                current_tokens = []
                current_labels = []
        else:
            # هر خط: token \t label
            parts = line.split("\t")
            if len(parts) != 2:
                continue
            tok, lab = parts
            current_tokens.append(tok)
            current_labels.append(lab)

# اگر فایل با خط خالی تمام نشده
if current_tokens:
    sentences_tokens.append(current_tokens)
    sentences_labels.append(current_labels)

print(f"تعداد جملات خوانده شده: {len(sentences_tokens)}")


# =========================
# 2. ساخت لیست برچسب‌ها
# =========================

all_labels = sorted(list({lab for sent in sentences_labels for lab in sent}))
print("لیست برچسب‌ها:", all_labels)

label2id = {label: i for i, label in enumerate(all_labels)}
id2label = {i: label for label, i in label2id.items()}

print("label2id:", label2id)


# =========================
# 3. تقسیم train / val / test
# =========================

# برای reproducibility
random_seed = 42
random.seed(random_seed)

X_temp, X_test, y_temp, y_test = train_test_split(
    sentences_tokens, sentences_labels, test_size=0.1, random_state=random_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=random_seed
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))


# =========================
# 4. آماده‌سازی توکنایزر و مدل
# =========================

model_name = "distilbert-base-uncased"  # سبک و مناسب برای CPU / GPU سبک

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id
)


# =========================
# 5. ساخت Dataset کلاس
# =========================

class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, label2id, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx]
        labels = self.labels[idx]

        # استفاده از encoding با word_ids برای align کردن برچسب‌ها
        encoding = self.tokenizer(
            tokens,
            truncation=True,
            is_split_into_words=True,
            max_length=self.max_length,
            return_offsets_mapping=False,
            return_tensors="pt"
        )

        # word_ids برای این است که بدانیم هر subtoken مربوط به کدام کلمه است
        word_ids = encoding.word_ids(batch_index=0)

        aligned_labels = []
        previous_word_id = None

        for word_id in word_ids:
            if word_id is None:
                # توکن‌های خاص [CLS], [SEP], ...
                aligned_labels.append(-100)  # -100 باعث میشه در loss نادیده گرفته شود
            else:
                # برچسب کلمه
                label_str = labels[word_id]
                aligned_labels.append(self.label2id[label_str])

        # تبدیل به tensor
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = torch.tensor(aligned_labels, dtype=torch.long)

        return encoding


train_dataset = NERDataset(X_train, y_train, tokenizer, label2id)
val_dataset   = NERDataset(X_val,   y_val,   tokenizer, label2id)
test_dataset  = NERDataset(X_test,  y_test,  tokenizer, label2id)


# =========================
# 6. تنظیمات Trainer
# =========================

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(pred):
    """
    تابع محاسبه‌ی Precision, Recall, F1 با استفاده از seqeval
    """
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    true_preds = []

    for pred_ids, label_ids in zip(predictions, labels):
        cur_preds = []
        cur_labels = []
        for p, l in zip(pred_ids, label_ids):
            if l == -100:
                continue
            cur_preds.append(id2label[p])
            cur_labels.append(id2label[l])
        true_labels.append(cur_labels)
        true_preds.append(cur_preds)

    precision = precision_score(true_labels, true_preds)
    recall = recall_score(true_labels, true_preds)
    f1 = f1_score(true_labels, true_preds)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

training_args = TrainingArguments(
    output_dir="./med_ner_distilbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,   # اگر GPU قوی نداریم، همین بمونه
    per_device_eval_batch_size=8,
    num_train_epochs=3,              # می‌توانیم کم/زیادش کنیم
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

#================================

# =========================
# 7. آموزش مدل
# =========================

trainer.train()


# =========================
# 8. ارزیابی روی Test set
# =========================

print("ارزیابی روی داده‌ی تست...")

predictions, labels, _ = trainer.predict(test_dataset)
pred_ids = np.argmax(predictions, axis=-1)

true_labels = []
true_preds = []

for pred_seq, label_seq in zip(pred_ids, labels):
    cur_preds = []
    cur_labels = []
    for p, l in zip(pred_seq, label_seq):
        if l == -100:
            continue
        cur_preds.append(id2label[p])
        cur_labels.append(id2label[l])
    true_labels.append(cur_labels)
    true_preds.append(cur_preds)

print("Classification report:")
print(classification_report(true_labels, true_preds))

تعداد جملات خوانده شده: 68179
لیست برچسب‌ها: ['B-DOSAGE', 'B-DRUG', 'B-FREQ', 'B-ROUTE', 'I-DOSAGE', 'I-FREQ', 'I-ROUTE', 'O']
label2id: {'B-DOSAGE': 0, 'B-DRUG': 1, 'B-FREQ': 2, 'B-ROUTE': 3, 'I-DOSAGE': 4, 'I-FREQ': 5, 'I-ROUTE': 6, 'O': 7}
Train: 55224 Val: 6137 Test: 6818


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0011,0.000951,0.985213,0.96558,0.975297
2,0.0004,0.000558,0.981884,0.981884,0.981884
3,0.0,0.0003,0.992727,0.98913,0.990926


ارزیابی روی داده‌ی تست...


Classification report:
              precision    recall  f1-score   support

      DOSAGE       0.99      0.99      0.99       230
        DRUG       1.00      1.00      1.00        53
        FREQ       0.62      0.56      0.59        18
       ROUTE       1.00      1.00      1.00       354

   micro avg       0.99      0.98      0.99       655
   macro avg       0.90      0.89      0.89       655
weighted avg       0.99      0.98      0.98       655

