<a href="https://colab.research.google.com/github/ehsan74814/article/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import pandas as pd

random.seed(42)

# ================================
# 1) Entity vocabularies
# ================================
DRUGS = [
    "metformin", "ibuprofen", "aspirin", "albuterol", "penicillin",
    "lisinopril", "atorvastatin", "amoxicillin", "morphine", "prednisone",
]

SYMPTOMS = [
    "headache", "nausea", "vomiting", "stomach pain", "dizziness",
    "shortness of breath", "fatigue", "blurred vision", "rash", "swelling",
]

PROBLEMS = [
    "diabetes", "hypertension", "asthma", "obesity", "infection",
    "heart failure", "kidney disease", "migraine", "arthritis", "pneumonia",
]

TESTS = [
    "elevated CRP", "abnormal ECG", "high blood sugar", "low oxygen level",
    "elevated liver enzymes", "positive strep test", "abnormal X-ray",
]

ADEs = [
    "skin rash", "liver damage", "internal bleeding", "severe allergy",
    "anaphylaxis", "renal impairment", "giant hives",
]

SOCIAL = [
    "smoking", "alcohol consumption", "drug abuse", "vaping",
]

RELATIONS = [
    "treats", "causes", "adverse", "indicates", "neg", "interacts_with",
    "improves", "worsens"
]

# ================================
# 2) Sentence templates
# ================================
# Each entry is (sentence_1 template, sentence_2 template, relation_type).
# The first entity is wrapped in [s1] ... [e1] and the second in
# [s2] ... [e2]; either marker pair may sit in either sentence.
TEMPLATES = [
    # Drug – Symptom
    ("{drug} often helps relieve [s2] {symptom} [e2].",
     "The patient used [s1] {drug} [e1] yesterday.",
     "treats"),

    # Drug – ADE
    ("Patient developed [s2] {ade} [e2] after taking medication.",
     "He recently took [s1] {drug} [e1] for pain.",
     "adverse"),

    # Drug – Problem (worsens)
    ("Using [s1] {drug} [e1] may worsen [s2] {problem} [e2].",
     "Doctor noted the patient's chronic condition.",
     "worsens"),

    # Test – Problem
    ("The report showed [s1] {test} [e1] values.",
     "This finding may indicate [s2] {problem} [e2].",
     "indicates"),

    # Drug – Symptom (causes)
    ("After using [s1] {drug} [e1], patient reported [s2] {symptom} [e2].",
     "Side effects increased over time.",
     "causes"),

    # Drug – Problem (treats)
    ("The physician prescribed [s1] {drug} [e1] for managing condition.",
     "It helps control [s2] {problem} [e2].",
     "treats"),

    # Social – Problem (neg)
    ("Patient denies history of [s1] {social} [e1].",
     "Also denies any [s2] {problem} [e2] symptoms.",
     "neg"),

    # Drug – Test (interacts_with)
    ("Use of [s1] {drug} [e1] can affect [s2] {test} [e2] readings.",
     "Monitoring is required.",
     "interacts_with"),
]

# ================================
# 3) Dataset generation
# ================================
NUM_SAMPLES = 5000

# Placeholder name -> vocabulary it is filled from.
ENTITY_POOLS = {
    "drug": DRUGS,
    "symptom": SYMPTOMS,
    "problem": PROBLEMS,
    "test": TESTS,
    "ade": ADEs,
    "social": SOCIAL,
}

# Type label per marked placeholder; any other placeholder is "Other".
ENTITY_1_TYPES = {"drug": "Drug", "test": "Test"}
ENTITY_2_TYPES = {"symptom": "Symptom", "problem": "Problem", "ade": "ADE"}


def _marked_slot(sentence_templates, start_tag, end_tag):
    """Return the placeholder name wrapped by ``start_tag``/``end_tag``, or None."""
    for text in sentence_templates:
        for slot in ENTITY_POOLS:
            if f"{start_tag} {{{slot}}} {end_tag}" in text:
                return slot
    return None


# Entity types are a property of the template, not of the filled-in text.
# Searching the formatted sentences for words such as "drug" or "ade" fails
# because the placeholders are gone by then, and gives false hits on
# vocabulary items ("headache" contains "ade", "drug abuse" contains "drug").
TEMPLATE_ENTITY_TYPES = {
    template: (
        ENTITY_1_TYPES.get(_marked_slot(template[:2], "[s1]", "[e1]"), "Other"),
        ENTITY_2_TYPES.get(_marked_slot(template[:2], "[s2]", "[e2]"), "Other"),
    )
    for template in TEMPLATES
}

rows = []
for i in range(NUM_SAMPLES):
    template = random.choice(TEMPLATES)
    first_text, second_text, relation_type = template

    # Draw every filler once per row so both sentences talk about the same
    # entities (the first template mentions the drug in both sentences).
    fillers = {slot: random.choice(pool) for slot, pool in ENTITY_POOLS.items()}
    entity_type_1, entity_type_2 = TEMPLATE_ENTITY_TYPES[template]

    rows.append({
        "relation_type": relation_type,
        "sentence_1": first_text.format(**fillers),
        "sentence_2": second_text.format(**fillers),
        "entity_type_1": entity_type_1,
        "entity_type_2": entity_type_2,
        "entity_id_1": f"T{i*2+1}",
        "entity_id_2": f"T{i*2+2}",
        "file_id": f"file_{i:05d}"
    })

df = pd.DataFrame(rows)
df.to_csv("mtsamples_relations_5000.csv", index=False)

print(f" Synthetic dataset with {NUM_SAMPLES} samples created successfully!")
print(df.head())


In [None]:
import os
import random
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.activations import ACT2FN


# ==========================
#  Global settings
# ==========================
# One seed shared by the Python, NumPy and PyTorch random generators.
RANDOM_SEED = 13
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(RANDOM_SEED)
del _seed_rng


# Folder that collects every output (models + metrics).
OUTPUT_ROOT = "mtsamples_relation_experiments"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# CSV file holding the relation samples.
DATA_CSV_PATH = "mtsamples_relations_5000.csv"


# ==========================
#  1) Read the CSV and build train/dev/test
# ==========================
def load_and_split(csv_path: str, test_size: float = 0.2, dev_size: float = 0.1):
    """
    Read the relation CSV and split it into train/dev/test frames.

    The CSV must provide these columns:
    relation_type, sentence_1, sentence_2,
    entity_type_1, entity_type_2, entity_id_1, entity_id_2, file_id

    ``test_size`` is the share of all rows held out for test. ``dev_size`` is
    the share of the *remaining* rows held out for dev. Both splits are
    stratified on ``relation_type`` and seeded with ``RANDOM_SEED``.

    Returns ``(train_df, dev_df, test_df)``, each with a fresh 0..n-1 index.
    Raises ``ValueError`` naming the first required column that is missing.
    """
    df = pd.read_csv(csv_path)

    required_cols = (
        "relation_type",
        "sentence_1",
        "sentence_2",
        "entity_type_1",
        "entity_type_2",
        "entity_id_1",
        "entity_id_2",
        "file_id",
    )
    absent = next((col for col in required_cols if col not in df.columns), None)
    if absent is not None:
        raise ValueError(
            f"ستون {absent} داخل CSV پیدا نشد؛ "
            f"اول باید این ستون‌ها را طبق فرمت n2c2 بسازی."
        )

    # First carve out the test set, then take dev from what is left.
    remainder_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df["relation_type"],
        random_state=RANDOM_SEED,
    )
    train_df, dev_df = train_test_split(
        remainder_df,
        test_size=dev_size,
        stratify=remainder_df["relation_type"],
        random_state=RANDOM_SEED,
    )

    splits = (train_df, dev_df, test_df)
    return tuple(part.reset_index(drop=True) for part in splits)


def save_tsv(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as tab-separated text, without the index column."""
    tsv_options = {"sep": "\t", "index": False}
    df.to_csv(path, **tsv_options)


# ==========================
#  2) Dataset class for the Trainer
# ==========================
@dataclass
class RelationDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per lookup and attaches its label.

    Each item is a dict holding whatever the tokenizer returns (with the
    leading axis squeezed away) plus a ``"labels"`` entry.
    """

    # Input strings, one per sample.
    texts: List[str]
    # Integer class ids, aligned index-by-index with ``texts``.
    labels: List[int]
    # Callable invoked on a single text with truncation/padding options.
    tokenizer: AutoTokenizer
    # Length passed to the tokenizer as ``max_length``.
    max_len: int = 256

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus the label."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes a size-1 leading axis from every returned tensor
        # (presumably the batch axis added for a single text).
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


def build_dataset(df: pd.DataFrame, label2id: Dict[str, int], tokenizer, max_len: int = 256):
    """Turn a relation frame into a ``RelationDataset``.

    The two sentences of every row are joined with a single space, so any
    [s1]/[e1]/[s2]/[e2] markers already present in them are kept as-is.
    ``relation_type`` values are mapped to ids through ``label2id``; an
    unknown label raises ``KeyError``.
    """
    first, second = df["sentence_1"], df["sentence_2"]
    texts = (first + " " + second).tolist()

    labels = []
    for relation in df["relation_type"].tolist():
        labels.append(label2id[relation])

    return RelationDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len,
    )


# ==========================
#  3) Metrics
# ==========================
def compute_metrics(eval_pred):
    """Compute classification metrics for the Trainer.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of ``logits``.

    Returns a dict with accuracy, macro/micro F1 and macro precision/recall.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # zero_division=0 on every score, F1 included, so a class that is never
    # predicted counts as 0 without raising undefined-metric warnings
    # (precision and recall already did this; F1 did not).
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)
    micro_f1 = f1_score(labels, preds, average="micro", zero_division=0)
    macro_p = precision_score(labels, preds, average="macro", zero_division=0)
    macro_r = recall_score(labels, preds, average="macro", zero_division=0)

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
    }


# ==========================
#  4) Fast GELU definition for GatorTron
# ==========================
class _FastGELU(torch.nn.Module):
    """Sigmoid approximation of GELU: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        return x * torch.sigmoid(1.702 * x)


def register_fast_gelu():
    """
    Register the sigmoid-based fast GELU under ``ACT2FN["gelu_fast"]`` so
    that ``config.hidden_act = "gelu_fast"`` resolves to it.

    The activation is registered as an ``nn.Module`` rather than a bare
    function. Some transformers releases instantiate registry entries on
    lookup, and a bare function would then be called without its input.

    NOTE: this replaces any "gelu_fast" entry the library already ships, for
    the whole process. Calling it more than once is harmless.
    """
    ACT2FN["gelu_fast"] = _FastGELU
    if ACT2FN["gelu_fast"] is _FastGELU:
        # Plain-dict registry: lookups hand back the stored object untouched,
        # so store a ready-to-call instance instead of the class.
        ACT2FN["gelu_fast"] = _FastGELU()


# ==========================
#  5) The six models to compare
# ==========================
# Fields per entry:
#   experiment name, model type (used for logging only),
#   Hugging Face checkpoint, whether to enable fast GELU
MODELS = [
    # Modified GatorTron
    (
        "gatortron_fastgelu",
        "bert",
        "UFNLP/gatortron-base",
        True,
    ),
    # Original GatorTron
    (
        "gatortron_base",
        "bert",
        "UFNLP/gatortron-base",
        False,
    ),
    (
        "roberta_mimic",
        "roberta",
        "mimiciii_roberta_10e_128b",
        False,
    ),
    (
        "albert_mimic",
        "albert",
        "mimiciii_albert_10e_128b",
        False,
    ),
    (
        "roberta_base",
        "roberta",
        "roberta-base",
        False,
    ),
    (
        "albert_base",
        "albert",
        "albert-base-v2",
        False,
    ),
]


# ==========================
#  6) Main loop: train and evaluate every model
# ==========================
def _load_tokenizer_and_model(ckpt, use_fast, id2label, label2id):
    """Load the tokenizer and a sequence classifier for ``ckpt``."""
    if use_fast:
        register_fast_gelu()

    # The label maps go into the config. When an explicit config is handed
    # to from_pretrained, extra keyword arguments are forwarded to the model
    # constructor and not applied to the config.
    config = AutoConfig.from_pretrained(
        ckpt,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )
    if use_fast:
        # This is the one change applied to the modified GatorTron variant.
        config.hidden_act = "gelu_fast"

    tokenizer = AutoTokenizer.from_pretrained(ckpt, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(ckpt, config=config)
    return tokenizer, model


def run_all_models():
    """Fine-tune and evaluate every entry of ``MODELS`` on the relation data.

    Steps:
      1. Split ``DATA_CSV_PATH`` into train/dev/test and save the splits as
         TSV files under ``OUTPUT_ROOT``.
      2. For each model: train with dev-set evaluation every epoch, predict
         once on the test set, and write ``classification_report.txt`` into
         the model's own sub-folder.
      3. Collect the test metrics of all models in
         ``OUTPUT_ROOT/summary_metrics.csv``.
    """
    # --- 6.1 read the CSV and split it ---
    train_df, dev_df, test_df = load_and_split(DATA_CSV_PATH)

    # Label vocabulary, taken from the training split.
    label_list = sorted(train_df["relation_type"].unique().tolist())
    label2id = {lbl: i for i, lbl in enumerate(label_list)}
    id2label = {i: lbl for lbl, i in label2id.items()}

    # Keep TSV copies of the splits, as in the paper's setup.
    save_tsv(train_df, os.path.join(OUTPUT_ROOT, "train.tsv"))
    save_tsv(dev_df, os.path.join(OUTPUT_ROOT, "dev.tsv"))
    save_tsv(test_df, os.path.join(OUTPUT_ROOT, "test.tsv"))

    all_results = []

    # --- 6.2 one model at a time ---
    for exp_name, _model_type, ckpt, use_fast in MODELS:
        print(f"\n====== Training model: {exp_name}  ({ckpt}) ======")

        exp_dir = os.path.join(OUTPUT_ROOT, exp_name)
        os.makedirs(exp_dir, exist_ok=True)

        # 6.2.1 / 6.2.2 config (optionally with fast GELU), tokenizer, model
        tokenizer, model = _load_tokenizer_and_model(
            ckpt, use_fast, id2label, label2id
        )

        # 6.2.3 build the datasets
        train_ds = build_dataset(train_df, label2id, tokenizer)
        dev_ds = build_dataset(dev_df, label2id, tokenizer)
        test_ds = build_dataset(test_df, label2id, tokenizer)

        # 6.2.4 training settings (roughly following the README)
        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` — confirm against the
        # installed version.
        training_args = TrainingArguments(
            output_dir=exp_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="steps",
            logging_steps=50,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=3,
            learning_rate=1e-5,
            warmup_ratio=0.1,
            weight_decay=0.0,
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            greater_is_better=True,
            seed=RANDOM_SEED,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=dev_ds,
            compute_metrics=compute_metrics,
        )

        # 6.2.5 training
        trainer.train()

        # 6.2.6 test-set evaluation: a single predict() call supplies both
        # the metrics and the raw predictions, so the test set is only run
        # through the model once. The "eval" prefix keeps the metric names
        # (eval_accuracy, eval_macro_f1, ...) used in the summary file.
        test_output = trainer.predict(test_ds, metric_key_prefix="eval")
        eval_metrics = dict(test_output.metrics)
        eval_metrics["model"] = exp_name
        all_results.append(eval_metrics)

        # Full per-class report for this model.
        preds = np.argmax(test_output.predictions, axis=-1)
        y_true = test_df["relation_type"].tolist()
        y_pred = [id2label[int(p)] for p in preds]

        report = classification_report(
            y_true,
            y_pred,
            digits=4,
            zero_division=0,
        )
        report_path = os.path.join(exp_dir, "classification_report.txt")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(report)

        print(report)

    # 6.2.7 gather all metrics in one file
    res_df = pd.DataFrame(all_results)
    res_df.to_csv(os.path.join(OUTPUT_ROOT, "summary_metrics.csv"), index=False)
    print("\n✅ همه مدل‌ها تمام شدند؛ summary_metrics.csv ساخته شد.")

# ==========================
#  Run the whole pipeline when this file is executed directly
# ==========================
if __name__ == "__main__":
    run_all_models()
