In [None]:
# Mount Google Drive at /content/drive; later cells read the spreadsheets and
# read/write model checkpoints under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the train/test spreadsheets from the mounted Drive.
train_df = pd.read_excel("/content/drive/MyDrive/train_stance_data2.xlsx")
test_df = pd.read_excel("/content/drive/MyDrive/test_stance_data2.xlsx")

# 2. Map the Persian stance labels to integer class ids.
label_map = {"بله": 0, "خیر": 1, "نامربوط": 2}
for split_name, split_df in (("train", train_df), ("test", test_df)):
    split_df['label'] = split_df['برچسب نهایی'].map(label_map)
    # Series.map leaves NaN for every value that is not a key of label_map, which
    # silently turns the column into floats and only fails much later inside the
    # loss / metrics. Fail here instead, naming the offending raw values.
    unmapped_rows = split_df['label'].isna()
    if unmapped_rows.any():
        unknown_values = split_df.loc[unmapped_rows, 'برچسب نهایی'].unique().tolist()
        raise ValueError(
            f"{int(unmapped_rows.sum())} row(s) in the {split_name} split have labels "
            f"missing from label_map: {unknown_values}"
        )
    # No NaN remains, so the column can safely be stored as integers.
    split_df['label'] = split_df['label'].astype(int)

# 3. ترکیب پست و خبر به فرم مناسب
def create_input(row):
    """Join one row's post and news text into a single NLI-style input string.

    ``row`` is indexed by the column names 'پست' (used as the premise) and
    'خبر' (used as the hypothesis); both values are passed through ``str``.
    """
    premise_text = str(row['پست'])
    hypothesis_text = str(row['خبر'])
    return "premise: {} hypothesis: {}".format(premise_text, hypothesis_text)

# Build the single-string model input for every row of both splits
# (row-wise apply, so create_input receives one row at a time).
train_df['text'] = train_df.apply(create_input, axis=1)
test_df['text'] = test_df.apply(create_input, axis=1)


In [None]:
from torch.utils.data import Dataset

class StanceDataset(Dataset):
    """Map-style dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable, sized sequence of input strings.
        labels: Indexable, sized sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            It must return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch dimension of size 1.
        max_len: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # Drop the size-1 batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Force an integer class id: a float label (e.g. from a pandas column
            # that was upcast to float) would otherwise yield a float tensor that
            # the cross-entropy loss rejects. int() also raises on NaN.
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }


In [None]:
from transformers import AutoTokenizer
import torch

# Load the pretrained tokenizer that matches the "xlm-roberta-base" encoder.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Wrap both splits as plain Python lists; StanceDataset tokenizes each example
# inside __getitem__, so nothing is tokenized at construction time.
train_dataset = StanceDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer)
test_dataset = StanceDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

class ContrastiveStanceModel(nn.Module):
    """Pretrained encoder plus a linear classifier on the first-token embedding.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        hidden_size: Input width of the classifier. ``None`` (the default) reads
            it from the loaded encoder's ``config.hidden_size`` so that encoders
            of any width work; pass an int to override.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name="xlm-roberta-base", hidden_size=None, num_labels=3):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        if hidden_size is None:
            # Avoid a hard-coded 768 that only fits "base"-sized encoders.
            hidden_size = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and classify it.

        Returns:
            ``(logits, cls_output)`` where ``cls_output`` is the dropout-applied
            hidden state at sequence position 0 and ``logits`` is the classifier
            output computed from it.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 is the sequence-start token (the [CLS] equivalent).
        cls_output = self.dropout(outputs.last_hidden_state[:, 0, :])
        logits = self.classifier(cls_output)
        return logits, cls_output


In [None]:
def contrastive_loss(embeddings, labels, margin=0.5):
    """Pairwise cosine contrastive loss over all unordered pairs in the batch.

    Same-label pairs contribute ``(1 - cos)^2``; different-label pairs
    contribute ``max(cos - margin, 0)^2``. Each group is averaged separately
    (an empty group counts as 0) and the two means are summed.

    Args:
        embeddings: Batch of vectors, compared along dimension 2 after broadcasting.
        labels: Class id per batch element.
        margin: Cosine similarity above which different-label pairs are penalised.

    Returns:
        Scalar tensor holding the sum of the two mean terms.
    """
    pairwise_cos = torch.nn.functional.cosine_similarity(
        embeddings[:, None], embeddings[None], dim=2
    )

    # Strict upper triangle: every unordered pair once, no self-pairs.
    same_label = labels[None] == labels[:, None]
    same_pairs = torch.triu(same_label, diagonal=1)
    diff_pairs = torch.triu(~same_label, diagonal=1)

    def _mean_square_or_zero(values):
        # Mean of squares, or a zero scalar when there are no pairs of this kind.
        if values.numel() > 0:
            return values.pow(2).mean()
        return torch.tensor(0.0, device=embeddings.device)

    attract_term = _mean_square_or_zero((1 - pairwise_cos)[same_pairs])
    repel_term = _mean_square_or_zero(
        torch.clamp(pairwise_cos[diff_pairs] - margin, min=0)
    )
    return attract_term + repel_term


In [None]:
def compute_total_loss(logits, embeddings, labels, ce_weight=1.0, contrastive_weight=1.0):
    """Weighted sum of the cross-entropy and contrastive objectives.

    Args:
        logits: Classifier outputs scored against ``labels`` with cross-entropy.
        embeddings: Vectors handed to ``contrastive_loss`` together with ``labels``.
        labels: Target class ids.
        ce_weight: Multiplier for the cross-entropy term.
        contrastive_weight: Multiplier for the contrastive term.
    """
    classification_term = nn.functional.cross_entropy(logits, labels)
    representation_term = contrastive_loss(embeddings, labels)
    return ce_weight * classification_term + contrastive_weight * representation_term


In [None]:
# Install torch / torchvision / torchaudio from the CUDA 11.8 wheel index.
# NOTE(review): no versions are pinned, and torch is already imported by earlier
# cells, so a changed version presumably only takes effect after a runtime
# restart — confirm whether this cell is still needed.
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
# Sanity check: is a CUDA device visible to PyTorch in this runtime?
torch.cuda.is_available()


True

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the training cell further down builds a fresh model and optimizer
# under the same names, so this pair appears to be superseded there — confirm
# whether it is still needed.
model = ContrastiveStanceModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Shuffle only the training batches; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, macro_f1)``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled for the pass. Batches are moved to the notebook-level ``device``.
    """
    model.eval()
    predicted, gold = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits, _embeddings = model(ids, mask)
            class_probs = F.softmax(logits, dim=1)
            batch_pred = class_probs.argmax(dim=1)

            predicted += list(batch_pred.cpu().numpy())
            gold += list(targets.cpu().numpy())

    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average='macro')
    return accuracy, macro_f1


In [None]:
import os
from tqdm import tqdm
import torch

num_epochs = 11
model_save_path = "/content/drive/MyDrive/stancee_model.pth"    # best-F1 checkpoint
model_save_pathh = "/content/drive/MyDrive/stancee2_model.pth"  # latest-epoch checkpoint

# Build a fresh model and optimizer for this run.
model = ContrastiveStanceModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

best_f1 = 0.0

# Resume from the best checkpoint when one exists. Only the weights are restored;
# the optimizer starts from scratch.
if os.path.exists(model_save_path):
    # map_location keeps the load working when the checkpoint was written on a
    # different device than the one in use now.
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    print(f"📂 مدل از فایل {model_save_path} بارگذاری شد. آموزش ادامه دارد...")
    # Seed best_f1 with the resumed model's own score. Starting from 0.0 would let
    # the first epoch overwrite the best checkpoint even when it scores worse.
    _, best_f1 = evaluate(model, test_loader)

# NOTE(review): the "best" checkpoint is selected on test_loader, so test scores
# reported for it are optimistically biased; a held-out validation split would
# avoid that.

# Training loop.
for epoch in range(num_epochs):
    model.train()  # evaluate() leaves the model in eval mode, so re-enable every epoch
    total_loss = 0

    print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
    loop = tqdm(train_loader, desc="Training", leave=False)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits, embeddings = model(input_ids, attention_mask)
        loss = compute_total_loss(logits, embeddings, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    avg_loss = total_loss / len(train_loader)
    acc, f1 = evaluate(model, test_loader)
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), model_save_path)
        print(f"💾 مدل جدید ذخیره شد (Best F1: {f1:.4f})")
    print(f"✅ Epoch {epoch+1} Done | AvgLoss: {avg_loss:.4f} | 🎯 Acc: {acc:.4f} | 🧠 F1: {f1:.4f}")

    # Save the latest weights at the end of every epoch, in a file separate from
    # the best checkpoint, and log the path that was actually written.
    torch.save(model.state_dict(), model_save_pathh)
    print(f"💾 مدل ذخیره شد در {model_save_pathh}")


📂 مدل از فایل /content/drive/MyDrive/stancee_model.pth بارگذاری شد. آموزش ادامه دارد...

🔁 Epoch 1/11




✅ Epoch 1 Done | AvgLoss: 0.3049 | 🎯 Acc: 0.7176 | 🧠 F1: 0.5812
💾 مدل ذخیره شد در /content/drive/MyDrive/stancee_model.pth

🔁 Epoch 2/11




KeyboardInterrupt: 

In [None]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score, precision_score

# Recreate the tokenizer and model the same way as for training.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ContrastiveStanceModel()
# Load the weights onto the CPU first, then move the model to the active device.
# NOTE(review): no visible cell writes "stancee1_model.pth" — confirm where this
# checkpoint comes from.
model.load_state_dict(torch.load("/content/drive/MyDrive/stancee1_model.pth", map_location="cpu"))
model.to(device)
model.eval()

# Rebuild the test data from the spreadsheet (same steps as the loading cell).
test_df = pd.read_excel("/content/drive/MyDrive/test_stance_data2.xlsx")
label_map = {"بله": 0, "خیر": 1, "نامربوط": 2}
test_df['label'] = test_df['برچسب نهایی'].map(label_map)
test_df['text'] = test_df.apply(lambda row: f"premise: {str(row['پست'])} hypothesis: {str(row['خبر'])}", axis=1)

test_dataset = StanceDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# 📊 تابع گزارش نهایی
def detailed_report(model, dataloader, label_names=['بله', 'خیر', 'نامربوط']):
    """Print a per-class classification report, accuracy and macro precision.

    The model is put into eval mode and run without gradients over every batch
    of ``dataloader``; batches are moved to the notebook-level ``device``.
    ``label_names`` supplies the display names used in the report. Nothing is
    returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits, _embeddings = model(ids, mask)
            class_probs = torch.softmax(logits, dim=1)
            batch_pred = class_probs.argmax(dim=1)

            y_pred += list(batch_pred.cpu().numpy())
            y_true += list(gold.cpu().numpy())

    print("📋 Classification Report:\n")
    report_text = classification_report(y_true, y_pred, target_names=label_names, digits=4)
    print(report_text)
    accuracy = accuracy_score(y_true, y_pred)
    macro_precision = precision_score(y_true, y_pred, average='macro')
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ Precision (macro avg): {macro_precision:.4f}")

# اجرای ارزیابی
print("🧪 Loading model & running evaluation on test set...")
detailed_report(model, test_loader)


🧪 Loading model & running evaluation on test set...
📋 Classification Report:

              precision    recall  f1-score   support

         بله     0.8296    0.8679    0.8483      1817
         خیر     0.4806    0.4748    0.4777       575
     نامربوط     0.5429    0.2585    0.3502       147

    accuracy                         0.7436      2539
   macro avg     0.6177    0.5337    0.5587      2539
weighted avg     0.7339    0.7436    0.7355      2539

✅ Accuracy: 0.7436
✅ Precision (macro avg): 0.6177


In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

def detailed_report(model, dataloader, label_names=['بله', 'خیر', 'نامربوط']):
    """Print a classification report plus accuracy and macro precision/recall/F1.

    The model is put into eval mode and run without gradients over every batch
    of ``dataloader``; batches are moved to the notebook-level ``device``.
    ``label_names`` supplies the display names used in the report. Nothing is
    returned.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits, _embeddings = model(ids, mask)
            class_probs = torch.softmax(logits, dim=1)
            batch_pred = class_probs.argmax(dim=1)

            y_pred += list(batch_pred.cpu().numpy())
            y_true += list(gold.cpu().numpy())

    # All metrics are computed before anything is printed.
    summary_rows = (
        ("✅ Accuracy:", accuracy_score(y_true, y_pred)),
        ("✅ Precision (macro):", precision_score(y_true, y_pred, average='macro')),
        ("✅ Recall (macro):", recall_score(y_true, y_pred, average='macro')),
        ("✅ F1-score (macro):", f1_score(y_true, y_pred, average='macro')),
    )

    print("📋 Classification Report:\n")
    print(classification_report(y_true, y_pred, target_names=label_names, digits=4))

    for caption, value in summary_rows:
        print(caption, f"{value:.4f}")
# Report on whichever model and test_loader are currently bound in the notebook.
detailed_report(model, test_loader)


📋 Classification Report:

              precision    recall  f1-score   support

         بله     0.8296    0.8679    0.8483      1817
         خیر     0.4806    0.4748    0.4777       575
     نامربوط     0.5429    0.2585    0.3502       147

    accuracy                         0.7436      2539
   macro avg     0.6177    0.5337    0.5587      2539
weighted avg     0.7339    0.7436    0.7355      2539

✅ Accuracy: 0.7436
✅ Precision (macro): 0.6177
✅ Recall (macro): 0.5337
✅ F1-score (macro): 0.5587


In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score, precision_score

# Recreate the tokenizer and model the same way as for training.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ContrastiveStanceModel()
# "stancee2_model.pth" is the file the training cell overwrites at the end of
# every epoch (latest weights). Load onto the CPU first, then move to the device.
model.load_state_dict(torch.load("/content/drive/MyDrive/stancee2_model.pth", map_location="cpu"))
model.to(device)
model.eval()

# Rebuild the test data from the spreadsheet (same steps as the loading cell).
test_df = pd.read_excel("/content/drive/MyDrive/test_stance_data2.xlsx")
label_map = {"بله": 0, "خیر": 1, "نامربوط": 2}
test_df['label'] = test_df['برچسب نهایی'].map(label_map)
test_df['text'] = test_df.apply(lambda row: f"premise: {str(row['پست'])} hypothesis: {str(row['خبر'])}", axis=1)

test_dataset = StanceDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
def detailed_report(model, dataloader, label_names=['بله', 'خیر', 'نامربوط']):
    """Print a classification report plus accuracy and macro precision/recall/F1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and returning
            ``(logits, embeddings)``; it is switched to eval mode and left there.
        dataloader: Iterable of batches holding ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors. Inputs are moved to the notebook-level ``device``.
        label_names: Display names for class ids ``0 .. len(label_names) - 1``,
            in id order.

    Returns:
        None; all results are printed.
    """
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits, _ = model(input_ids, attention_mask)
            # softmax is monotonic, so argmax over the raw logits picks the same class.
            preds.extend(logits.argmax(dim=1).cpu().tolist())
            # Labels are only compared on the host, so they never need to visit the device.
            labels.extend(batch['label'].tolist())

    # Pin the class ids explicitly. Without `labels=`, classification_report raises
    # when a class is missing from both the gold labels and the predictions, because
    # the number of observed classes no longer matches len(label_names).
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    class_ids = list(range(len(label_names)))
    metric_kwargs = dict(labels=class_ids, zero_division=0)

    # Metrics
    acc = accuracy_score(labels, preds)
    prec_macro = precision_score(labels, preds, average='macro', **metric_kwargs)
    rec_macro = recall_score(labels, preds, average='macro', **metric_kwargs)
    f1_macro = f1_score(labels, preds, average='macro', **metric_kwargs)

    print("📋 Classification Report:\n")
    print(classification_report(labels, preds, target_names=label_names, digits=4, **metric_kwargs))

    print("✅ Accuracy:", f"{acc:.4f}")
    print("✅ Precision (macro):", f"{prec_macro:.4f}")
    print("✅ Recall (macro):", f"{rec_macro:.4f}")
    print("✅ F1-score (macro):", f"{f1_macro:.4f}")
# Report test metrics for the checkpoint loaded above in this cell.
detailed_report(model, test_loader)


📋 Classification Report:

              precision    recall  f1-score   support

         بله     0.8459    0.8035    0.8242      1817
         خیر     0.4390    0.5252    0.4782       575
     نامربوط     0.4800    0.4082    0.4412       147

    accuracy                         0.7176      2539
   macro avg     0.5883    0.5790    0.5812      2539
weighted avg     0.7325    0.7176    0.7236      2539

✅ Accuracy: 0.7176
✅ Precision (macro): 0.5883
✅ Recall (macro): 0.5790
✅ F1-score (macro): 0.5812


In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score, precision_score

# Recreate the tokenizer and model the same way as for training.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = ContrastiveStanceModel()
# "stancee_model.pth" is the file the training cell writes whenever the macro F1
# improves (best checkpoint). Load onto the CPU first, then move to the device.
model.load_state_dict(torch.load("/content/drive/MyDrive/stancee_model.pth", map_location="cpu"))
model.to(device)
model.eval()

# Rebuild the test data from the spreadsheet (same steps as the loading cell).
test_df = pd.read_excel("/content/drive/MyDrive/test_stance_data2.xlsx")
label_map = {"بله": 0, "خیر": 1, "نامربوط": 2}
test_df['label'] = test_df['برچسب نهایی'].map(label_map)
test_df['text'] = test_df.apply(lambda row: f"premise: {str(row['پست'])} hypothesis: {str(row['خبر'])}", axis=1)

test_dataset = StanceDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
def detailed_report(model, dataloader, label_names=['بله', 'خیر', 'نامربوط']):
    """Print a classification report plus accuracy and macro precision/recall/F1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and returning
            ``(logits, embeddings)``; it is switched to eval mode and left there.
        dataloader: Iterable of batches holding ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors. Inputs are moved to the notebook-level ``device``.
        label_names: Display names for class ids ``0 .. len(label_names) - 1``,
            in id order.

    Returns:
        None; all results are printed.
    """
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits, _ = model(input_ids, attention_mask)
            # softmax is monotonic, so argmax over the raw logits picks the same class.
            preds.extend(logits.argmax(dim=1).cpu().tolist())
            # Labels are only compared on the host, so they never need to visit the device.
            labels.extend(batch['label'].tolist())

    # Pin the class ids explicitly. Without `labels=`, classification_report raises
    # when a class is missing from both the gold labels and the predictions, because
    # the number of observed classes no longer matches len(label_names).
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    class_ids = list(range(len(label_names)))
    metric_kwargs = dict(labels=class_ids, zero_division=0)

    # Metrics
    acc = accuracy_score(labels, preds)
    prec_macro = precision_score(labels, preds, average='macro', **metric_kwargs)
    rec_macro = recall_score(labels, preds, average='macro', **metric_kwargs)
    f1_macro = f1_score(labels, preds, average='macro', **metric_kwargs)

    print("📋 Classification Report:\n")
    print(classification_report(labels, preds, target_names=label_names, digits=4, **metric_kwargs))

    print("✅ Accuracy:", f"{acc:.4f}")
    print("✅ Precision (macro):", f"{prec_macro:.4f}")
    print("✅ Recall (macro):", f"{rec_macro:.4f}")
    print("✅ F1-score (macro):", f"{f1_macro:.4f}")
# Report test metrics for the checkpoint loaded above in this cell.
detailed_report(model, test_loader)


📋 Classification Report:

              precision    recall  f1-score   support

         بله     0.8387    0.8156    0.8270      1817
         خیر     0.4404    0.5009    0.4687       575
     نامربوط     0.5424    0.4354    0.4830       147

    accuracy                         0.7223      2539
   macro avg     0.6071    0.5840    0.5929      2539
weighted avg     0.7313    0.7223    0.7259      2539

✅ Accuracy: 0.7223
✅ Precision (macro): 0.6071
✅ Recall (macro): 0.5840
✅ F1-score (macro): 0.5929
