In [18]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report


In [19]:
# Load cleaned data
df = pd.read_csv("data/data_processed/train.csv")

# Map labels: 'real' -> 1, 'fake' -> 0
label_map = {"real": 1, "fake": 0}
df['label'] = df['label'].map(label_map)

# Remove rows with invalid or missing labels, convert to int
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# (Optional) Check class balance
print(df['label'].value_counts())


label
1    28
0    26
Name: count, dtype: int64


In [20]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)


In [21]:
# Load DeBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Define custom Dataset
class DisasterTweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create Dataset and DataLoader
train_dataset = DisasterTweetDataset(train_texts, train_labels, tokenizer)
val_dataset   = DisasterTweetDataset(val_texts, val_labels, tokenizer)
train_loader  = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader    = DataLoader(val_dataset, batch_size=16)




In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained DeBERTa for sequence classification (2 labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small", num_labels=2
)
model.to(device)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [23]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

# Linear learning rate scheduler (no warmup steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [24]:
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/3, Loss: 0.6862
Epoch 2/3, Loss: 0.6983
Epoch 3/3, Loss: 0.6743


In [25]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Classification report (0='fake', 1='real')
print(classification_report(all_labels, all_preds, target_names=['fake', 'real']))


              precision    recall  f1-score   support

        fake       0.00      0.00      0.00         6
        real       0.45      1.00      0.62         5

    accuracy                           0.45        11
   macro avg       0.23      0.50      0.31        11
weighted avg       0.21      0.45      0.28        11



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [27]:
def predict(text, model, tokenizer, device, max_length=128):
    """Predict the stance ('real' or 'fake') for a single text."""
    model.eval()
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        outputs = model(**encoding)
        pred = torch.argmax(outputs.logits, dim=1).item()
    return "real" if pred == 1 else "fake"

# Example usage:
tweet = "Major earthquake shakes Los Angeles, officials investigate the damage."
print("Tweet:", tweet)
print("Predicted stance:", predict(tweet, model, tokenizer, device))


Tweet: Major earthquake shakes Los Angeles, officials investigate the damage.
Predicted stance: real


In [29]:
# Save the fine-tuned DeBERTa model
model.save_pretrained("models/stance_model")
tokenizer.save_pretrained("models/stance_model")


('models/stance_model\\tokenizer_config.json',
 'models/stance_model\\special_tokens_map.json',
 'models/stance_model\\spm.model',
 'models/stance_model\\added_tokens.json',
 'models/stance_model\\tokenizer.json')