## Loading the Multi-Modal Framing Dataset


In [1]:
import ast
import os
import json
import numpy as np
import pandas as pd
import torch
import random
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, average_precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
raw_ds = load_dataset("copenlu/mm-framing")

In [3]:
raw_ds.keys()

dict_keys(['full', 'valid_framing_subset'])

## According to the paper, the valid_framing_subset was created by applying additional filtering on the full dataset in order to focus on the articles that are suitable for framing analysis. 
### Specifically:
- They removed articles for which the framing model predicted “None”
- They removed articles whose text length was below 100 words
- They removed articles whose topic was “sports” or “media”


In [4]:
framing_subset = raw_ds['valid_framing_subset']

### Column name descriptions:
- uuid - Unique ID for each article
- title - Title of the article
- date_publish - Publication date
- source_domain - Domain of the publisher
- url - Article URL
- political_leaning - Political leaning of the publisher
### Annotations
- text-topic - Article topic generated from article text
- text-topic-exp - Article topic explanation
- text-entity-name - Main entity in article text
- text-entity-sentiment - Sentiment towards main entity
- text-entity-sentiment-exp - Explanation of text sentiment
- text-generic-frame - Generic Frame used in Article text
- text-generic-frame-exp - Generic Frame in text explanation
- text-issue-frame - Issue Frame used in article text
- text-issue-frame-exp - Issue Frame explanation
- img-generic-frame - Generic Frame used in Article Image
- img-frame-exp - Generic Frame in image explanation
- img-entity-name - Main subject in Article Image
- img-entity-sentiment - Sentiment towards the subject in Article image
- img-entity-sentiment-exp - Explanation of image sentiment
- gpt-topic - Consolidated topic

In [5]:
MODEL_NAME = "microsoft/deberta-v3-base"
TEXT_COL = "text-generic-frame-exp"      
LABEL_COL = "text-generic-frame"         
MAX_LEN = 512

In [7]:
# Convert string to a list and ensure no NaN or trailing whitespace
def Clean(ds):
    """ Converts a string to a list"""
    labels = ds[LABEL_COL]
    if isinstance(labels, str):
        labels = ast.literal_eval(labels)
    ds[LABEL_COL] = labels 
    txt = ds.get(TEXT_COL) or ""
    ds["__text__"] = txt.strip()
    return ds

# filter to fraiming subset of data, clean, and drop anything with no text
framing_subset = raw_ds["valid_framing_subset"]
framing_subset = framing_subset.map(Clean)
framing_subset = framing_subset.filter(lambda e: len(e["__text__"]) > 0)
df = framing_subset.to_pandas()
df = df.drop_duplicates(subset="__text__")
framing_subset = Dataset.from_pandas(df, preserve_index=False)


# Train/val/test split
split_ds = framing_subset.train_test_split(test_size=0.2, seed=42)
temp_split = split_ds["test"].train_test_split(test_size=0.5, seed=42)
ds = DatasetDict({
    "train": split_ds["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
})


# Build the multi-label space of all possible frames
frames = set()
for row in ds["train"]:
    frames.update(row[LABEL_COL])
frames = sorted(frames)
mlb = MultiLabelBinarizer(classes=frames)
mlb.fit([[]])  
num_labels = len(frames)


def Binarize(ds):
    """ Convert labels to multi-hot vectors """
    ds["labels"] = mlb.transform([ds[LABEL_COL]])[0].astype(np.float32)
    return ds
ds = ds.map(Binarize)


tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def Tok(batch):
    """Tokenize's the text"""
    return tok(batch["__text__"], truncation=True, padding="max_length", max_length=MAX_LEN)
ds = ds.map(Tok, batched=True, remove_columns=[c for c in ds["train"].column_names if c not in ["labels"]])

print(ds)
print("num_labels:", num_labels)
print("frames =", frames)

Map:   0%|          | 0/153991 [00:00<?, ? examples/s]

Filter:   0%|          | 0/153991 [00:00<?, ? examples/s]

Map:   0%|          | 0/122566 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]



Map:   0%|          | 0/122566 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 122566
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15321
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15321
    })
})
num_labels: 14
frames = ['cap&res', 'crime', 'culture', 'economic', 'fairness', 'health', 'legality', 'morality', 'policy', 'political', 'public_op', 'quality_life', 'regulation', 'security']


In [8]:
# Data Leakage Test
train_texts = set(split_ds["train"]["__text__"])
val_texts   = set(temp_split["train"]["__text__"])
test_texts  = set(temp_split["test"]["__text__"])

print("train > val:", len(train_texts & val_texts))
print("train > test:", len(train_texts & test_texts))
print("val > test:", len(val_texts & test_texts))

train > val: 0
train > test: 0
val > test: 0


In [9]:
best_thresh = 0.5  
epochs = 1

def Compute_metrics(eval_pred):
    """Returns eval metrics for transformer"""
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))          
    preds = (probs >= best_thresh).astype(int)
    out = {}
    out["f1_macro"] = f1_score(labels, preds, average="macro", zero_division=0)
    out["f1_micro"] = f1_score(labels, preds, average="micro", zero_division=0)
    return out

# Build model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    )


# Define training arguments and trianing object
# fp16 and per_device are controling GPU may need to tweak for diff computer
args = TrainingArguments(
    output_dir="mmf_deberta_v3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    eval_strategy="epoch",         
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(), 
    report_to=[], 
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tok,
    compute_metrics=Compute_metrics,
)


train_result = trainer.train()
print(train_result)
print("Training done.")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,0.0271,0.024649,0.984769,0.986478


TrainOutput(global_step=7661, training_loss=0.0604492592105136, metrics={'train_runtime': 1761.6786, 'train_samples_per_second': 69.573, 'train_steps_per_second': 4.349, 'total_flos': 3.225252250215629e+16, 'train_loss': 0.0604492592105136, 'epoch': 1.0})
Training done.


In [10]:
# Tune threshold on validation set
val_out = trainer.predict(ds["validation"])
val_logits, val_labels = val_out.predictions, val_out.label_ids
val_probs = 1 / (1 + np.exp(-val_logits))

def tune_threshold(probs, labels):
    """ Tune decision threshold and return best threshold and F1 score"""
    log = []   
    candidates = np.linspace(0.1, 0.9, 17)
    best, best_f1 = 0.5, -1
    for t in candidates:
        preds = (probs >= t).astype(int)
        f1 = f1_score(labels, preds, average="macro", zero_division=0)
        print(f"Threshold={t:.2f} → F1={f1:.4f}")
        log.append({"threshold": t, "f1": f1})
        if f1 > best_f1:
            best, best_f1 = t, f1
    return best, best_f1, log

best_thresh, best_f1, log = tune_threshold(val_probs, val_labels)
print(f"Best threshold : {best_thresh:.2f} (F1={best_f1:.3f})")

Threshold=0.10 → F1=0.9764
Threshold=0.15 → F1=0.9796
Threshold=0.20 → F1=0.9815
Threshold=0.25 → F1=0.9825
Threshold=0.30 → F1=0.9831
Threshold=0.35 → F1=0.9837
Threshold=0.40 → F1=0.9843
Threshold=0.45 → F1=0.9845
Threshold=0.50 → F1=0.9848
Threshold=0.55 → F1=0.9848
Threshold=0.60 → F1=0.9848
Threshold=0.65 → F1=0.9848
Threshold=0.70 → F1=0.9847
Threshold=0.75 → F1=0.9844
Threshold=0.80 → F1=0.9840
Threshold=0.85 → F1=0.9834
Threshold=0.90 → F1=0.9820
Best threshold : 0.65 (F1=0.985)


In [16]:
# Final Eval on Test Set
test_out = trainer.predict(ds["test"])
test_logits, test_labels = test_out.predictions, test_out.label_ids
test_probs = 1 / (1 + np.exp(-test_logits))
test_preds = (test_probs >= best_thresh).astype(int)


test_f1_macro = f1_score(test_labels, test_preds, average="macro", zero_division=0)
test_f1_micro = f1_score(test_labels, test_preds, average="micro", zero_division=0)

print("Final Test Performance:")
print(f"  F1 Macro: {test_f1_macro:.4f}")
print(f"  F1 Micro: {test_f1_micro:.4f}")

Final Test Performance:
  F1 Macro: 0.9852
  F1 Micro: 0.9864


In [21]:
# Data Leakage Sanity Check training on 10% of training set to see if we have leakage or signal
train_10pct = ds["train"].train_test_split(test_size=0.9, seed=42)["train"]

model_small = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)


small_args = TrainingArguments(
    output_dir="mmf_deberta_v3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    eval_strategy="epoch",         
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(), 
    report_to=[], 
)

trainer_small = Trainer(
    model=model_small,
    args=small_args,
    train_dataset=train_10pct,
    eval_dataset=ds["validation"],
    tokenizer=tok,
    compute_metrics=Compute_metrics,
)


trainer_small.train()

# Tune on validation set
val_out_small = trainer_small.predict(ds["validation"])
val_logits_small, val_labels_small = val_out_small.predictions, val_out_small.label_ids
val_probs_small = 1 / (1 + np.exp(-val_logits_small))

best_thresh_small, best_f1_small, log_small = tune_threshold(val_probs_small, val_labels_small)
print(f"Best threshold (10% model): {best_thresh_small:.2f} (F1={best_f1_small:.3f})"


# Final Eval on test set
test_out_small = trainer_small.predict(ds["test"])
test_logits_small, test_labels_small = test_out_small.predictions, test_out_small.label_ids
test_probs_small = 1 / (1 + np.exp(-test_logits_small))
test_preds_small = (test_probs_small >= best_thresh_small).astype(int)

test_f1_macro_small = f1_score(test_labels_small, test_preds_small, average="macro", zero_division=0)
test_f1_micro_small = f1_score(test_labels_small, test_preds_small, average="micro", zero_division=0)



Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_small = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,0.204,0.17157,0.77335,0.855168


Threshold=0.10 → F1=0.6832
Threshold=0.15 → F1=0.7693
Threshold=0.20 → F1=0.8244
Threshold=0.25 → F1=0.8554
Threshold=0.30 → F1=0.8732
Threshold=0.35 → F1=0.8793
Threshold=0.40 → F1=0.8769
Threshold=0.45 → F1=0.8692
Threshold=0.50 → F1=0.8539
Threshold=0.55 → F1=0.8338
Threshold=0.60 → F1=0.8061
Threshold=0.65 → F1=0.7734
Threshold=0.70 → F1=0.7302
Threshold=0.75 → F1=0.6781
Threshold=0.80 → F1=0.6087
Threshold=0.85 → F1=0.5122
Threshold=0.90 → F1=0.3777
Best threshold (10% model): 0.35
Validation F1 (10% model): 0.8793


10% test F1 Macro: 0.8788
10% test F1 Micro: 0.9046


In [22]:
save_dir = "best"
os.makedirs(save_dir, exist_ok=True)

# Save model/tokenizer
trainer.save_model(save_dir)
tok.save_pretrained(save_dir)

with open(os.path.join(save_dir, "frames.json"), "w") as f:
    json.dump(frames, f, indent=2)
with open(os.path.join(save_dir, "threshold.json"), "w") as f:
    json.dump({"global": float(best_thresh)}, f, indent=2)
with open(os.path.join(save_dir, "threshold_log.json"), "w") as f:
    json.dump(log, f, indent=2)
with open(os.path.join(save_dir, "threshold_log_small.json"), "w") as f:
    json.dump(log_small, f, indent=2)

print("Saved to", save_dir)

Saved to best
