## Loading the Multi-Modal Framing Dataset


In [1]:
import ast
import os
import json
import numpy as np
import pandas as pd
import torch
import random
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, average_precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
raw_ds = load_dataset("copenlu/mm-framing")

In [3]:
raw_ds.keys()

dict_keys(['full', 'valid_framing_subset'])

## According to the paper, the valid_framing_subset was created by applying additional filtering on the full dataset in order to focus on the articles that are suitable for framing analysis. 
### Specifically:
- They removed articles for which the framing model predicted “None”
- They removed articles whose text length was below 100 words
- They removed articles whose topic was “sports” or “media”


In [4]:
framing_subset = raw_ds['valid_framing_subset']

### Column name descriptions:
- uuid - Unique ID for each article
- title - Title of the article
- date_publish - Publication date
- source_domain - Domain of the publisher
- url - Article URL
- political_leaning - Political leaning of the publisher
### Annotations
- text-topic - Article topic generated from article text
- text-topic-exp - Article topic explanation
- text-entity-name - Main entity in article text
- text-entity-sentiment - Sentiment towards main entity
- text-entity-sentiment-exp - Explanation of text sentiment
- text-generic-frame - Generic Frame used in Article text
- text-generic-frame-exp - Generic Frame in text explanation
- text-issue-frame - Issue Frame used in article text
- text-issue-frame-exp - Issue Frame explanation
- img-generic-frame - Generic Frame used in Article Image
- img-frame-exp - Generic Frame in image explanation
- img-entity-name - Main subject in Article Image
- img-entity-sentiment - Sentiment towards the subject in Article image
- img-entity-sentiment-exp - Explanation of image sentiment
- gpt-topic - Consolidated topic

In [5]:
framing_subset[0]

{'uuid': '000002bf-ddb3-4386-9149-55328ce1c651',
 'title': "Philippine military condemns Chinese coast guard's use of water cannon on its boat in disputed sea",
 'date_publish': '2023-08-06 00:10:52',
 'source_domain': 'www.washingtontimes.com',
 'url': 'https://www.washingtontimes.com/news/2023/aug/6/philippine-military-condemns-chinese-coast-guards-/?utm_source=RSS_Feed&utm_medium=RSS',
 'political_leaning': 'right_lean',
 'text-topic': 'South China Sea Dispute',
 'text-topic-exp': "The article discusses a confrontation between the Philippine military and a Chinese coast guard ship over a Philippine-occupied shoal in the South China Sea. The article mentions the involvement of several countries, including the United States, Australia, and Japan, expressing concern over the actions of the Chinese ship. The article also mentions the long-standing territorial conflicts in the South China Sea and the international rulings that invalidated China's territorial claims.",
 'text-entity-name'

In [6]:
MODEL_NAME = "microsoft/deberta-v3-base"
TEXT_COL = "text-generic-frame-exp"      
LABEL_COL = "text-generic-frame"         
MAX_LEN = 512

In [7]:
# Convert string to a list and ensure no NaN or trailing whitespace
def _clean(ds):
    labels = ds[LABEL_COL]
    if isinstance(labels, str):
        labels = ast.literal_eval(labels)
    ds[LABEL_COL] = labels if labels else []
    txt = ds.get(TEXT_COL) or ""
    ds["__text__"] = txt.strip()
    return ds

# filter to fraiming subset of data, clean, and drop anything with no text

framing_subset = raw_ds["valid_framing_subset"]
framing_subset = framing_subset.map(_clean)
framing_subset = framing_subset.filter(lambda e: len(e["__text__"]) > 0)
df = framing_subset.to_pandas()
df = df.drop_duplicates(subset="__text__")
framing_subset = Dataset.from_pandas(df, preserve_index=False)


# Train/val/test split
split_ds = framing_subset.train_test_split(test_size=0.2, seed=42)
temp_split = split_ds["test"].train_test_split(test_size=0.5, seed=42)
ds = DatasetDict({
    "train": split_ds["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
})


# Build the multi-label space of all possible frames
frames = set()
for row in ds["train"]:
    frames.update(row[LABEL_COL])
frames = sorted(frames)
mlb = MultiLabelBinarizer(classes=frames)
mlb.fit([[]])  
num_labels = len(frames)


# Convert labels to multi-hot vectors
def _binarize(ds):
    y = mlb.transform([ds[LABEL_COL]])[0].astype(np.float32)
    ds["labels"] = y
    return ds
ds = ds.map(_binarize)

# Tokenize
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
def _tok(batch):
    return tok(batch["__text__"], truncation=True, padding="max_length", max_length=MAX_LEN)
ds = ds.map(_tok, batched=True, remove_columns=[c for c in ds["train"].column_names if c not in ["labels"]])

print(ds)
print("num_labels:", num_labels)
print("frames[:10] =", frames[:10])

Map:   0%|          | 0/153991 [00:00<?, ? examples/s]

Filter:   0%|          | 0/153991 [00:00<?, ? examples/s]

Map:   0%|          | 0/122566 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]



Map:   0%|          | 0/122566 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

Map:   0%|          | 0/15321 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 122566
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15321
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15321
    })
})
num_labels: 14
frames[:10] = ['cap&res', 'crime', 'culture', 'economic', 'fairness', 'health', 'legality', 'morality', 'policy', 'political']


In [8]:
# Eval metrics
best_thresh = 0.5  
epochs = 1

def _compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))          
    preds = (probs >= best_thresh).astype(int)
    out = {}
    out["f1_macro"] = f1_score(labels, preds, average="macro", zero_division=0)
    out["f1_micro"] = f1_score(labels, preds, average="micro", zero_division=0)
    try:
        out["avg_precision_macro"] = average_precision_score(labels, probs, average="macro")
    except Exception:
        pass
    return out

# Build model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    )


# Define training arguments and trianing object
# fp16 and per_device are controling GPU may need to tweak for diff computer
args = TrainingArguments(
    output_dir="mmf_deberta_v3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    eval_strategy="epoch",         
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(), 
    report_to=[], 
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tok,
    compute_metrics=_compute_metrics,
)


train_result = trainer.train()
print(train_result)
print("Training done.")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Avg Precision Macro
1,0.0268,0.024442,0.984846,0.986489,0.996393


TrainOutput(global_step=7661, training_loss=0.060091103372771786, metrics={'train_runtime': 1763.9242, 'train_samples_per_second': 69.485, 'train_steps_per_second': 4.343, 'total_flos': 3.225252250215629e+16, 'train_loss': 0.060091103372771786, 'epoch': 1.0})
Training done.


In [9]:
# Tune decision threshold on the validation set
val_out = trainer.predict(ds["validation"])
val_logits, val_labels = val_out.predictions, val_out.label_ids
val_probs = 1 / (1 + np.exp(-val_logits))

def tune_threshold(probs, labels):
    candidates = np.linspace(0.1, 0.9, 17)
    best, best_f1 = 0.5, -1
    for t in candidates:
        preds = (probs >= t).astype(int)
        f1 = f1_score(labels, preds, average="macro", zero_division=0)
        if f1 > best_f1:
            best, best_f1 = t, f1
    return best, best_f1

best_thresh, best_f1 = tune_threshold(val_probs, val_labels)
print(f"Best threshold : {best_thresh:.2f} (macro-F1={best_f1:.3f})")

Best threshold: 0.60 (macro-F1=0.985)


In [10]:
save_dir = "mmf_deberta_v3/best"
os.makedirs(save_dir, exist_ok=True)

# Save model/tokenizer
trainer.save_model(save_dir)
tok.save_pretrained(save_dir)

# Save frames and threshold
with open(os.path.join(save_dir, "frames.json"), "w") as f:
    json.dump(frames, f, indent=2)
with open(os.path.join(save_dir, "threshold.json"), "w") as f:
    json.dump({"global": float(best_thresh)}, f, indent=2)

print("Saved to", save_dir)

Saved to mmf_deberta_v3/best


In [8]:
raw_ds = load_dataset("copenlu/mm-framing")

framing_subset = raw_ds["valid_framing_subset"]
framing_subset = framing_subset.map(_clean)
framing_subset = framing_subset.filter(lambda e: len(e["__text__"]) > 0)
df = framing_subset.to_pandas()
df = df.drop_duplicates(subset="__text__")

framing_subset = Dataset.from_pandas(df, preserve_index=False)

split_ds = framing_subset.train_test_split(test_size=0.2, seed=42)
temp_split = split_ds["test"].train_test_split(test_size=0.5, seed=42)

ds_raw = DatasetDict({
    "train": split_ds["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
})


def exact_overlap(name_a, texts_a, name_b, texts_b):
    set_a = set(texts_a)
    set_b = set(texts_b)
    exact_overlap = set_a.intersection(set_b)
    print(f"[{name_a}-{name_b}] Exact duplicates: {len(exact_overlap)}")

train_texts = [ex["__text__"] for ex in ds_raw["train"]]
val_texts   = [ex["__text__"] for ex in ds_raw["validation"]]
test_texts  = [ex["__text__"] for ex in ds_raw["test"]]


exact_overlap("TRAIN", train_texts, "VAL",  val_texts)
exact_overlap("TRAIN", train_texts, "TEST", test_texts)
exact_overlap("VAL",   val_texts,  "TEST",  test_texts)

[TRAIN-VAL] Exact duplicates: 0
[TRAIN-TEST] Exact duplicates: 0
[VAL-TEST] Exact duplicates: 0
