<a href="https://colab.research.google.com/github/felixiho/LLMs/blob/main/preprocess_glue_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [34]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import inspect

def preprocess_glue_dataset(dataset_name, model_checkpoint="bert-base-uncased", max_length=128):
    """
    Preprocess any GLUE dataset for model training.

    The dataset is loaded, its column layout is detected with
    ``detect_glue_task_structure``, the text column(s) are tokenized, every
    column that is neither a tokenizer output nor the label is dropped, and the
    label column (when one exists) is renamed to "labels".

    Args:
        dataset_name (str): Name of the GLUE dataset to load
        model_checkpoint (str): Checkpoint of the model to use for tokenization
        max_length (int): Maximum sequence length for truncation

    Returns:
        tuple: ``(tokenized_dataset, data_collator, num_labels, task_info)``
            tokenized_dataset: The preprocessed dataset, formatted as torch tensors
            data_collator: A data collator with padding
            num_labels: Number of labels in the dataset
            task_info: Information about the task structure

    Raises:
        ValueError: If the dataset cannot be loaded (the underlying error is
            chained as ``__cause__``) or the detected task type is unknown.
    """
    # Load dataset
    try:
        ds = load_dataset(dataset_name)
    except ValueError:
        # Names already starting with "glue/" get no fallback: re-raise as is.
        if dataset_name.startswith("glue/"):
            raise
        # Otherwise retry with the "glue" builder, using the last path
        # component of the name as the configuration.
        try:
            ds = load_dataset("glue", dataset_name.split("/")[-1])
        except Exception as fallback_error:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts;
            # chaining keeps the real cause visible in the traceback.
            raise ValueError(f"Could not load dataset: {dataset_name}") from fallback_error

    # Print dataset structure info
    print(f"Dataset loaded: {dataset_name}")
    print(f"Available splits: {list(ds.keys())}")

    # Determine the task type and input structure
    task_info = detect_glue_task_structure(ds)
    label_key = task_info["label_key"]

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Define tokenizer function based on task structure
    def tokenize_function(examples):
        # Single sentence tasks (SST-2, CoLA)
        if task_info["type"] == "single_sentence":
            return tokenizer(
                examples[task_info["sentence_key"]],
                truncation=True,
                max_length=max_length
            )

        # Sentence pair tasks (MRPC, QQP, MNLI, etc.)
        if task_info["type"] == "sentence_pair":
            return tokenizer(
                examples[task_info["sentence1_key"]],
                examples[task_info["sentence2_key"]],
                truncation=True,
                max_length=max_length
            )

        raise ValueError(f"Unknown task type: {task_info['type']}")

    # Tokenize the dataset
    tokenized_ds = ds.map(tokenize_function, batched=True)

    # Create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Prepare every split for training
    model_inputs = list(tokenizer.model_input_names)
    for split in list(tokenized_ds.keys()):
        split_ds = tokenized_ds[split]

        # Keep only tokenizer outputs and the label column
        columns_to_remove = [
            col for col in split_ds.column_names
            if col not in model_inputs and col != label_key
        ]
        split_ds = split_ds.remove_columns(columns_to_remove)

        # Rename the label column to "labels" if needed. Skipped when no label
        # column was detected or when this split does not carry it.
        if (
            label_key is not None
            and label_key != "labels"
            and label_key in split_ds.column_names
        ):
            split_ds = split_ds.rename_column(label_key, "labels")

        # Only request columns that actually exist in this split, so formatting
        # does not fail on an absent tokenizer output or a missing label.
        format_columns = [
            col for col in model_inputs + ["labels"]
            if col in split_ds.column_names
        ]
        tokenized_ds[split] = split_ds.with_format("torch", columns=format_columns)

    return tokenized_ds, data_collator, task_info["num_labels"], task_info

def _is_text_feature(feature):
    """Return True when ``feature`` declares a string dtype.

    Features without a ``dtype`` attribute (e.g. nested features) are treated
    as non-text instead of raising ``AttributeError``.
    """
    return getattr(feature, "dtype", None) in ("string", "large_string")


def detect_glue_task_structure(dataset):
    """
    Detect the structure of a GLUE task.

    Args:
        dataset: A Hugging Face dataset (mapping of split name -> split, where
            each split exposes ``.features`` and column access by name)

    Returns:
        info: A dictionary containing information about the task structure,
            with keys "type" ("single_sentence" or "sentence_pair"),
            "sentence_key", "sentence1_key", "sentence2_key", "label_key"
            and "num_labels"

    Raises:
        ValueError: If the dataset has no splits, or no usable text column
            can be found.
    """
    # Pick a split to inspect: train, then validation, then whatever exists.
    if "train" in dataset:
        split_name = "train"
    elif "validation" in dataset:
        split_name = "validation"
    else:
        available_splits = list(dataset.keys())
        if not available_splits:
            raise ValueError("Dataset has no splits to inspect")
        split_name = available_splits[0]
    features = dataset[split_name].features

    # Initialize task info
    task_info = {
        "type": None,
        "sentence_key": None,
        "sentence1_key": None,
        "sentence2_key": None,
        "label_key": None,
        "num_labels": None
    }

    # Detect label column and number of labels from the first feature that
    # exposes ``num_classes``.
    for key, feature in features.items():
        if hasattr(feature, "num_classes"):
            task_info["label_key"] = key
            task_info["num_labels"] = feature.num_classes
            break

    # If no such feature was found, look for a conventionally named label column
    if task_info["label_key"] is None:
        for candidate in ("label", "labels", "Label", "class"):
            if candidate not in features:
                continue
            task_info["label_key"] = candidate
            label_dtype = str(getattr(features[candidate], "dtype", ""))
            if label_dtype.startswith("float"):
                # Float labels indicate a regression target: a single output,
                # not one class per distinct value.
                task_info["num_labels"] = 1
            else:
                # Count the distinct label values present in the inspected split
                task_info["num_labels"] = len(set(dataset[split_name][candidate]))
            break

    # Default to binary classification if we still can't determine
    if task_info["num_labels"] is None:
        task_info["num_labels"] = 2

    # Collect the string-typed columns other than the label
    label_key = task_info["label_key"]
    text_columns = [
        key for key, feature in features.items()
        if key != label_key and _is_text_feature(feature)
    ]

    # Common GLUE-style names for the first / second element of a sentence pair
    sentence1_candidates = ["sentence1", "premise", "question1", "question", "text1"]
    sentence2_candidates = ["sentence2", "hypothesis", "question2", "answer", "text2"]
    named_first = next((name for name in sentence1_candidates if name in text_columns), None)
    named_second = next((name for name in sentence2_candidates if name in text_columns), None)

    # Detect single sentence tasks
    if len(text_columns) == 1:
        task_info["type"] = "single_sentence"
        task_info["sentence_key"] = text_columns[0]

    # Detect sentence pair tasks
    elif len(text_columns) == 2:
        task_info["type"] = "sentence_pair"
        first, second = named_first, named_second
        if first is None and second is None:
            # No recognizable names: fall back to column order
            first, second = text_columns
        elif first is None:
            # Only the second slot was recognized: the other column is the
            # first, so the same column is never used for both slots.
            first = next(col for col in text_columns if col != second)
        elif second is None:
            second = next(col for col in text_columns if col != first)
        task_info["sentence1_key"] = first
        task_info["sentence2_key"] = second

    # More than two text columns, but a recognizable pair is among them
    # (e.g. the dataset carries an additional auxiliary string column)
    elif named_first is not None and named_second is not None:
        task_info["type"] = "sentence_pair"
        task_info["sentence1_key"] = named_first
        task_info["sentence2_key"] = named_second

    # Handle unknown cases
    else:
        # Default to the first column that is named like a sentence or is text
        for key, feature in features.items():
            if key == label_key:
                continue
            if "sentence" in key.lower() or _is_text_feature(feature):
                task_info["type"] = "single_sentence"
                task_info["sentence_key"] = key
                break

        # If still no task type, raise error
        if task_info["type"] is None:
            raise ValueError(f"Could not determine task type from columns: {list(features.keys())}")

    return task_info

# Example usage:
if __name__ == "__main__":
    # ---- SST-2: sentiment analysis, a single-sentence task ----
    dataset_name = "gimmaru/glue-sst2"
    tokenized_ds, data_collator, num_labels, task_info = preprocess_glue_dataset(dataset_name)

    # Summary of what was produced
    print(f"\nPreprocessed {dataset_name}")
    print(f"Number of labels: {num_labels}")
    print(f"Task type: {task_info['type']}")
    print(f"Task structure: {task_info}")
    for split in tokenized_ds:
        print(f"Split {split}: {len(tokenized_ds[split])} examples")

    # Peek at the first two validation examples
    samples = tokenized_ds["validation"][:2]
    samples_dict = dict(samples.items())
    print("\nSample data structure:")
    for k, v in samples_dict.items():
        print(f"{k}: {v}")

    # Re-assemble the two examples as per-example dicts, keeping only the
    # fields the collator should pad, then collate them into one batch.
    collator_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
    batch_items = [
        {k: v[i] for k, v in samples_dict.items() if k in collator_keys}
        for i in range(2)
    ]
    batch = data_collator(batch_items)

    print("\nBatch shapes:")
    for k, v in batch.items():
        print(f"{k}: {v.shape}")

    # ---- MRPC: paraphrase detection, a sentence-pair task ----
    dataset_name = "SetFit/mrpc"
    tokenized_ds, data_collator, num_labels, task_info = preprocess_glue_dataset(dataset_name)

    print(f"\nPreprocessed {dataset_name}")
    print(f"Number of labels: {num_labels}")
    for split in tokenized_ds:
        print(f"Split {split}: {len(tokenized_ds[split])} examples")

Dataset loaded: gimmaru/glue-sst2
Available splits: ['validation']

Preprocessed gimmaru/glue-sst2
Number of labels: 2
Task type: single_sentence
Task structure: {'type': 'single_sentence', 'sentence_key': 'sentence', 'sentence1_key': None, 'sentence2_key': None, 'label_key': 'label', 'num_labels': 2}
Split validation: 872 examples

Sample data structure:
labels: tensor([1, 1])
input_ids: [tensor([  101,  2009,  4152,  3031,  1996,  3898,  2074,  2055,  2004,  2172,
         1997,  1996, 20674,  2004,  2028,  2071, 16286,  5987,  1010,  1998,
         2003, 25540, 25725,  2075,  1998,  3048,  1999,  2049,  2219,  2157,
         1012,   102]), tensor([  101,  2026,  2502,  6638,  3306,  5030,  3594, 22807,  1999,  1037,
        26380, 12586,  1997,  4086,  7472,  1998,  8295,  2135,  9841,  2098,
         2041,  8562,  1012,   102])]
token_type_ids: [tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0

Repo card metadata block was not found. Setting CardData to empty.


Dataset loaded: SetFit/mrpc
Available splits: ['train', 'validation', 'test']


Map:   0%|          | 0/408 [00:00<?, ? examples/s]


Preprocessed SetFit/mrpc
Number of labels: 2
Split train: 3668 examples
Split validation: 408 examples
Split test: 1725 examples
