In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
import torch.nn.utils.prune as prune
import numpy as np
import evaluate
import glob

# Load the `evaluate` accuracy metric once at module level; compute_metrics_albert
# reads this global when scoring predictions.
accuracy = evaluate.load('accuracy')

def train_model_albert(model, tokenizer, tokenized_train, tokenized_valid, out_dir, lr, batch_size, epochs):
    """
    Run the initial training pass for ALBERT through the Hugging Face Trainer.

    Evaluation and checkpointing are both scheduled once per epoch, at most
    three checkpoints are kept, and `load_best_model_at_end` is enabled.

    Args:
        model (transformers.PreTrainedModel): ALBERT model to be trained (updated in place).
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer matching the model;
            also used to build the dynamic-padding collator.
        tokenized_train (Dataset): Tokenized training split.
        tokenized_valid (Dataset): Tokenized validation split used for per-epoch evaluation.
        out_dir (str): Directory that receives the checkpoints.
        lr (float): Learning rate.
        batch_size (int): Per-device batch size for both training and evaluation.
        epochs (int): Number of training epochs.

    Returns:
        Trainer: The Trainer that performed the training.
    """
    # Hyper-parameters collected in one mapping, then expanded into TrainingArguments.
    hyperparams = {
        "output_dir": out_dir,
        "learning_rate": lr,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": 0.01,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "save_total_limit": 3,
        "report_to": 'tensorboard',
        "fp16": False,
    }
    run_config = TrainingArguments(**hyperparams)

    # Pads each batch to its own longest sequence.
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    albert_trainer = Trainer(
        model=model,
        args=run_config,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=padding_collator,
        compute_metrics=compute_metrics_albert,
    )

    print("Starting model training...")
    albert_trainer.train()
    return albert_trainer

def apply_pruning_albert(model, amount, steps):
    """
    Prune the weights of every linear layer by L1 magnitude, repeated `steps` times.

    The pruning re-parametrisation is left attached to the layers, so each
    repeated call stacks on top of the previous mask.

    Args:
        model (transformers.PreTrainedModel): Model whose `torch.nn.Linear` layers are pruned in place.
        amount (float): Fraction of weights handed to `prune.l1_unstructured` in each round.
        steps (int): How many pruning rounds to run.

    Returns:
        None
    """
    for round_no in range(1, steps + 1):
        print(f"Pruning step {round_no}/{steps}")
        # Walk the module tree and prune only the linear layers.
        for layer in model.modules():
            if not isinstance(layer, torch.nn.Linear):
                continue
            prune.l1_unstructured(layer, name='weight', amount=amount)
        print(f"Pruning step {round_no} completed.")

def fine_tune_pruned_model_albert(model, tokenizer, tokenized_train, tokenized_valid, out_dir, lr, batch_size, epochs):
    """
    Continue training an already-pruned ALBERT model.

    Uses the same Trainer configuration as the initial training pass:
    per-epoch evaluation and checkpointing, at most three checkpoints kept,
    and `load_best_model_at_end` enabled.

    Args:
        model (transformers.PreTrainedModel): Pruned ALBERT model (updated in place).
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer matching the model.
        tokenized_train (Dataset): Tokenized training split.
        tokenized_valid (Dataset): Tokenized validation split.
        out_dir (str): Directory that receives the fine-tuning checkpoints.
        lr (float): Learning rate.
        batch_size (int): Per-device batch size for training and evaluation.
        epochs (int): Number of fine-tuning epochs.

    Returns:
        Trainer: The Trainer that performed the fine-tuning.
    """
    pruned_trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=out_dir,
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            save_total_limit=3,
            report_to='tensorboard',
            fp16=False,
        ),
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        # Dynamic padding: each batch is padded to its own longest sequence.
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics_albert,
    )

    print("Starting fine-tuning on pruned model...")
    pruned_trainer.train()
    return pruned_trainer

def compute_metrics_albert(eval_pred):
    """
    Score a batch of predictions with the module-level accuracy metric.

    Args:
        eval_pred (tuple): Pair of (model outputs, reference labels). The
            outputs are reduced to class ids with an argmax over axis 1.

    Returns:
        dict: Whatever `accuracy.compute` returns for those predictions.
    """
    scores, gold_labels = eval_pred
    # Highest-scoring class per row becomes the predicted label id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=gold_labels)

def evaluate_and_infer_albert(trainer, model, tokenizer, tokenized_test):
    """
    Evaluates the pruned ALBERT model on the test set and performs inference.

    The evaluation runs through `trainer.evaluate`, i.e. on whatever model the
    trainer holds; the `model`/`tokenizer` arguments are used only to build the
    text-classification pipeline for the files under `inference_data/`.

    Args:
        trainer (Trainer): Trainer object for the pruned ALBERT model.
        model (transformers.PreTrainedModel): The pruned and fine-tuned ALBERT model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for ALBERT.
        tokenized_test (Dataset): Tokenized test dataset.

    Returns:
        None
    """
    # Evaluate the model
    eval_result = trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning: {eval_result}")

    # Inference pipeline
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    for file_name in glob.glob('inference_data/*'):
        # Only the file access is reported as a read error.
        try:
            with open(file_name) as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {file_name}: {e}")
            continue

        # The label is taken from the text between the last "_" and ".txt" in the path.
        ground_truth = file_name.split("_")[-1].split(".txt")[0]

        try:
            # truncation=True keeps documents longer than the model's maximum
            # sequence length from failing inside the model.
            result = classify(content, truncation=True)
        except Exception as e:  # best effort: one bad file must not stop the loop
            print(f"Error classifying {file_name}: {e}")
            continue

        print(f'File: {file_name}, Prediction: {result}, Ground Truth: {ground_truth}')

def run_albert_pruning_pipeline(
    dataset_name="ccdv/arxiv-classification",
    model_name='albert-base-v2',
    out_dir='arxiv_albert',
    batch_size=32,
    num_procs=4,
    lr=0.00005,
    epochs=50,
    pruning_amount=0.1,
    pruning_steps=3,
):
    """
    Full pipeline for training, pruning, fine-tuning, and evaluating the ALBERT model.

    Phases: (1) train, (2) apply L1 pruning masks to the linear layers,
    (3) fine-tune with the masks attached, (4) bake the masks into the weights,
    save, reload and evaluate.

    Args:
        dataset_name (str): Name of the Hugging Face dataset to use.
        model_name (str): Pre-trained ALBERT model name.
        out_dir (str): Output directory for the initial training checkpoints.
            Fine-tuning checkpoints go to `f"{out_dir}_pruned_finetune"`.
        batch_size (int): Batch size for training and evaluation.
        num_procs (int): Number of processes for parallel tokenization.
        lr (float): Learning rate for training.
        epochs (int): Number of epochs for both training and fine-tuning.
        pruning_amount (float): Fraction of weights to prune in each pruning step.
        pruning_steps (int): Number of pruning steps to apply.

    Returns:
        None
    """
    # Load dataset splits
    train_dataset = load_dataset(dataset_name, split='train[:95%]')
    valid_dataset = load_dataset(dataset_name, split='validation[:100%]')
    test_dataset = load_dataset(dataset_name, split='test[:100%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY",
                4: "math.GR", 5: "cs.CE", 6: "cs.PL", 7: "cs.IT",
                8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Initialize tokenizer and model; the head size follows the label map.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
    )

    # Tokenize datasets
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_test = test_dataset.map(preprocess_function, batched=True, num_proc=num_procs)

    # Phase 1: Train the model
    trainer = train_model_albert(
        model=model,
        tokenizer=tokenizer,
        tokenized_train=tokenized_train,
        tokenized_valid=tokenized_valid,
        out_dir=out_dir,
        lr=lr,
        batch_size=batch_size,
        epochs=epochs
    )

    # Phase 2: Apply pruning
    apply_pruning_albert(model, amount=pruning_amount, steps=pruning_steps)

    # Phase 3: Fine-tune the pruned model.
    # A separate checkpoint directory is used because both runs produce
    # checkpoint-<step> folders with the same step numbers; sharing out_dir
    # would make the fine-tuning run overwrite the initial training checkpoints.
    trainer = fine_tune_pruned_model_albert(
        model=model,
        tokenizer=tokenizer,
        tokenized_train=tokenized_train,
        tokenized_valid=tokenized_valid,
        out_dir=f"{out_dir}_pruned_finetune",
        lr=lr,
        batch_size=batch_size,
        epochs=epochs
    )

    # Phase 4: Make the pruning permanent before saving.
    # While the re-parametrisation is attached, the state dict holds
    # `weight_orig` / `weight_mask` instead of `weight`; a model saved in that
    # state reloads with freshly initialised linear weights.
    for module in model.modules():
        if isinstance(module, torch.nn.Linear) and hasattr(module, 'weight_orig'):
            prune.remove(module, 'weight')

    # Save the pruned and fine-tuned model
    model.save_pretrained('pruned_albert_finetuned')
    tokenizer.save_pretrained('pruned_albert_finetuned')

    # Load pruned model for evaluation
    model = AutoModelForSequenceClassification.from_pretrained('pruned_albert_finetuned')
    tokenizer = AutoTokenizer.from_pretrained('pruned_albert_finetuned')

    # Final Evaluation and Inference
    evaluate_and_infer_albert(trainer, model, tokenizer, tokenized_test)

# Run the ALBERT pipeline (train -> prune -> fine-tune -> evaluate) with explicit arguments.
run_albert_pruning_pipeline(
    dataset_name="ccdv/arxiv-classification",
    model_name='albert-base-v2',
    out_dir='arxiv_albert',
    batch_size=32,
    num_procs=4,
    lr=0.00005,
    epochs=50,
    pruning_amount=0.1,
    pruning_steps=3
)


In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
import torch.nn.utils.prune as prune
import numpy as np
import evaluate
import glob

def dynamic_prune_model_distilbert(model, trainer, prune_amount=0.1, prune_steps=3):
    """
    Applies iterative magnitude pruning to the model's linear layers, training after each step.

    Each step prunes `prune_amount` of the weights that are still unpruned and
    then calls `trainer.train()` with the masks attached. The masks are baked
    into the weights only after the last step. Removing them inside the loop
    would make the next `l1_unstructured` call pick the already-zeroed weights
    again, so sparsity would never grow beyond `prune_amount`. With the masks
    kept, the final sparsity is `1 - (1 - prune_amount) ** prune_steps`.

    Args:
        model (transformers.PreTrainedModel): The DistilBERT model to prune.
        trainer (Trainer): The Trainer instance for managing the training process.
        prune_amount (float): Fraction of the remaining weights to prune in each step.
        prune_steps (int): Number of pruning iterations to perform.

    Returns:
        None: The model is pruned in place.
    """
    # Collect the target layers once; pruning does not add or remove modules.
    linear_layers = [m for m in model.modules() if isinstance(m, torch.nn.Linear)]

    for step in range(prune_steps):
        print(f"Pruning step {step + 1}/{prune_steps}")
        # Repeated calls on the same parameter stack masks, so each step
        # prunes a fraction of the weights that survived the earlier steps.
        for layer in linear_layers:
            prune.l1_unstructured(layer, name='weight', amount=prune_amount)
        # Fine-tune the model after pruning (masks stay active during training)
        trainer.train()

    # Remove the re-parametrisation and make the accumulated pruning permanent.
    for layer in linear_layers:
        if hasattr(layer, 'weight_orig'):
            prune.remove(layer, 'weight')

def train_distilbert_model(dataset_name, train_split, valid_split, model_name, 
                           batch_size, num_procs, lr, epochs, out_dir):
    """
    Load the data, build a DistilBERT classifier and train it with the Trainer API.

    Args:
        dataset_name (str): Name of the dataset to load.
        train_split (str): Split expression for the training data.
        valid_split (str): Split expression for the validation data.
        model_name (str): Name of the pretrained DistilBERT checkpoint.
        batch_size (int): Per-device batch size for training and evaluation.
        num_procs (int): Number of worker processes used for tokenization.
        lr (float): Learning rate.
        epochs (int): Number of training epochs.
        out_dir (str): Directory that receives the checkpoints.

    Returns:
        tuple: `(trainer, model, tokenizer)` - the Trainer that ran the
        training, the trained model, and the tokenizer used for it.
    """
    # Raw splits
    raw_train = load_dataset(dataset_name, split=train_split)
    raw_valid = load_dataset(dataset_name, split=valid_split)

    # Class id <-> arXiv category name
    id2label = dict(enumerate([
        "math.AC", "cs.CV", "cs.AI", "cs.SY",
        "math.GR", "cs.CE", "cs.PL", "cs.IT",
        "cs.DS", "cs.NE", "math.ST",
    ]))
    label2id = {name: idx for idx, name in id2label.items()}

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def encode_batch(batch):
        """Tokenize the `text` column, truncating to the tokenizer's limit."""
        return tokenizer(batch["text"], truncation=True)

    tokenized_train = raw_train.map(encode_batch, batched=True, num_proc=num_procs)
    tokenized_valid = raw_valid.map(encode_batch, batched=True, num_proc=num_procs)

    # Dynamic padding per batch
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    accuracy_metric = evaluate.load('accuracy')

    def score(eval_pred):
        """Return the accuracy of the argmax (axis 1) predictions against the labels."""
        outputs, gold = eval_pred
        return accuracy_metric.compute(
            predictions=np.argmax(outputs, axis=1),
            references=gold,
        )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

    run_config = TrainingArguments(
        output_dir=out_dir,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=run_config,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=padding_collator,
        compute_metrics=score,
    )
    trainer.train()

    return trainer, model, tokenizer

def fine_tune_pruned_model_DistilBERT(dataset_name, train_split, valid_split, model, tokenizer, 
                           batch_size, num_procs, lr, epochs, out_dir):
    """
    Reload and tokenize the data, then run another training pass on a pruned DistilBERT.

    NOTE(review): this function attaches no pruning masks itself. If the caller
    has already made the pruning permanent, the zeroed weights are ordinary
    trainable values here - confirm that is intended.

    Args:
        dataset_name (str): Name of the dataset to load.
        train_split (str): Split expression for the training data.
        valid_split (str): Split expression for the validation data.
        model (transformers.PreTrainedModel): The pruned DistilBERT model (updated in place).
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer matching the model.
        batch_size (int): Per-device batch size for training and evaluation.
        num_procs (int): Number of worker processes used for tokenization.
        lr (float): Learning rate.
        epochs (int): Number of fine-tuning epochs.
        out_dir (str): Directory that receives the checkpoints.

    Returns:
        Trainer: The Trainer that performed the fine-tuning.
    """
    raw_train = load_dataset(dataset_name, split=train_split)
    raw_valid = load_dataset(dataset_name, split=valid_split)

    def encode_batch(batch):
        """Tokenize the `text` column, truncating to the tokenizer's limit."""
        return tokenizer(batch["text"], truncation=True)

    tokenized_train = raw_train.map(encode_batch, batched=True, num_proc=num_procs)
    tokenized_valid = raw_valid.map(encode_batch, batched=True, num_proc=num_procs)

    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    accuracy_metric = evaluate.load('accuracy')

    def score(eval_pred):
        """Return the accuracy of the argmax (axis 1) predictions against the labels."""
        outputs, gold = eval_pred
        return accuracy_metric.compute(
            predictions=np.argmax(outputs, axis=1),
            references=gold,
        )

    settings = {
        "output_dir": out_dir,
        "learning_rate": lr,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": 0.01,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "save_total_limit": 3,
        "report_to": 'tensorboard',
        "fp16": True,
    }

    fine_tuner = Trainer(
        model=model,
        args=TrainingArguments(**settings),
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=padding_collator,
        compute_metrics=score,
    )
    fine_tuner.train()
    return fine_tuner

def run_distilbert_pipeline(
    dataset_name="ccdv/arxiv-classification",
    train_split='train[:95%]',
    valid_split='validation[:100%]',
    test_split='test[:100%]',
    model_name='distilbert-base-uncased',
    batch_size=32,
    num_procs=4,
    lr=0.00005,
    epochs=1,
    prune_amount=0.1,
    prune_steps=3,
    out_dir='arxiv_distilbert'
):
    """
    Executes the DistilBERT model pipeline including dataset loading, tokenization,
    model training, dynamic pruning, fine-tuning, evaluation, and inference.

    Args:
        dataset_name (str): Name of the dataset to load.
        train_split (str): Slice of the training data.
        valid_split (str): Slice of the validation data.
        test_split (str): Slice of the test data.
        model_name (str): Name of the pretrained DistilBERT model.
        batch_size (int): Batch size for training and evaluation.
        num_procs (int): Number of processes for tokenization.
        lr (float): Learning rate for training.
        epochs (int): Number of epochs for training.
        prune_amount (float): Fraction of weights to prune.
        prune_steps (int): Number of pruning steps.
        out_dir (str): Output directory for saving the model.

    Returns:
        None: Prints evaluation results and predictions for inference data.
    """
    # Train the DistilBERT model
    trainer, model, tokenizer = train_distilbert_model(
        dataset_name, train_split, valid_split, model_name, 
        batch_size, num_procs, lr, epochs, out_dir
    )

    # Apply dynamic pruning
    dynamic_prune_model_distilbert(model, trainer, prune_amount, prune_steps)

    # Save the pruned model
    model.save_pretrained('pruned_arxiv_distilbert')
    tokenizer.save_pretrained('pruned_arxiv_distilbert')

    # Fine-tune the pruned model
    fine_tuned_dir = 'fine_tuned_pruned_model'
    fine_tune_trainer = fine_tune_pruned_model_DistilBERT(
        dataset_name, train_split, valid_split, model, tokenizer,
        batch_size, num_procs, lr, epochs, fine_tuned_dir
    )

    # The Trainer only writes checkpoint-* subfolders during training. Export
    # the final model and tokenizer to the directory root so the
    # from_pretrained calls below find a config and weights there.
    fine_tune_trainer.save_model(fine_tuned_dir)
    tokenizer.save_pretrained(fine_tuned_dir)

    # Load the fine-tuned pruned model for inference
    model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_dir)
    tokenizer = AutoTokenizer.from_pretrained(fine_tuned_dir)
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    # Evaluate the fine-tuned model on the test dataset
    test_dataset = load_dataset(dataset_name, split=test_split)
    tokenized_test = test_dataset.map(
        lambda examples: tokenizer(examples["text"], truncation=True),
        batched=True,
        num_proc=num_procs,
    )

    # Use the Trainer that actually ran the fine-tuning for the final score.
    eval_result = fine_tune_trainer.evaluate(tokenized_test)
    print(f"Evaluation results after fine-tuning: {eval_result}")

    # Inference with the fine-tuned pruned model
    for file_name in glob.glob('inference_data/*'):
        # Only the file access is reported as a read error.
        try:
            with open(file_name) as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {file_name}: {e}")
            continue

        # The label is taken from the text between the last "_" and ".txt" in the path.
        ground_truth = file_name.split("_")[-1].split(".txt")[0]

        try:
            # truncation=True keeps documents longer than the model's maximum
            # sequence length from failing inside the model.
            result = classify(content, truncation=True)
        except Exception as e:  # best effort: one bad file must not stop the loop
            print(f"Error classifying {file_name}: {e}")
            continue

        print(f'File: {file_name}, Prediction: {result}, Ground Truth: {ground_truth}')

# Run the DistilBERT pipeline. Note that this call uses only the first 85% of the
# validation and test splits, unlike the function's 100% defaults.
run_distilbert_pipeline(
  dataset_name="ccdv/arxiv-classification", 
  train_split='train[:95%]', 
  valid_split='validation[:85%]', 
  test_split='test[:85%]', 
  model_name='distilbert-base-uncased', 
  batch_size=32, 
  num_procs=4, 
  lr=0.00005, 
  epochs=50, 
  prune_amount=0.1, 
  prune_steps=3, 
  out_dir='arxiv_distilbert'
)


In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
import torch.nn.utils.prune as prune
import numpy as np
import evaluate
import glob

def train_roberta_model(model_name: str, 
                        dataset_name: str, 
                        sample_fraction: float, 
                        out_dir: str, 
                        batch_size: int = 16, 
                        num_epochs: int = 5, 
                        learning_rate: float = 5e-5):
    """
    Trains the RoBERTa model on the specified dataset.

    Args:
        model_name (str): Name of the pretrained RoBERTa model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction (0-1) of the train and validation
            splits to use; converted to the nearest whole percent.
        out_dir (str): Directory to save the model checkpoints.
        batch_size (int): Training batch size. Default is 16.
        num_epochs (int): Number of training epochs. Default is 5.
        learning_rate (float): Learning rate for training. Default is 5e-5.

    Returns:
        tuple: `(trainer, model, tokenizer)` - the Trainer instance, the trained
        model, and the tokenizer used for the model.
    """
    # round() rather than int(): int() truncates float products such as
    # 0.57 * 100 == 56.99999999999999 down to 56.
    percent = round(sample_fraction * 100)

    # Load datasets
    train_dataset = load_dataset(dataset_name, split=f'train[:{percent}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{percent}%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY",
                4: "math.GR", 5: "cs.CE", 6: "cs.PL", 7: "cs.IT",
                8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        """Tokenize the `text` column, truncating to the tokenizer's limit."""
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

    # Pads each batch to its own longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        """Return the accuracy of the argmax (axis 1) predictions against the labels."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
    )
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()
    return trainer, model, tokenizer

def dynamic_prune_roberta_model(model, trainer, amount=0.1, steps=3):
    """
    Applies iterative magnitude pruning to the model's linear layers, training after each step.

    Each step prunes `amount` of the weights that are still unpruned and then
    calls `trainer.train()` with the masks attached. The masks are baked into
    the weights only after the last step. Removing them inside the loop would
    make the next `l1_unstructured` call pick the already-zeroed weights again,
    so sparsity would never grow beyond `amount`. With the masks kept, the
    final sparsity is `1 - (1 - amount) ** steps`.

    Args:
        model (transformers.PreTrainedModel): The RoBERTa model to prune.
        trainer (Trainer): The Trainer instance for managing the training process.
        amount (float): Fraction of the remaining weights to prune in each step.
        steps (int): Number of pruning iterations to perform.

    Returns:
        None: The model is pruned in place.
    """
    # Collect the target layers once; pruning does not add or remove modules.
    linear_layers = [m for m in model.modules() if isinstance(m, torch.nn.Linear)]

    for step in range(steps):
        print(f"Pruning step {step + 1}/{steps}")
        # Repeated calls on the same parameter stack masks, so each step
        # prunes a fraction of the weights that survived the earlier steps.
        for layer in linear_layers:
            prune.l1_unstructured(layer, name='weight', amount=amount)
        # Fine-tune the model after pruning (masks stay active during training)
        trainer.train()

    # Remove the re-parametrisation and make the accumulated pruning permanent.
    for layer in linear_layers:
        if hasattr(layer, 'weight_orig'):
            prune.remove(layer, 'weight')

def fine_tune_pruned_roberta_model(model, tokenizer, dataset_name, sample_fraction, out_dir, 
                                    batch_size=16, num_epochs=5, learning_rate=5e-5):
    """
    Fine-tunes the pruned RoBERTa model.

    NOTE(review): this function attaches no pruning masks itself. If the caller
    has already made the pruning permanent, the zeroed weights are ordinary
    trainable values here - confirm that is intended.

    Args:
        model (transformers.PreTrainedModel): The pruned RoBERTa model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction (0-1) of the train and validation
            splits to use; converted to the nearest whole percent.
        out_dir (str): Directory to save the fine-tuning checkpoints.
        batch_size (int): Training batch size. Default is 16.
        num_epochs (int): Number of training epochs. Default is 5.
        learning_rate (float): Learning rate for training. Default is 5e-5.

    Returns:
        Trainer: The Trainer instance for the fine-tuned model.
    """
    # round() rather than int(): int() truncates float products such as
    # 0.57 * 100 == 56.99999999999999 down to 56.
    percent = round(sample_fraction * 100)

    # Load datasets
    train_dataset = load_dataset(dataset_name, split=f'train[:{percent}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{percent}%]')

    # Tokenization
    def preprocess_function(examples):
        """Tokenize the `text` column, truncating to the tokenizer's limit."""
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

    # Pads each batch to its own longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        """Return the accuracy of the argmax (axis 1) predictions against the labels."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Fine-tune the model
    trainer.train()
    return trainer

def roberta_pipeline(
    model_name: str, 
    dataset_name: str, 
    sample_fraction: float, 
    out_dir: str, 
    batch_size: int = 16, 
    num_epochs: int = 5, 
    learning_rate: float = 5e-5, 
    pruning_amount: float = 0.1, 
    pruning_steps: int = 3):
    """
    Executes the RoBERTa model pipeline including dataset loading, training,
    dynamic pruning, fine-tuning, test-set evaluation, and file inference.

    Args:
        model_name (str): Name of the pretrained RoBERTa model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction (0-1) of the train, validation and
            test splits to use; converted to the nearest whole percent.
        out_dir (str): Directory to save the model. The pruned model is saved
            to `pruned_<out_dir>` and fine-tuning checkpoints to `fine_tuned_<out_dir>`.
        batch_size (int): Training batch size. Default is 16.
        num_epochs (int): Number of training epochs. Default is 5.
        learning_rate (float): Learning rate for training. Default is 5e-5.
        pruning_amount (float): Fraction of weights to prune. Default is 0.1.
        pruning_steps (int): Number of pruning steps to perform. Default is 3.

    Returns:
        None: Prints evaluation results and predictions for inference data.
    """
    # Train the RoBERTa model
    trainer, model, tokenizer = train_roberta_model(model_name, dataset_name, sample_fraction, out_dir, batch_size, num_epochs, learning_rate)

    # Apply dynamic pruning
    dynamic_prune_roberta_model(model, trainer, pruning_amount, pruning_steps)

    # Save the pruned model
    pruned_dir = f'pruned_{out_dir}'
    model.save_pretrained(pruned_dir)
    tokenizer.save_pretrained(pruned_dir)

    # Load the pruned model for fine-tuning
    model = AutoModelForSequenceClassification.from_pretrained(pruned_dir)
    tokenizer = AutoTokenizer.from_pretrained(pruned_dir)

    # Fine-tune the pruned model (the reloaded `model` object is trained in place)
    fine_tune_trainer = fine_tune_pruned_roberta_model(model, tokenizer, dataset_name, sample_fraction, f'fine_tuned_{out_dir}', batch_size, num_epochs, learning_rate)

    # Evaluate the pruned and fine-tuned model on the test dataset.
    # round() rather than int(): int() truncates float products such as
    # 0.57 * 100 == 56.99999999999999 down to 56.
    percent = round(sample_fraction * 100)
    test_dataset = load_dataset(dataset_name, split=f'test[:{percent}%]')
    # The Trainer needs tokenized inputs; the raw split has only text and
    # labels, so the padding collator would have no input_ids to batch.
    tokenized_test = test_dataset.map(
        lambda examples: tokenizer(examples["text"], truncation=True),
        batched=True,
    )
    eval_result = fine_tune_trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning and fine-tuning: {eval_result}")

    # Inference with the pruned and fine-tuned model
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)
    for file_name in glob.glob('inference_data/*'):
        # Only the file access is reported as a read error.
        try:
            with open(file_name) as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {file_name}: {e}")
            continue

        # The label is taken from the text between the last "_" and ".txt" in the path.
        ground_truth = file_name.split("_")[-1].split(".txt")[0]

        try:
            # truncation=True keeps documents longer than the model's maximum
            # sequence length from failing inside the model.
            result = classify(content, truncation=True)
        except Exception as e:  # best effort: one bad file must not stop the loop
            print(f"Error classifying {file_name}: {e}")
            continue

        print(f'File: {file_name}, Prediction: {result}, Ground Truth: {ground_truth}')


# Main execution for RoBERTa: train, iteratively prune, fine-tune, evaluate and run inference.
roberta_pipeline(
    model_name='roberta-base',
    dataset_name='ccdv/arxiv-classification',
    sample_fraction=0.95,  # Leading 95% slice, applied to the train, validation and test splits
    out_dir='arxiv_roberta',
    batch_size=16,
    num_epochs=50,
    learning_rate=5e-5,
    pruning_amount=0.1,
    pruning_steps=3
)

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
import torch.nn.utils.prune as prune
import numpy as np
import evaluate
import glob

def train_bert_model(model_name: str,
                     dataset_name: str,
                     sample_fraction: float,
                     out_dir: str,
                     batch_size: int = 32,
                     num_epochs: int = 3,
                     learning_rate: float = 5e-5):
    """
    Trains the BERT model on the specified dataset.

    Args:
        model_name (str): Name of the pretrained BERT model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction (0-1) of the train and validation
            splits to use; converted to a whole-number percentage slice.
        out_dir (str): Directory to save the model.
        batch_size (int): Training batch size. Default is 32.
        num_epochs (int): Number of training epochs. Default is 3.
        learning_rate (float): Learning rate for training. Default is 5e-5.

    Returns:
        Trainer: The Trainer instance for the trained model.
        AutoModelForSequenceClassification: The trained model.
        AutoTokenizer: The tokenizer used for the model.
    """
    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would silently truncate to 28%.
    percent = round(sample_fraction * 100)

    # Only the train and validation splits are consumed by this function, so
    # the test split is not loaded or tokenized here.
    train_dataset = load_dataset(dataset_name, split=f'train[:{percent}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{percent}%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY",
                4: "math.GR", 5: "cs.CE", 6: "cs.PL", 7: "cs.IT",
                8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization (padding is deferred to the collator so batches are padded
    # only to their own longest sequence)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id)
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        # Mixed precision is only requested when a CUDA device is present, so
        # the function also runs on CPU-only machines.
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()
    return trainer, model, tokenizer

def dynamic_prune_bert_model(model, trainer, amount=0.1, steps=3):
    """
    Applies iterative magnitude pruning to the model's linear layers, training
    between pruning rounds.

    Each round prunes `amount` of the weights that are still unpruned in every
    `torch.nn.Linear` layer and then calls `trainer.train()`. The pruning
    reparametrization is kept in place across rounds so that the rounds stack
    (e.g. amount=0.1, steps=3 leaves roughly 1 - 0.9**3 of each weight matrix
    zeroed). It is folded into the plain `weight` parameter once, at the end.

    Args:
        model (transformers.PreTrainedModel): The BERT model to prune.
        trainer (Trainer): The Trainer instance for managing the training process.
        amount (float): Fraction of the remaining weights to prune in each step.
            Default is 0.1.
        steps (int): Number of pruning iterations to perform. Default is 3.

    Returns:
        None: The model is pruned in place.
    """
    # Pruning does not add or replace modules, so the list can be built once.
    linear_layers = [module for module in model.modules()
                     if isinstance(module, torch.nn.Linear)]

    for step in range(steps):
        print(f"Pruning step {step + 1}/{steps}")
        # Prune weights in all linear layers. Because the previous mask is
        # still attached, this selects among the currently unpruned entries
        # instead of re-selecting weights that are already zero.
        for layer in linear_layers:
            prune.l1_unstructured(layer, name='weight', amount=amount)
        # Fine-tune the model after pruning
        trainer.train()

    # Make the accumulated pruning permanent (skipped when steps == 0 and
    # nothing was pruned, since prune.remove raises on an unpruned parameter).
    for layer in linear_layers:
        if prune.is_pruned(layer):
            prune.remove(layer, 'weight')

def fine_tune_pruned_bert_model(model, tokenizer, dataset_name, sample_fraction, out_dir,
                                 batch_size=32, num_epochs=3, learning_rate=5e-5):
    """
    Fine-tunes the pruned BERT model.

    Args:
        model (transformers.PreTrainedModel): The pruned BERT model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction (0-1) of the train and validation
            splits to use for fine-tuning.
        out_dir (str): Directory to save the fine-tuned model.
        batch_size (int): Training batch size. Default is 32.
        num_epochs (int): Number of training epochs. Default is 3.
        learning_rate (float): Learning rate for training. Default is 5e-5.

    Returns:
        Trainer: The Trainer instance for the fine-tuned model.
    """
    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would silently truncate to 28%.
    percent = round(sample_fraction * 100)

    # Load datasets
    train_dataset = load_dataset(dataset_name, split=f'train[:{percent}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{percent}%]')

    # Tokenization
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        # Mixed precision is only requested when a CUDA device is present, so
        # the function also runs on CPU-only machines.
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Fine-tune the model.
    # NOTE(review): no pruning mask is attached in this function, so weights
    # that pruning set to zero are ordinary trainable values here and may
    # become non-zero again during this pass. If the sparsity pattern must be
    # preserved, masks would need to be re-applied first - confirm intent.
    trainer.train()
    return trainer

def bert_pipeline(
    model_name: str,
    dataset_name: str,
    sample_fraction: float,
    out_dir: str,
    batch_size: int = 32,
    num_epochs: int = 3,
    learning_rate: float = 5e-5,
    pruning_amount: float = 0.1,
    pruning_steps: int = 3):
    """
    Executes the BERT model pipeline including dataset loading, training,
    dynamic pruning, fine-tuning, test-set evaluation and file inference.

    Args:
        model_name (str): Name of the pretrained BERT model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction of the dataset to use for training/validation/testing.
        out_dir (str): Directory to save the model. The pruned model is written
            to ``pruned_<out_dir>`` and fine-tuning output to ``fine_tuned_<out_dir>``.
        batch_size (int): Training batch size. Default is 32.
        num_epochs (int): Number of training epochs. Default is 3.
        learning_rate (float): Learning rate for training. Default is 5e-5.
        pruning_amount (float): Fraction of weights to prune. Default is 0.1.
        pruning_steps (int): Number of pruning steps to perform. Default is 3.

    Returns:
        None: Prints evaluation results and predictions for inference data.
    """
    # Train the BERT model
    trainer, model, tokenizer = train_bert_model(model_name, dataset_name, sample_fraction, out_dir, batch_size, num_epochs, learning_rate)

    # Apply dynamic pruning
    dynamic_prune_bert_model(model, trainer, pruning_amount, pruning_steps)

    # Save the pruned model
    pruned_dir = f'pruned_{out_dir}'
    model.save_pretrained(pruned_dir)
    tokenizer.save_pretrained(pruned_dir)

    # Reload the pruned model from disk for fine-tuning
    model = AutoModelForSequenceClassification.from_pretrained(pruned_dir)
    tokenizer = AutoTokenizer.from_pretrained(pruned_dir)

    # Fine-tune the pruned model
    fine_tune_trainer = fine_tune_pruned_bert_model(model, tokenizer, dataset_name, sample_fraction, f'fine_tuned_{out_dir}', batch_size, num_epochs, learning_rate)

    # Evaluate the pruned and fine-tuned model on the test dataset.
    # round() instead of int(): 0.29 * 100 is 28.999999999999996 in floating point.
    test_dataset = load_dataset(dataset_name, split=f'test[:{round(sample_fraction * 100)}%]')
    # The Trainer needs token ids, not raw text: tokenize the test split the
    # same way the training and validation splits are tokenized.
    tokenized_test = test_dataset.map(
        lambda examples: tokenizer(examples["text"], truncation=True), batched=True)
    eval_result = fine_tune_trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning and fine-tuning: {eval_result}")

    # Inference with the pruned and fine-tuned model
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)
    for file_name in glob.glob('inference_data/*'):
        with open(file_name) as file:
            content = file.read()
        # Whole documents can exceed the model's maximum sequence length, so
        # ask the pipeline's tokenizer to truncate, as is done during training.
        result = classify(content, truncation=True)
        print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')


# Main execution for BERT: run the whole pipeline once with fixed settings.
_bert_run_settings = {
    'model_name': 'bert-base-uncased',
    'dataset_name': 'ccdv/arxiv-classification',
    'sample_fraction': 0.8,
    'out_dir': 'arxiv_bert',
    'batch_size': 32,
    'num_epochs': 50,
    'learning_rate': 5e-5,
    'pruning_amount': 0.1,
    'pruning_steps': 3,
}
bert_pipeline(**_bert_run_settings)

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)

def load_and_preprocess_data(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int):
    """
    Loads the dataset's 'train' split, adds "text"/"target" columns and draws
    a capped training and validation sample from a 90/10 split of it.

    Args:
        dataset_name (str): Name of the dataset.
        version (str): Version (configuration) of the dataset.
        sample_train_size (int): Maximum number of training examples to keep.
        sample_val_size (int): Maximum number of validation examples to keep.

    Returns:
        Tuple: (train_dataset, validation_dataset)
    """
    def add_text_and_target(example):
        # Copy the article/highlights fields into the column names used downstream.
        return {"text": example['article'], "target": example['highlights']}

    corpus = load_dataset(dataset_name, version)['train'].map(add_text_and_target)

    # The held-out 10% of 'train' serves as the validation pool.
    partitions = corpus.train_test_split(test_size=0.1)
    train_pool = partitions['train']
    val_pool = partitions['test']

    # Never request more rows than a pool actually contains.
    train_count = min(sample_train_size, len(train_pool))
    train_data = train_pool.select(range(train_count))
    val_count = min(sample_val_size, len(val_pool))
    validation_data = val_pool.select(range(val_count))

    return train_data, validation_data

def tokenize_data(train_data, validation_data):
    """
    Tokenizes the training and validation datasets for GPT-2 language modeling.

    Every example's "text" is truncated/padded to 128 tokens. Labels are a
    copy of the input ids with padded positions set to -100 so that they are
    ignored by the language-modeling loss.

    Args:
        train_data: The training dataset (must have a "text" column).
        validation_data: The validation dataset (must have a "text" column).

    Returns:
        Tuple: (train_dataset, validation_dataset), both formatted to yield
        torch tensors for 'input_ids', 'attention_mask' and 'labels'.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # GPT-2 ships without a padding token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
        # Mask by attention_mask rather than by token id: pad and EOS share an
        # id here, and a genuine EOS inside the text must stay a valid target.
        tokens['labels'] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokens['input_ids'], tokens['attention_mask'])
        ]
        return tokens

    tensor_columns = ['input_ids', 'attention_mask', 'labels']

    train_dataset = train_data.map(tokenize_function, batched=True)
    validation_dataset = validation_data.map(tokenize_function, batched=True)

    # set_format works in place and returns None, so it must not be chained
    # onto the map() call whose result is being assigned.
    train_dataset.set_format(type='torch', columns=tensor_columns)
    validation_dataset.set_format(type='torch', columns=tensor_columns)

    return train_dataset, validation_dataset

def train_gpt2_model(train_dataset, validation_dataset, output_dir: str):
    """
    Fine-tunes the pretrained 'gpt2' checkpoint and writes it to `output_dir`.

    Args:
        train_dataset: The tokenized training dataset.
        validation_dataset: The tokenized validation dataset.
        output_dir (str): Directory for checkpoints, logs and the final model.

    Returns:
        Trainer: The Trainer instance for the trained model.
        GPT2LMHeadModel: The trained GPT-2 model.
    """
    gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

    # Logging, checkpointing and evaluation all happen every 10 steps.
    every_n_steps = 10
    settings = {
        'output_dir': output_dir,
        'num_train_epochs': 50,
        'per_device_train_batch_size': 4,
        'per_device_eval_batch_size': 4,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_dir': os.path.join(output_dir, "logs"),
        'logging_steps': every_n_steps,
        'save_steps': every_n_steps,
        'evaluation_strategy': "steps",
        'eval_steps': every_n_steps,
        'save_total_limit': 1,
    }

    gpt2_trainer = Trainer(
        model=gpt2,
        args=TrainingArguments(**settings),
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
    )
    gpt2_trainer.train()

    # Persist the result through both the trainer and the model API.
    gpt2_trainer.save_model(output_dir)
    gpt2.save_pretrained(output_dir)

    return gpt2_trainer, gpt2

def compute_head_importance(model, validation_dataset, batch_size: int = 8):
    """
    Computes a per-head importance score from mean attention weights.

    For every example the attention weights of each head are averaged over
    the query and key positions; these per-example means are summed over the
    dataset and then normalized so the scores of each layer sum to 1.

    Args:
        model (GPT2LMHeadModel): The GPT-2 model.
        validation_dataset: The validation dataset; each item must be a dict of
            equally sized tensors accepted as keyword arguments by the model.
        batch_size (int): Number of examples per forward pass. Default is 8.

    Returns:
        Dict: Maps each layer index to a list with one normalized score per
        head. A layer whose scores sum to zero (e.g. empty dataset) gets all
        zeros instead of raising ZeroDivisionError.
    """
    from torch.utils.data import DataLoader

    model.eval()
    n_layer, n_head = model.config.n_layer, model.config.n_head
    # float64 accumulator on the CPU, mirroring accumulation in Python floats.
    totals = torch.zeros(n_layer, n_head, dtype=torch.float64)
    # After training the model may live on an accelerator while the dataset
    # yields CPU tensors; inputs are moved to wherever the weights are.
    device = next(model.parameters()).device

    # NOTE(review): attention weights are presumably softmax-normalized per
    # query row; if so, the mean over (query, key) is ~1/sequence_length for
    # every head and these scores come out close to uniform. A loss- or
    # gradient-based criterion may be what is actually wanted - confirm.
    with torch.no_grad():  # scoring only; no autograd graph is needed
        for batch in DataLoader(validation_dataset, batch_size=batch_size):
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch, output_attentions=True)
            for layer_idx, layer_att in enumerate(outputs.attentions):
                # (batch, heads, query, key) -> per-example head means, summed
                # over the batch so every example carries the same weight.
                per_head = layer_att.mean(dim=(-2, -1)).sum(dim=0)
                totals[layer_idx] += per_head.double().cpu()

    head_importance = {}
    for layer in range(n_layer):
        scores = totals[layer].tolist()
        total = sum(scores)
        if total > 0:
            head_importance[layer] = [score / total for score in scores]
        else:
            head_importance[layer] = [0.0] * n_head

    return head_importance

def prune_gpt2_heads(model, head_importance, threshold: float = 0.2):
    """
    Prunes heads whose importance score falls below a threshold.

    Args:
        model (GPT2LMHeadModel): The GPT-2 model.
        head_importance (Dict): Head importance scores for each layer, mapping
            a layer index to a list with one score per head.
        threshold (float): Heads scoring strictly below this value are
            selected for pruning. Default is 0.2.

    Returns:
        None: The model is pruned in place.
    """
    # A layer is left untouched when the selection would leave it with at
    # most one head.
    max_prunable = model.config.n_head - 1

    heads_to_prune = {}
    for layer, importance_scores in head_importance.items():
        selected = [head_idx for head_idx, score in enumerate(importance_scores)
                    if score < threshold]
        # Skip pruning if too many heads would be removed
        heads_to_prune[layer] = [] if len(selected) >= max_prunable else selected

    model.prune_heads(heads_to_prune)

def fine_tune_pruned_gpt2_model(model, train_dataset, validation_dataset, output_dir: str):
    """
    Runs another training pass on an already pruned GPT-2 model and saves it.

    Args:
        model (GPT2LMHeadModel): The pruned GPT-2 model.
        train_dataset: The tokenized training dataset.
        validation_dataset: The tokenized validation dataset.
        output_dir (str): Directory for checkpoints, logs and the final model.

    Returns:
        Trainer: The Trainer instance for the fine-tuned model.
    """
    # Logging, checkpointing and evaluation all happen every 10 steps.
    every_n_steps = 10
    settings = {
        'output_dir': output_dir,
        'num_train_epochs': 50,
        'per_device_train_batch_size': 4,
        'per_device_eval_batch_size': 4,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_dir': os.path.join(output_dir, "logs"),
        'logging_steps': every_n_steps,
        'save_steps': every_n_steps,
        'evaluation_strategy': "steps",
        'eval_steps': every_n_steps,
        'save_total_limit': 1,
    }

    tuner = Trainer(
        model=model,
        args=TrainingArguments(**settings),
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
    )
    tuner.train()
    tuner.save_model(output_dir)

    return tuner

def evaluate_gpt2_model(trainer, validation_dataset):
    """
    Evaluates the model held by `trainer` and converts its loss to perplexity.

    Args:
        trainer (Trainer): The Trainer instance for the model.
        validation_dataset: The validation dataset.

    Returns:
        float: exp(eval_loss) of the model on the validation dataset.
    """
    metrics = trainer.evaluate(eval_dataset=validation_dataset)
    # Perplexity is the exponential of the mean evaluation loss.
    loss = torch.tensor(metrics['eval_loss'])
    return loss.exp().item()

def generate_text(model, tokenizer, input_text: str):
    """
    Generates text using the trained GPT-2 model.

    Args:
        model (GPT2LMHeadModel): The GPT-2 model.
        tokenizer (GPT2Tokenizer): The tokenizer used for the model.
        input_text (str): Input text (prompt) for generation.

    Returns:
        List: Generated texts (prompt included), one string per returned sequence.
    """
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # A model that has just been trained may live on an accelerator; the
    # prompt has to be on the same device as the weights.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            # Single unpadded prompt: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            # GPT-2 defines no pad token; state the EOS fallback explicitly.
            pad_token_id=tokenizer.eos_token_id,
            max_length=50,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def gpt2_pipeline(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int, output_dir: str, input_text: str):
    """
    Runs the GPT-2 workflow end to end: data loading, tokenization, training,
    head-importance scoring, head pruning, fine-tuning, perplexity evaluation
    and finally text generation from a prompt.

    Args:
        dataset_name (str): Name of the dataset.
        version (str): Version of the dataset.
        sample_train_size (int): Size of the training sample.
        sample_val_size (int): Size of the validation sample.
        output_dir (str): Directory to save the model (used for both the
            initial training and the post-pruning fine-tuning).
        input_text (str): Input text for generation.

    Returns:
        List: Generated texts.
    """
    # Data: raw sample -> tokenized tensors.
    raw_train, raw_validation = load_and_preprocess_data(
        dataset_name, version, sample_train_size, sample_val_size)
    train_dataset, validation_dataset = tokenize_data(raw_train, raw_validation)

    # Initial training; only the model is needed afterwards.
    _, model = train_gpt2_model(train_dataset, validation_dataset, output_dir)

    # Score the heads on the validation data and prune in place.
    prune_gpt2_heads(model, compute_head_importance(model, validation_dataset))

    # Recover from pruning, then report perplexity of the resulting model.
    fine_tune_trainer = fine_tune_pruned_gpt2_model(
        model, train_dataset, validation_dataset, output_dir)
    perplexity = evaluate_gpt2_model(fine_tune_trainer, validation_dataset)
    print(f"GPT-2 Automated Pruning Perplexity: {perplexity:.2f}")

    # Generation uses a freshly loaded stock 'gpt2' tokenizer.
    return generate_text(model, GPT2Tokenizer.from_pretrained('gpt2'), input_text)

# Main execution for GPT-2
generated = gpt2_pipeline(
    dataset_name="cnn_dailymail",
    version="3.0.0",
    sample_train_size=27000,
    sample_val_size=10000,
    output_dir="./results-gpt2-automated-pruning",
    input_text="The future of AI in healthcare is"
)

# Print the generated texts. This loop sits at the same (top) level as the
# call above; indenting it under a plain statement is an IndentationError.
for i, text in enumerate(generated, start=1):
    print(f"Generated Text {i}: {text}")

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)

def load_and_preprocess_t5_data(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int):
    """
    Loads the dataset's 'train' split, adds "input_text"/"target_text" columns
    and draws a capped training and validation sample from a 90/10 split of it.

    Args:
        dataset_name (str): Name of the dataset to load (e.g., "cnn_dailymail").
        version (str): Version of the dataset to load.
        sample_train_size (int): Maximum number of training samples to use.
        sample_val_size (int): Maximum number of validation samples to use.

    Returns:
        Tuple: Training and validation datasets.
    """
    def add_seq2seq_columns(example):
        # Article is the model input, highlights are the target summary.
        return {"input_text": example['article'], "target_text": example['highlights']}

    corpus = load_dataset(dataset_name, version)['train'].map(add_seq2seq_columns)

    # The held-out 10% of 'train' serves as the validation pool.
    partitions = corpus.train_test_split(test_size=0.1)
    train_pool = partitions['train']
    val_pool = partitions['test']

    # Never request more rows than a pool actually contains.
    train_count = min(sample_train_size, len(train_pool))
    train_data = train_pool.select(range(train_count))
    val_count = min(sample_val_size, len(val_pool))
    validation_data = val_pool.select(range(val_count))

    print(f"T5 Training size: {len(train_data)}, Validation size: {len(validation_data)}")

    return train_data, validation_data

def tokenize_t5_data(train_data, validation_data):
    """
    Tokenizes inputs and targets with the 't5-small' tokenizer.

    Both "input_text" and "target_text" are truncated/padded to 128 tokens;
    the target token ids become the 'labels' column.

    Args:
        train_data: The training dataset.
        validation_data: The validation dataset.

    Returns:
        Tuple: Tokenized training and validation datasets, formatted to yield
        torch tensors for 'input_ids', 'attention_mask' and 'labels'.
    """
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    # Identical settings are used for the source and the target side.
    encode_options = dict(truncation=True, padding="max_length", max_length=128)

    def tokenize_function(examples):
        encoded = tokenizer(examples['input_text'], **encode_options)
        targets = tokenizer(examples['target_text'], **encode_options)
        encoded['labels'] = targets['input_ids']
        return encoded

    tensor_columns = ['input_ids', 'attention_mask', 'labels']

    # Tokenize both splits first, then switch both to torch output in place.
    tokenized_splits = [
        split.map(tokenize_function, batched=True)
        for split in (train_data, validation_data)
    ]
    for tokenized in tokenized_splits:
        tokenized.set_format(type='torch', columns=tensor_columns)

    train_dataset, validation_dataset = tokenized_splits
    return train_dataset, validation_dataset

def train_t5_model(train_dataset, validation_dataset, output_dir: str):
    """
    Trains the T5 model, or reuses an existing checkpoint in `output_dir`.

    When `output_dir` already holds a Trainer checkpoint, training is skipped
    and the model weights are loaded from the most recent checkpoint, so the
    returned and saved model is the trained one rather than stock 't5-small'.

    Args:
        train_dataset: The tokenized training dataset.
        validation_dataset: The tokenized validation dataset.
        output_dir (str): Directory to save the trained model and tokenizer.

    Returns:
        Tuple: (trainer, model) - the Trainer instance and the T5 model it wraps.
    """
    from transformers.trainer_utils import get_last_checkpoint

    # get_last_checkpoint lists the folder, so only call it when it exists.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

    model = T5ForConditionalGeneration.from_pretrained(last_checkpoint or 't5-small')
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=50,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    if last_checkpoint is not None:
        print("T5 Training already completed. Skipping...")
    else:
        print("Starting training for T5...")
        trainer.train()
        print("Training complete!")

    print("Saving T5 model...")
    trainer.save_model(output_dir)
    # No tokenizer object is passed into this function; the datasets are
    # tokenized with the stock 't5-small' tokenizer, so that one is saved
    # next to the model.
    T5Tokenizer.from_pretrained('t5-small').save_pretrained(output_dir)
    print("T5 Model and tokenizer saved.")

    return trainer, model

def compute_head_importance_and_prune_t5_model(model, trainer, validation_dataset):
    """
    Computes per-head importance from mean attention weights and prunes
    low-scoring self-attention heads of the T5 model in place.

    Encoder and decoder self-attention weights are averaged per head (over
    batch, query and key positions) and accumulated per layer index; the
    scores of each layer are normalized to sum to 1 and heads scoring below
    0.05 are pruned from the encoder and decoder block with that index.

    Args:
        model: The T5 model to prune.
        trainer: The Trainer instance (used to build the evaluation dataloader).
        validation_dataset: The validation dataset.

    Returns:
        None
    """
    model.eval()
    num_layers, num_heads = model.config.num_layers, model.config.num_heads
    head_importance = {layer: [0.0] * num_heads for layer in range(num_layers)}
    dataloader = trainer.get_eval_dataloader(validation_dataset)
    # Inputs must be on the same device as the model's weights.
    device = model.device

    def accumulate(attentions):
        # Each entry is (batch, heads, query, key); reduce everything except
        # the head axis so that exactly one value per head remains.
        for layer_idx, layer_att in enumerate(attentions):
            if layer_idx not in head_importance:
                continue
            per_head = layer_att.mean(dim=(0, 2, 3)).detach()
            for head_idx in range(min(per_head.size(0), num_heads)):
                head_importance[layer_idx][head_idx] += per_head[head_idx].item()

    # NOTE(review): attention weights are presumably softmax-normalized per
    # query row; if so these means are ~1/sequence_length for every head and
    # the normalized scores end up near 1/num_heads, i.e. nothing falls below
    # the 0.05 cut-off. A loss- or gradient-based criterion may be what is
    # actually wanted - confirm.
    for batch in dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # NOTE(review): this drops the last target token instead of
            # prepending a decoder start token - confirm this is intended.
            decoder_input_ids = labels[:, :-1]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, output_attentions=True, return_dict=True)
            accumulate(outputs.encoder_attentions)
            accumulate(outputs.decoder_attentions)

    for layer in head_importance:
        total = sum(head_importance[layer])
        if total > 0:
            head_importance[layer] = [head / total for head in head_importance[layer]]
        else:
            head_importance[layer] = [0.0] * num_heads

    heads_to_prune = {layer: [head_idx for head_idx, score in enumerate(head_importance[layer]) if score < 0.05] for layer in head_importance}

    # Prune heads, but never strip a layer of all of its heads (which is what
    # an all-zero score row would otherwise request).
    for layer, heads in heads_to_prune.items():
        if heads and len(heads) < num_heads:
            model.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
            model.decoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

def fine_tune_pruned_t5_model(model, train_dataset, validation_dataset):
    """
    Fine-tunes the pruned T5 model and reports its validation perplexity.

    Checkpoints and logs are written under "./results-pruned".

    Args:
        model: The pruned T5 model.
        train_dataset: The tokenized training dataset.
        validation_dataset: The tokenized validation dataset.

    Returns:
        None: Evaluation metrics and perplexity are printed.
    """
    pruned_output_dir = "./results-pruned"
    trainer_pruned = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=pruned_output_dir,
            num_train_epochs=50,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=os.path.join(pruned_output_dir, "logs"),
            logging_steps=10,
            save_steps=10,
            evaluation_strategy="steps",
            eval_steps=10,
            save_total_limit=1
        ),
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    # Actually run the fine-tuning pass; evaluating without it would only
    # score the freshly pruned weights and leave the training settings unused.
    trainer_pruned.train()

    eval_results = trainer_pruned.evaluate(eval_dataset=validation_dataset)
    print(f"Eval Results: {eval_results}")
    # Perplexity is the exponential of the mean evaluation loss.
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    print(f"T5 Automated Pruning Perplexity: {perplexity:.2f}")

def run_t5_model_pipeline(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int, output_dir: str):
    """
    Executes the T5 model pipeline by chaining this file's T5 helpers: data
    loading and preprocessing, tokenization, training, head-importance
    computation with pruning, and the post-pruning step that prints the
    evaluation metrics and perplexity.

    Args:
        dataset_name (str): Name of the dataset to load (e.g., "cnn_dailymail").
        version (str): Version of the dataset to load.
        sample_train_size (int): Number of training samples to use.
        sample_val_size (int): Number of validation samples to use.
        output_dir (str): Directory to save the trained model and tokenizer
            (forwarded to train_t5_model).

    Returns:
        None: The function prints results and saves the model.
    """
    # Load and preprocess data, then convert both samples to tokenized datasets
    train_data, validation_data = load_and_preprocess_t5_data(dataset_name, version, sample_train_size, sample_val_size)
    train_dataset, validation_dataset = tokenize_t5_data(train_data, validation_data)
    
    # Train the T5 model; the trainer is kept because the pruning step asks it
    # for the evaluation dataloader
    trainer, model = train_t5_model(train_dataset, validation_dataset, output_dir)

    # Compute head importance and prune the T5 model (mutates `model` in place)
    compute_head_importance_and_prune_t5_model(model, trainer, validation_dataset)

    # Hand the pruned model to the post-pruning step, which prints its
    # evaluation metrics and perplexity
    fine_tune_pruned_t5_model(model, train_dataset, validation_dataset)
