In [9]:
import os
import warnings
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from typing import Dict

warnings.filterwarnings("ignore")


def load_and_preprocess_data(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int):
    dataset = load_dataset(dataset_name, version)
    processed_data = dataset['train'].map(lambda x: {"text": x['article'], "target": x['highlights']})
    split_data = processed_data.train_test_split(test_size=0.1)
    train_data = split_data['train'].select(range(min(sample_train_size, len(split_data['train']))))
    validation_data = split_data['test'].select(range(min(sample_val_size, len(split_data['test']))))
    print(f"Training size: {len(train_data)}, Validation size: {len(validation_data)}")
    return train_data, validation_data


def tokenize_data(train_data, validation_data, tokenizer):
    def tokenize_function(examples):
        tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
        tokens['labels'] = tokens['input_ids'].copy()
        return tokens

    train_dataset = train_data.map(tokenize_function, batched=True)
    validation_dataset = validation_data.map(tokenize_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    return train_dataset, validation_dataset


def train_and_save_model(model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer, train_dataset, validation_dataset, output_dir: str, phase: str):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    if os.path.exists(output_dir) and any('checkpoint' in file for file in os.listdir(output_dir)):
        print(f"Training for {phase} already completed. Skipping...")
    else:
        print(f"Starting training for {phase}...")
        trainer.train()
        print(f"Training for {phase} complete!")

    print(f"Saving model for {phase}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved for {phase}.")
    return trainer


def compute_head_importance(trainer: Trainer, eval_dataset) -> Dict[int, float]:
    model = trainer.model
    model.eval()

    head_importance = {}
    for layer in range(model.config.n_layer):
        num_heads = model.config.n_head
        head_importance[layer] = [0.0] * num_heads

    dataloader = trainer.get_eval_dataloader(eval_dataset)

    for batch in dataloader:
        outputs = model(**batch, output_attentions=True)
        attentions = outputs.attentions

        for layer_idx, layer_att in enumerate(attentions):
            layer_mean = layer_att.mean(dim=0)
            for head_idx in range(layer_mean.size(0)):
                head_importance[layer_idx][head_idx] += layer_mean[head_idx].mean().item()

    # Normalize head importance
    for layer in head_importance:
        total = sum(head_importance[layer])
        head_importance[layer] = [head / total for head in head_importance[layer]]

    return head_importance


def prune_automated_heads(model: GPT2LMHeadModel, head_importance: Dict[int, float], threshold: float = 0.1):
    heads_to_prune = {}
    for layer, importance_scores in head_importance.items():
        heads_to_prune[layer] = [head_idx for head_idx, score in enumerate(importance_scores) if score < threshold]
        
        # Avoid pruning all heads from a layer
        if len(heads_to_prune[layer]) >= model.config.n_head:
            print(f"Warning: All heads in layer {layer} are marked for pruning. Skipping pruning for this layer.")
            heads_to_prune[layer] = []  # Skip this layer to avoid division by zero

    # Perform pruning
    model.prune_heads(heads_to_prune)

    # Resize token embeddings if necessary (to accommodate pruning)
    model.resize_token_embeddings(len(GPT2Tokenizer.from_pretrained('gpt2')))
    
    # Reset model parameters if necessary
    model.init_weights()  # Ensures weights are reinitialized after structure change
    
    return model


def evaluate_model(model: GPT2LMHeadModel, validation_dataset):
    model.eval()
    training_args = TrainingArguments(
        per_device_eval_batch_size=4,
        output_dir='./results',
        evaluation_strategy="no",
    )
    trainer = Trainer(
        model=model,
        args=training_args
    )
    eval_results = trainer.evaluate(eval_dataset=validation_dataset)
    print(f"Eval Results: {eval_results}")
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    return perplexity


if __name__ == "__main__":
    # Load and preprocess the CNN/DailyMail dataset
    train_data, validation_data = load_and_preprocess_data("cnn_dailymail", "3.0.0", 100, 50)

    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the training and validation data
    train_dataset, validation_dataset = tokenize_data(train_data, validation_data, tokenizer)

    # Initialize the model
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    output_dir = "./results-automated-pruning"

    # Train the model
    trainer = train_and_save_model(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Initial Training')

    # Compute head importance
    head_importance = compute_head_importance(trainer, validation_dataset)

    # Prune heads automatically based on importance
    model = prune_automated_heads(model, head_importance, threshold=0.1)

    # Reinitialize the model after pruning
    model.eval()

    # Train the pruned model
    trainer_pruned = train_and_save_model(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Automated Pruning')

    # Evaluate the pruned model
    perplexity = evaluate_model(model, validation_dataset)
    print(f"Automated Pruning Perplexity: {perplexity:.2f}")


Training size: 100, Validation size: 50
Training for Initial Training already completed. Skipping...
Saving model for Initial Training...
Model and tokenizer saved for Initial Training.
Training for Automated Pruning already completed. Skipping...
Saving model for Automated Pruning...
Model and tokenizer saved for Automated Pruning.


  0%|          | 0/13 [00:00<?, ?it/s]

Eval Results: {'eval_loss': 3.339203119277954, 'eval_runtime': 19.0693, 'eval_samples_per_second': 2.622, 'eval_steps_per_second': 0.682}
Automated Pruning Perplexity: 28.20


In [16]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def load_pruned_model_and_tokenizer(output_dir: str):
    # Load the pruned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    tokenizer.pad_token = tokenizer.eos_token
    model.eval()  # Set the model to evaluation mode
    return model, tokenizer

def generate_text(model, tokenizer, input_text: str, max_length: int = 50, num_return_sequences: int = 1):
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate predictions using beam search
    with torch.no_grad():  # No need to track gradients for inference
        outputs = model.generate(input_ids, max_length=max_length, num_return_sequences=num_return_sequences, 
                                 num_beams=num_return_sequences,  # Use beam search
                                 no_repeat_ngram_size=2, 
                                 early_stopping=True,
                                 top_k=50, 
                                 top_p=0.95)

    # Decode the generated sequences
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

if __name__ == "__main__":
    output_dir = "./results-automated-pruning"  # Directory where the pruned model is saved

    # Load the pruned model and tokenizer
    model, tokenizer = load_pruned_model_and_tokenizer(output_dir)

    # Prepare input text for inference
    input_text = "The future of AI in healthcare is"
    
    # Generate predictions
    generated_texts = generate_text(model, tokenizer, input_text, max_length=100, num_return_sequences=1)

    # Print the generated texts
    for i, text in enumerate(generated_texts):
        print(f"Generated Text {i + 1}: {text}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1: The future of AI in healthcare is uncertain. The future is not yet clear.

The Future of Healthcare
...
,
 (1) The Future Of Healthcare is a book that explores the future and the challenges facing healthcare. It is an exploration of the ways in which healthcare will change the way we live, work, and play. This book is intended to be a resource for those who are interested in the development of healthcare and how it will affect their lives.


In [27]:
import os
import warnings
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from typing import Dict

warnings.filterwarnings("ignore")


def load_and_preprocess_data(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int):
    dataset = load_dataset(dataset_name, version)
    processed_data = dataset['train'].map(lambda x: {"text": x['article'], "target": x['highlights']})
    split_data = processed_data.train_test_split(test_size=0.1)
    train_data = split_data['train'].select(range(min(sample_train_size, len(split_data['train']))))
    validation_data = split_data['test'].select(range(min(sample_val_size, len(split_data['test']))))
    print(f"Training size: {len(train_data)}, Validation size: {len(validation_data)}")
    return train_data, validation_data


def tokenize_data(train_data, validation_data, tokenizer):
    def tokenize_function(examples):
        tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
        tokens['labels'] = tokens['input_ids'].copy()
        return tokens

    train_dataset = train_data.map(tokenize_function, batched=True)
    validation_dataset = validation_data.map(tokenize_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    return train_dataset, validation_dataset


def train_and_save_model(model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer, train_dataset, validation_dataset, output_dir: str, phase: str):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    if os.path.exists(output_dir) and any('checkpoint' in file for file in os.listdir(output_dir)):
        print(f"Training for {phase} already completed. Skipping...")
    else:
        print(f"Starting training for {phase}...")
        trainer.train()
        print(f"Training for {phase} complete!")

    print(f"Saving model for {phase}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved for {phase}.")
    return trainer


def compute_head_importance(trainer: Trainer, eval_dataset) -> Dict[int, float]:
    model = trainer.model
    model.eval()

    head_importance = {}
    for layer in range(model.config.n_layer):
        num_heads = model.config.n_head
        head_importance[layer] = [0.0] * num_heads

    dataloader = trainer.get_eval_dataloader(eval_dataset)

    for batch in dataloader:
        outputs = model(**batch, output_attentions=True)
        attentions = outputs.attentions

        for layer_idx, layer_att in enumerate(attentions):
            layer_mean = layer_att.mean(dim=0)
            for head_idx in range(layer_mean.size(0)):
                head_importance[layer_idx][head_idx] += layer_mean[head_idx].mean().item()

    # Normalize head importance
    for layer in head_importance:
        total = sum(head_importance[layer])
        head_importance[layer] = [head / total for head in head_importance[layer]]

    return head_importance


def prune_automated_heads(model: GPT2LMHeadModel, head_importance: Dict[int, float], threshold: float = 0.2, min_heads_to_retain: int = 2):
    heads_to_prune = {}
    for layer, importance_scores in head_importance.items():
        heads_to_prune[layer] = [head_idx for head_idx, score in enumerate(importance_scores) if score < threshold]
        
        # Avoid pruning all heads from a layer
        if len(heads_to_prune[layer]) >= model.config.n_head - min_heads_to_retain:
            print(f"Warning: Not pruning layer {layer} to retain at least {min_heads_to_retain} heads.")
            heads_to_prune[layer] = []  # Skip this layer

        print(f"Layer {layer} Head Importance Scores: {importance_scores}")

    # Perform pruning
    model.prune_heads(heads_to_prune)

    # Resize token embeddings if necessary
    model.resize_token_embeddings(len(GPT2Tokenizer.from_pretrained('gpt2')))
    
    # Reset model parameters if necessary
    model.init_weights()  # Ensures weights are reinitialized after structure change
    
    return model


def evaluate_model(model: GPT2LMHeadModel, validation_dataset):
    model.eval()
    training_args = TrainingArguments(
        per_device_eval_batch_size=4,
        output_dir='./results',
        evaluation_strategy="no",
    )
    trainer = Trainer(
        model=model,
        args=training_args
    )
    eval_results = trainer.evaluate(eval_dataset=validation_dataset)
    print(f"Eval Results: {eval_results}")
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    return perplexity


def generate_text(model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer, input_text: str, max_length: int = 50, num_return_sequences: int = 3):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # Generate predictions
    with torch.no_grad():  # No need to track gradients for inference
        outputs = model.generate(input_ids, max_length=max_length, num_return_sequences=num_return_sequences, 
                                 no_repeat_ngram_size=2, 
                                 early_stopping=True,
                                 top_k=50, 
                                 top_p=0.95)

    # Decode the generated sequences
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts


if __name__ == "__main__":
    # Load and preprocess the CNN/DailyMail dataset
    train_data, validation_data = load_and_preprocess_data("cnn_dailymail", "3.0.0", 100, 50)

    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the training and validation data
    train_dataset, validation_dataset = tokenize_data(train_data, validation_data, tokenizer)

    # Initialize the model
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    output_dir = "./results-automated-pruning"

    # Train the model
    trainer = train_and_save_model(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Initial Training')

    # Compute head importance
    head_importance = compute_head_importance(trainer, validation_dataset)

    # Prune heads automatically based on importance
    # model = prune_automated_heads(model, head_importance, threshold=0.2, min_heads_to_retain = 1)
    model = prune_automated_heads(model, head_importance, threshold=0.05, min_heads_to_retain=1)


    # Reinitialize the model after pruning
    model.eval()

    # Train the pruned model
    trainer_pruned = train_and_save_model(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Automated Pruning')

    # Evaluate the pruned model
    perplexity = evaluate_model(model, validation_dataset)
    print(f"Automated Pruning Perplexity: {perplexity:.2f}")

    # Inference with the pruned model
    input_text = "The future of AI in healthcare is"
    generated_texts = generate_text(model, tokenizer, input_text, max_length=200, num_return_sequences=1)

    # Print the generated texts
    for i, text in enumerate(generated_texts):
        print(f"Generated Text {i + 1}: {text}")


Training size: 100, Validation size: 50
Training for Initial Training already completed. Skipping...
Saving model for Initial Training...
Model and tokenizer saved for Initial Training.
Layer 0 Head Importance Scores: [0.08333333361989413, 0.08333333323781307, 0.08333333323781307, 0.08333333285573201, 0.08333333323781307, 0.08333333400197519, 0.08333333361989413, 0.08333333400197519, 0.08333333400197519, 0.08333333247365095, 0.08333333361989413, 0.08333333209156989]
Layer 1 Head Importance Scores: [0.08333333412933554, 0.08333333298309237, 0.08333333451141658, 0.08333333412933554, 0.0833333326010113, 0.0833333318368492, 0.08333333336517341, 0.08333333221893026, 0.08333333374725448, 0.08333333336517341, 0.08333333374725448, 0.08333333336517341]
Layer 2 Head Importance Scores: [0.08333333336517341, 0.08333333336517341, 0.08333333298309237, 0.08333333298309237, 0.08333333298309237, 0.08333333221893026, 0.08333333069060603, 0.08333333527557871, 0.08333333298309237, 0.08333333527557871, 0.0

  0%|          | 0/13 [00:00<?, ?it/s]

Eval Results: {'eval_loss': 3.339203119277954, 'eval_runtime': 18.2681, 'eval_samples_per_second': 2.737, 'eval_steps_per_second': 0.712}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Automated Pruning Perplexity: 28.20
Generated Text 1: The future of AI in healthcare is uncertain. The future is not yet clear.

The Future of Healthcare
...
,
 (1) The Future Of Healthcare is a book that explores the future and the challenges facing healthcare. It is an exploration of the ways in which healthcare will change the way we live, work, and play. This book is intended to be a resource for those who are interested in the development of healthcare and how it will affect their lives.


In [26]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def load_pruned_model_and_tokenizer(output_dir: str):
    # Load the pruned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    tokenizer.pad_token = tokenizer.eos_token
    model.eval()  # Set the model to evaluation mode
    return model, tokenizer

def generate_text(model, tokenizer, input_text: str, max_length: int = 50, num_return_sequences: int = 1):
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate predictions using beam search
    with torch.no_grad():  # No need to track gradients for inference
        outputs = model.generate(input_ids, max_length=max_length, num_return_sequences=num_return_sequences, 
                                 num_beams=num_return_sequences,  # Use beam search
                                 no_repeat_ngram_size=2, 
                                 early_stopping=True,
                                 top_k=50, 
                                 top_p=0.95)

    # Decode the generated sequences
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

if __name__ == "__main__":
    output_dir = "./results-automated-pruning"  # Directory where the pruned model is saved

    # Load the pruned model and tokenizer
    model, tokenizer = load_pruned_model_and_tokenizer(output_dir)

    # Prepare input text for inference
    input_text = "The future of AI in healthcare is"
    
    # Generate predictions
    generated_texts = generate_text(model, tokenizer, input_text, max_length=100, num_return_sequences=1)

    # Print the generated texts
    for i, text in enumerate(generated_texts):
        print(f"Generated Text {i + 1}: {text}")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1: The future of AI in healthcare is uncertain. The future is not yet clear.

The Future of Healthcare
...
,
 (1) The Future Of Healthcare is a book that explores the future and the challenges facing healthcare. It is an exploration of the ways in which healthcare will change the way we live, work, and play. This book is intended to be a resource for those who are interested in the development of healthcare and how it will affect their lives.


In [None]:
# Recommendations for Further Pruning
# To enable more effective pruning, consider these options:

# Lower the Threshold Further: Lowering the pruning threshold (currently set to 0.2) may help identify less important heads for pruning. You could try:

# python
# Copy code
# model = prune_automated_heads(model, head_importance, threshold=0.05, min_heads_to_retain=1)
# Increase Training Dataset Size: Using a larger training dataset could help create more distinct importance scores, enabling better pruning decisions.

# Manual Head Pruning Based on Scores: If automated pruning doesn’t yield effective results, try manual pruning by removing the heads with the lowest importance scores across layers.

# Different Metrics for Calculating Head Importance: You could explore other ways of calculating importance, such as using gradient-based metrics or fine-tuning loss-based metrics to identify heads for pruning.

# Layer-wise Pruning: Instead of pruning heads from every layer, consider focusing on specific layers where pruning is most beneficial, such as middle or later layers in the transformer.

In [30]:
#  GPT-2 Model

import os
import warnings
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)
import torch
from typing import Dict

warnings.filterwarnings("ignore")

# Load and preprocess data function for GPT-2
def load_and_preprocess_data_gpt2(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int):
    dataset = load_dataset(dataset_name, version)
    processed_data = dataset['train'].map(lambda x: {"text": x['article'], "target": x['highlights']})
    split_data = processed_data.train_test_split(test_size=0.1)
    train_data = split_data['train'].select(range(min(sample_train_size, len(split_data['train']))))
    validation_data = split_data['test'].select(range(min(sample_val_size, len(split_data['test']))))
    print(f"GPT-2 Training size: {len(train_data)}, Validation size: {len(validation_data)}")
    return train_data, validation_data

# Tokenization function for GPT-2
def tokenize_data_gpt2(train_data, validation_data, tokenizer):
    def tokenize_function(examples):
        tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
        tokens['labels'] = tokens['input_ids'].copy()
        return tokens

    train_dataset = train_data.map(tokenize_function, batched=True)
    validation_dataset = validation_data.map(tokenize_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    return train_dataset, validation_dataset

# Training function for GPT-2
def train_and_save_model_gpt2(model, tokenizer, train_dataset, validation_dataset, output_dir: str, phase: str):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    if os.path.exists(output_dir) and any('checkpoint' in file for file in os.listdir(output_dir)):
        print(f"GPT-2 Training for {phase} already completed. Skipping...")
    else:
        print(f"Starting training for GPT-2 {phase}...")
        trainer.train()
        print(f"Training for {phase} complete!")

    print(f"Saving GPT-2 model for {phase}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"GPT-2 Model and tokenizer saved for {phase}.")
    return trainer

# Head importance computation for GPT-2
def compute_head_importance_gpt2(trainer: Trainer, eval_dataset) -> Dict[int, float]:
    model = trainer.model
    model.eval()

    head_importance = {}
    for layer in range(model.config.n_layer):
        num_heads = model.config.n_head
        head_importance[layer] = [0.0] * num_heads

    dataloader = trainer.get_eval_dataloader(eval_dataset)

    for batch in dataloader:
        outputs = model(**batch, output_attentions=True)
        attentions = outputs.attentions

        for layer_idx, layer_att in enumerate(attentions):
            layer_mean = layer_att.mean(dim=0)
            for head_idx in range(layer_mean.size(0)):
                head_importance[layer_idx][head_idx] += layer_mean[head_idx].mean().item()

    # Normalize head importance
    for layer in head_importance:
        total = sum(head_importance[layer])
        head_importance[layer] = [head / total for head in head_importance[layer]]

    return head_importance

# Pruning function for GPT-2
def prune_automated_heads_gpt2(model, head_importance: Dict[int, float], threshold: float = 0.2, min_heads_to_retain: int = 2):
    heads_to_prune = {}
    for layer, importance_scores in head_importance.items():
        heads_to_prune[layer] = [head_idx for head_idx, score in enumerate(importance_scores) if score < threshold]
        
        # Avoid pruning all heads from a layer
        if len(heads_to_prune[layer]) >= model.config.n_head - min_heads_to_retain:
            print(f"Warning: Not pruning layer {layer} to retain at least {min_heads_to_retain} heads.")
            heads_to_prune[layer] = []  # Skip this layer

        print(f"Layer {layer} Head Importance Scores: {importance_scores}")

    # Perform pruning
    model.prune_heads(heads_to_prune)

    # Resize token embeddings if necessary
    model.resize_token_embeddings(len(GPT2Tokenizer.from_pretrained('gpt2')))
    
    # Reset model parameters if necessary
    model.init_weights()  # Ensures weights are reinitialized after structure change
    
    return model

# Evaluation function for GPT-2
def evaluate_model_gpt2(model, validation_dataset):
    model.eval()
    training_args = TrainingArguments(
        per_device_eval_batch_size=4,
        output_dir='./results',
        evaluation_strategy="no",
    )
    trainer = Trainer(
        model=model,
        args=training_args
    )
    eval_results = trainer.evaluate(eval_dataset=validation_dataset)
    print(f"Eval Results: {eval_results}")
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    return perplexity

# Text generation function for GPT-2
def generate_text_gpt2(model, tokenizer, input_text: str, max_length: int = 50, num_return_sequences: int = 3):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():  # No need to track gradients for inference
        outputs = model.generate(input_ids, max_length=max_length, num_return_sequences=num_return_sequences, 
                                 no_repeat_ngram_size=2, 
                                 early_stopping=True,
                                 top_k=50, 
                                 top_p=0.95)

    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

# Main execution for GPT-2
if __name__ == "__main__":
    # Load and preprocess the CNN/DailyMail dataset for GPT-2
    train_data, validation_data = load_and_preprocess_data_gpt2("cnn_dailymail", "3.0.0", 100, 50)

    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the training and validation data
    train_dataset, validation_dataset = tokenize_data_gpt2(train_data, validation_data, tokenizer)

    # Initialize the model
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    output_dir = "./results-gpt2-automated-pruning"

    # Train the model
    trainer = train_and_save_model_gpt2(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Initial Training')

    # Compute head importance
    head_importance = compute_head_importance_gpt2(trainer, validation_dataset)

    # Prune heads automatically based on importance
    model = prune_automated_heads_gpt2(model, head_importance, threshold=0.05, min_heads_to_retain=1)

    # Reinitialize the model after pruning
    model.eval()

    # Train the pruned model
    trainer_pruned = train_and_save_model_gpt2(model, tokenizer, train_dataset, validation_dataset, output_dir, phase='Automated Pruning')

    # Evaluate the pruned model
    perplexity = evaluate_model_gpt2(model, validation_dataset)
    print(f"GPT-2 Automated Pruning Perplexity: {perplexity:.2f}")

    # Inference with the pruned model
    input_text = "The future of AI in healthcare is"
    generated_texts = generate_text_gpt2(model, tokenizer, input_text, max_length=200, num_return_sequences=1)

    # Print the generated texts
    for i, text in enumerate(generated_texts):
        print(f"Generated Text {i + 1}: {text}")


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

GPT-2 Training size: 100, Validation size: 50


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Starting training for GPT-2 Initial Training...


  0%|          | 0/25 [00:00<?, ?it/s]

{'loss': 3.6498, 'grad_norm': 12.507431030273438, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.4}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.3359274864196777, 'eval_runtime': 20.0587, 'eval_samples_per_second': 2.493, 'eval_steps_per_second': 0.648, 'epoch': 0.4}
{'loss': 3.6782, 'grad_norm': 11.138589859008789, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.8}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.3262970447540283, 'eval_runtime': 19.2813, 'eval_samples_per_second': 2.593, 'eval_steps_per_second': 0.674, 'epoch': 0.8}
{'train_runtime': 172.6697, 'train_samples_per_second': 0.579, 'train_steps_per_second': 0.145, 'train_loss': 3.6711663818359375, 'epoch': 1.0}
Training for Initial Training complete!
Saving GPT-2 model for Initial Training...
GPT-2 Model and tokenizer saved for Initial Training.
Layer 0 Head Importance Scores: [0.08333333454325668, 0.08333333530741881, 0.08333333263285139, 0.08333333225077033, 0.08333333530741881, 0.08333333263285139, 0.08333333301493245, 0.08333333263285139, 0.08333333301493245, 0.08333333377909456, 0.08333333263285139, 0.08333333225077033]
Layer 1 Head Importance Scores: [0.08333333333333333, 0.08333333524373862, 0.08333333256917122, 0.08333333409749545, 0.08333333371541439, 0.08333333371541439, 0.08333333409749545, 0.0833333318050091, 0.08333333409749545, 0.08333333409749545, 0.08333333218709016, 0.08333333104084699]
Layer 2 Hea

  0%|          | 0/13 [00:00<?, ?it/s]

Eval Results: {'eval_loss': 3.3204500675201416, 'eval_runtime': 18.8328, 'eval_samples_per_second': 2.655, 'eval_steps_per_second': 0.69}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT-2 Automated Pruning Perplexity: 27.67
Generated Text 1: The future of AI in healthcare is uncertain. The future is not yet clear.

The Future of Healthcare
...
,
 (1) The Future Of Healthcare is a new book by Dr. David S. Siegel, PhD, a professor of medicine at the University of California, San Francisco, and a co-author of the new paper. It is available from Amazon.com. (2) Drs.Siegel and Sussman are coauthors of a paper in the journal Science Advances. They are also coauthoring the paper with Dr David A. Schoenfeld, MD, of Harvard Medical School. Dr Skelton is also a member of The American Academy of Pediatrics.


In [None]:
# RoBERTa Model


import glob
import numpy as np
import torch
import torch.nn.utils.prune as prune
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate
from functools import partial

# Set parameters
BATCH_SIZE = 32
NUM_PROCS = 4
LR = 0.00005
EPOCHS = 1
MODEL = 'roberta-base'  # Change to 'roberta-base'
OUT_DIR = 'arxiv_roberta'  # Update the output directory

# Load datasets (using a sample for demonstration)
train_dataset = load_dataset("ccdv/arxiv-classification", split='train[:5%]')
valid_dataset = load_dataset("ccdv/arxiv-classification", split='validation[:10%]')
test_dataset = load_dataset("ccdv/arxiv-classification", split='test[:10%]')

# Label mapping
id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY", 4: "math.GR",
            5: "cs.CE", 6: "cs.PL", 7: "cs.IT", 8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
label2id = {v: k for k, v in id2label.items()}

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess_function(tokenizer, examples):
    return tokenizer(examples["text"], truncation=True)

# Tokenize the datasets
tokenized_train = train_dataset.map(partial(preprocess_function, tokenizer), batched=True, num_proc=NUM_PROCS)
tokenized_valid = valid_dataset.map(partial(preprocess_function, tokenizer), batched=True, num_proc=NUM_PROCS)
tokenized_test = test_dataset.map(partial(preprocess_function, tokenizer), batched=True, num_proc=NUM_PROCS)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric computation
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Model training setup
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=11, id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard',
    # Remove the following line
    # fp16=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Pruning function for dynamic pruning
def dynamic_prune_model(model, amount=0.2, steps=5):
    """Prune the model iteratively during training."""
    for step in range(steps):
        print(f"Pruning step {step + 1}/{steps}")
        # Prune weights in all linear layers
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=amount)
        # Fine-tune the model after pruning
        trainer.train()
        # Remove pruning and make it permanent
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                prune.remove(module, 'weight')

# Apply dynamic pruning
dynamic_prune_model(model, amount=0.1, steps=3)

# Save the pruned model
model.save_pretrained('pruned_arxiv_roberta')
tokenizer.save_pretrained('pruned_arxiv_roberta')

# Load the pruned model for inference
model = AutoModelForSequenceClassification.from_pretrained('pruned_arxiv_roberta')
tokenizer = AutoTokenizer.from_pretrained('pruned_arxiv_roberta')
classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

# Evaluate the pruned model on the test dataset
eval_result = trainer.evaluate(tokenized_test)
print(f"Evaluation results after pruning: {eval_result}")

# Inference with the pruned model
all_files = glob.glob('inference_data/*')
for file_name in all_files:
    with open(file_name) as file:
        content = file.read()
        result = classify(content)
        print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')



In [1]:
# Final Codes

In [None]:
# ALBERT Model

import glob
import numpy as np
import torch
import torch.nn.utils.prune as prune
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

def run_albert_pruning_pipeline(
    dataset_name="ccdv/arxiv-classification",
    model_name='albert-base-v2',
    out_dir='arxiv_albert',
    batch_size=32,
    num_procs=4,
    lr=0.00005,
    epochs=1,
    pruning_amount=0.1,
    pruning_steps=3,
):
    """
    Train and prune the ALBERT model for text classification on the specified dataset.

    Args:
        dataset_name (str): Name of the dataset to use.
        model_name (str): Pre-trained model name.
        out_dir (str): Output directory for saving the model.
        batch_size (int): Batch size for training and evaluation.
        num_procs (int): Number of processes for tokenization.
        lr (float): Learning rate for training.
        epochs (int): Number of training epochs.
        pruning_amount (float): Fraction of weights to prune in each step.
        pruning_steps (int): Number of pruning steps.

    Returns:
        None
    """
    
    # Load datasets
    train_dataset = load_dataset(dataset_name, split='train[:95%]')
    valid_dataset = load_dataset(dataset_name, split='validation[:100%]')
    test_dataset = load_dataset(dataset_name, split='test[:100%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY", 
                4: "math.GR", 5: "cs.CE", 6: "cs.PL", 7: "cs.IT", 
                8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # Tokenize the datasets
    tokenized_train = train_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_test = test_dataset.map(preprocess_function, batched=True, num_proc=num_procs)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=11, id2label=id2label, label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Dynamic pruning
    def dynamic_prune_model(model, amount=0.2, steps=5):
        """Prune the model iteratively during training."""
        for step in range(steps):
            print(f"Pruning step {step + 1}/{steps}")
            # Prune weights in all linear layers
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.l1_unstructured(module, name='weight', amount=amount)
            # Fine-tune the model after pruning
            trainer.train()
            # Remove pruning and make it permanent
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.remove(module, 'weight')

    # Apply dynamic pruning
    dynamic_prune_model(model, amount=pruning_amount, steps=pruning_steps)

    # Save the pruned model
    model.save_pretrained('pruned_arxiv_albert')
    tokenizer.save_pretrained('pruned_arxiv_albert')

    # Load the pruned model for inference
    model = AutoModelForSequenceClassification.from_pretrained('pruned_arxiv_albert')
    tokenizer = AutoTokenizer.from_pretrained('pruned_arxiv_albert')
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    # Evaluate the pruned model on the test dataset
    eval_result = trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning: {eval_result}")

    # Inference with the pruned model
    all_files = glob.glob('inference_data/*')
    for file_name in all_files:
        try:
            with open(file_name) as file:
                content = file.read()
                result = classify(content)
                print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Call the function with custom arguments
run_albert_pruning_pipeline(
    dataset_name="ccdv/arxiv-classification",
    model_name='albert-base-v2',
    out_dir='arxiv_albert',
    batch_size=32,
    num_procs=4,
    lr=0.00005,
    epochs=50,
    pruning_amount=0.1,
    pruning_steps=3
)






# DistilBERT Model

import glob
import numpy as np
import torch
import torch.nn.utils.prune as prune
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

def run_distilbert_pipeline(dataset_name="ccdv/arxiv-classification", 
                            train_split='train[:95%]', 
                            valid_split='validation[:100%]', 
                            test_split='test[:100%]', 
                            model_name='distilbert-base-uncased', 
                            batch_size=32, 
                            num_procs=4, 
                            lr=0.00005, 
                            epochs=1, 
                            prune_amount=0.1, 
                            prune_steps=3, 
                            out_dir='arxiv_distilbert'):
    """
    Executes the DistilBERT model pipeline including dataset loading, tokenization,
    model training, dynamic pruning, evaluation, and inference.

    Args:
        dataset_name (str): Name of the dataset to load.
        train_split (str): Slice of the training data.
        valid_split (str): Slice of the validation data.
        test_split (str): Slice of the test data.
        model_name (str): Name of the pretrained DistilBERT model.
        batch_size (int): Batch size for training and evaluation.
        num_procs (int): Number of processes for tokenization.
        lr (float): Learning rate for training.
        epochs (int): Number of epochs for training.
        prune_amount (float): Fraction of weights to prune.
        prune_steps (int): Number of pruning steps.
        out_dir (str): Output directory for saving the model.

    Returns:
        None: Prints evaluation results and predictions for inference data.
    """
    # Load datasets
    train_dataset = load_dataset(dataset_name, split=train_split)
    valid_dataset = load_dataset(dataset_name, split=valid_split)
    test_dataset = load_dataset(dataset_name, split=test_split)

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY", 
                4: "math.GR", 5: "cs.CE", 6: "cs.PL", 7: "cs.IT", 
                8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # Tokenize the datasets
    tokenized_train = train_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True, num_proc=num_procs)
    tokenized_test = test_dataset.map(preprocess_function, batched=True, num_proc=num_procs)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')
    
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id)

    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Pruning function for dynamic pruning
    def dynamic_prune_model(model, amount=prune_amount, steps=prune_steps):
        """Prune the model iteratively during training."""
        for step in range(steps):
            print(f"Pruning step {step + 1}/{steps}")
            # Prune weights in all linear layers
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.l1_unstructured(module, name='weight', amount=amount)
            # Fine-tune the model after pruning
            trainer.train()
            # Remove pruning and make it permanent
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.remove(module, 'weight')

    # Apply dynamic pruning
    dynamic_prune_model(model)

    # Save the pruned model
    model.save_pretrained('pruned_arxiv_distilbert')
    tokenizer.save_pretrained('pruned_arxiv_distilbert')

    # Load the pruned model for inference
    model = AutoModelForSequenceClassification.from_pretrained('pruned_arxiv_distilbert')
    tokenizer = AutoTokenizer.from_pretrained('pruned_arxiv_distilbert')
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    # Evaluate the pruned model on the test dataset
    eval_result = trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning: {eval_result}")

    # Inference with the pruned model
    all_files = glob.glob('inference_data/*')
    for file_name in all_files:
        with open(file_name) as file:
            content = file.read()
            result = classify(content)
            print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')

if __name__ == "__main__":
  # Call the function with specified arguments
  run_distilbert_pipeline(
      dataset_name="ccdv/arxiv-classification", 
      train_split='train[:95%]', 
      valid_split='validation[:85%]', 
      test_split='test[:85%]', 
      model_name='distilbert-base-uncased', 
      batch_size=32, 
      num_procs=4, 
      lr=0.00005, 
      epochs=50, 
      prune_amount=0.1, 
      prune_steps=3, 
      out_dir='arxiv_distilbert'
)

# T5 Model

import os
import warnings
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)
from typing import Dict, Tuple

warnings.filterwarnings("ignore")

def run_t5_model_pipeline(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int, output_dir: str):
    """
    Executes the T5 model pipeline including data loading, preprocessing, training, head importance computation, pruning, and evaluation.

    Args:
        dataset_name (str): Name of the dataset to load (e.g., "cnn_dailymail").
        version (str): Version of the dataset to load.
        sample_train_size (int): Number of training samples to use.
        sample_val_size (int): Number of validation samples to use.
        output_dir (str): Directory to save the trained model and tokenizer.

    Returns:
        None: The function prints results and saves the model.
    """

    # Load and preprocess data
    dataset = load_dataset(dataset_name, version)
    processed_data = dataset['train'].map(lambda x: {"input_text": x['article'], "target_text": x['highlights']})
    split_data = processed_data.train_test_split(test_size=0.1)
    train_data = split_data['train'].select(range(min(sample_train_size, len(split_data['train']))))
    validation_data = split_data['test'].select(range(min(sample_val_size, len(split_data['test']))))
    print(f"T5 Training size: {len(train_data)}, Validation size: {len(validation_data)}")

    # Tokenization
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    
    def tokenize_function(examples):
        inputs = tokenizer(examples['input_text'], truncation=True, padding="max_length", max_length=128)
        labels = tokenizer(examples['target_text'], truncation=True, padding="max_length", max_length=128)
        inputs['labels'] = labels['input_ids']
        return inputs

    train_dataset = train_data.map(tokenize_function, batched=True)
    validation_dataset = validation_data.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Training
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=50,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    if os.path.exists(output_dir) and any('checkpoint' in file for file in os.listdir(output_dir)):
        print(f"T5 Training already completed. Skipping...")
    else:
        print(f"Starting training for T5...")
        trainer.train()
        print(f"Training complete!")

    print(f"Saving T5 model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"T5 Model and tokenizer saved.")

    # Compute head importance
    model.eval()
    head_importance = {layer: [0.0] * model.config.num_heads for layer in range(model.config.num_layers)}
    dataloader = trainer.get_eval_dataloader(validation_dataset)

    for batch in dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            decoder_input_ids = labels[:, :-1]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, output_attentions=True, return_dict=True)
            encoder_attentions = outputs.encoder_attentions
            decoder_attentions = outputs.decoder_attentions

            for layer_idx in range(len(encoder_attentions)):
                layer_mean = encoder_attentions[layer_idx].mean(dim=(0, 1, 2)).detach()
                for head_idx in range(min(layer_mean.size(0), len(head_importance[layer_idx]))):
                    head_importance[layer_idx][head_idx] += layer_mean[head_idx].item()

            for layer_idx in range(len(decoder_attentions)):
                layer_dec_mean = decoder_attentions[layer_idx].mean(dim=(0, 1, 2)).detach()
                for head_idx in range(min(layer_dec_mean.size(0), len(head_importance[layer_idx]))):
                    head_importance[layer_idx][head_idx] += layer_dec_mean[head_idx].item()

    for layer in head_importance:
        total = sum(head_importance[layer])
        if total > 0:
            head_importance[layer] = [head / total for head in head_importance[layer]]
        else:
            head_importance[layer] = [0.0] * model.config.num_heads

    heads_to_prune = {layer: [head_idx for head_idx, score in enumerate(head_importance[layer]) if score < 0.05] for layer in head_importance}
    
    # Prune heads
    for layer, heads in heads_to_prune.items():
        if heads:
            model.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
            model.decoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    model.eval()
    trainer_pruned = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )
    
    # Evaluate
    eval_results = trainer_pruned.evaluate(eval_dataset=validation_dataset)
    print(f"Eval Results: {eval_results}")
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    print(f"T5 Automated Pruning Perplexity: {perplexity:.2f}")

if __name__ == "__main__":
    run_t5_model_pipeline("cnn_dailymail", "3.0.0", 27000, 10000, "./results-t5-automated-pruning")



# RoBERTa Model

import glob
import numpy as np
import torch
import torch.nn.utils.prune as prune
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

def roberta_pipeline(model_name: str, 
                    dataset_name: str, 
                    sample_fraction: float, 
                    out_dir: str, 
                    batch_size: int = 16, 
                    num_epochs: int = 5, 
                    learning_rate: float = 5e-5,
                    pruning_amount: float = 0.1, 
                    pruning_steps: int = 3):
    """
    Complete RoBERTa pipeline including loading data, training, pruning, evaluation, and text classification.

    Args:
        model_name (str): Name of the pretrained RoBERTa model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction of the dataset to use for training/validation/testing.
        out_dir (str): Directory to save the model.
        batch_size (int): Training batch size. Default is 16.
        num_epochs (int): Number of training epochs. Default is 5.
        learning_rate (float): Learning rate for training. Default is 5e-5.
        pruning_amount (float): Fraction of weights to prune. Default is 0.1.
        pruning_steps (int): Number of pruning steps to perform. Default is 3.
    """

    # Load datasets
    train_dataset = load_dataset(dataset_name, split=f'train[:{int(sample_fraction * 100)}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{int(sample_fraction * 100)}%]')
    test_dataset = load_dataset(dataset_name, split=f'test[:{int(sample_fraction * 100)}%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY", 4: "math.GR",
                5: "cs.CE", 6: "cs.PL", 7: "cs.IT", 8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id)
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Dynamic pruning function
    def dynamic_prune_model(model, amount=0.2, steps=5):
        """Prune the model iteratively during training."""
        for step in range(steps):
            print(f"Pruning step {step+1}/{steps}")
            # Prune weights in all linear layers
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.l1_unstructured(module, name='weight', amount=amount)
            # Fine-tune the model after pruning
            trainer.train()
            # Remove pruning and make it permanent
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.remove(module, 'weight')

    # Apply dynamic pruning
    dynamic_prune_model(model, amount=pruning_amount, steps=pruning_steps)

    # Save the pruned model
    model.save_pretrained(f'pruned_{out_dir}')
    tokenizer.save_pretrained(f'pruned_{out_dir}')

    # Load the pruned model for inference
    model = AutoModelForSequenceClassification.from_pretrained(f'pruned_{out_dir}')
    tokenizer = AutoTokenizer.from_pretrained(f'pruned_{out_dir}')
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    # Evaluate the pruned model on the test dataset
    eval_result = trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning: {eval_result}")

    # Inference with the pruned model
    all_files = glob.glob('inference_data/*')
    for file_name in all_files:
        with open(file_name) as file:
            content = file.read()
            result = classify(content)
            print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')

# Main execution for RoBERTa
if __name__ == "__main__":
    roberta_pipeline(
        model_name='roberta-base',
        dataset_name='ccdv/arxiv-classification',
        sample_fraction=0.95,  # Use 95% of training data
        out_dir='arxiv_roberta',
        batch_size=16,
        num_epochs=50,
        learning_rate=5e-5,
        pruning_amount=0.1,
        pruning_steps=3
    )


# BERT Model
import glob
import numpy as np
import torch
import torch.nn.utils.prune as prune
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

def bert_pipeline(model_name: str, 
                 dataset_name: str, 
                 sample_fraction: float, 
                 out_dir: str, 
                 batch_size: int = 32, 
                 num_epochs: int = 3, 
                 learning_rate: float = 5e-5,
                 pruning_amount: float = 0.1, 
                 pruning_steps: int = 3):
    """
    Complete BERT pipeline including loading data, training, pruning, evaluation, and text classification.

    Args:
        model_name (str): Name of the pretrained BERT model.
        dataset_name (str): Name of the dataset to be loaded.
        sample_fraction (float): Fraction of the dataset to use for training/validation/testing.
        out_dir (str): Directory to save the model.
        batch_size (int): Training batch size. Default is 32.
        num_epochs (int): Number of training epochs. Default is 3.
        learning_rate (float): Learning rate for training. Default is 5e-5.
        pruning_amount (float): Fraction of weights to prune. Default is 0.1.
        pruning_steps (int): Number of pruning steps to perform. Default is 3.
    """
    
    # Load datasets
    train_dataset = load_dataset(dataset_name, split=f'train[:{int(sample_fraction * 100)}%]')
    valid_dataset = load_dataset(dataset_name, split=f'validation[:{int(sample_fraction * 100)}%]')
    test_dataset = load_dataset(dataset_name, split=f'test[:{int(sample_fraction * 100)}%]')

    # Label mapping
    id2label = {0: "math.AC", 1: "cs.CV", 2: "cs.AI", 3: "cs.SY", 4: "math.GR",
                5: "cs.CE", 6: "cs.PL", 7: "cs.IT", 8: "cs.DS", 9: "cs.NE", 10: "math.ST"}
    label2id = {v: k for k, v in id2label.items()}

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric computation
    accuracy = evaluate.load('accuracy')
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    # Model training setup
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id)
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Dynamic pruning function
    def dynamic_prune_model(model, amount=0.2, steps=5):
        """Prune the model iteratively during training."""
        for step in range(steps):
            print(f"Pruning step {step+1}/{steps}")
            # Prune weights in all linear layers
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.l1_unstructured(module, name='weight', amount=amount)
            # Fine-tune the model after pruning
            trainer.train()
            # Remove pruning and make it permanent
            for name, module in model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    prune.remove(module, 'weight')

    # Apply dynamic pruning
    dynamic_prune_model(model, amount=pruning_amount, steps=pruning_steps)

    # Save the pruned model
    model.save_pretrained(f'pruned_{out_dir}')
    tokenizer.save_pretrained(f'pruned_{out_dir}')

    # Load the pruned model for inference
    model = AutoModelForSequenceClassification.from_pretrained(f'pruned_{out_dir}')
    tokenizer = AutoTokenizer.from_pretrained(f'pruned_{out_dir}')
    classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

    # Evaluate the pruned model on the test dataset
    eval_result = trainer.evaluate(tokenized_test)
    print(f"Evaluation results after pruning: {eval_result}")

    # Inference with the pruned model
    all_files = glob.glob('inference_data/*')
    for file_name in all_files:
        with open(file_name) as file:
            content = file.read()
            result = classify(content)
            print(f'File: {file_name}, Prediction: {result}, Ground Truth: {file_name.split("_")[-1].split(".txt")[0]}')

# Main execution for BERT
if __name__ == "__main__":
    bert_pipeline(
        model_name='bert-base-uncased',
        dataset_name='ccdv/arxiv-classification',
        sample_fraction=0.8,
        out_dir='arxiv_bert',
        batch_size=32,
        num_epochs=50,
        learning_rate=5e-5,
        pruning_amount=0.1,
        pruning_steps=3
    )


# GPT2 Model
import os
import warnings
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)
import torch
from typing import Dict, List

warnings.filterwarnings("ignore")

def gpt2_pipeline(dataset_name: str, version: str, sample_train_size: int, sample_val_size: int, output_dir: str, input_text: str):
    """Complete GPT-2 pipeline including loading data, training, pruning, evaluation, and text generation."""
    
    # Load and preprocess data
    dataset = load_dataset(dataset_name, version)
    processed_data = dataset['train'].map(lambda x: {"text": x['article'], "target": x['highlights']})
    split_data = processed_data.train_test_split(test_size=0.1)
    train_data = split_data['train'].select(range(min(sample_train_size, len(split_data['train']))))
    validation_data = split_data['test'].select(range(min(sample_val_size, len(split_data['test']))))
    
    # Load tokenizer and set padding token
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenization function
    def tokenize_data(data):
        def tokenize_function(examples):
            tokens = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
            tokens['labels'] = tokens['input_ids'].copy()
            return tokens

        return data.map(tokenize_function, batched=True).set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    train_dataset = tokenize_data(train_data)
    validation_dataset = tokenize_data(validation_data)

    # Initialize model
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=50,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        save_steps=10,
        evaluation_strategy="steps",
        eval_steps=10,
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset
    )

    # Train the model
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Compute head importance
    model.eval()
    head_importance = {}
    for layer in range(model.config.n_layer):
        head_importance[layer] = [0.0] * model.config.n_head

    dataloader = trainer.get_eval_dataloader(validation_dataset)

    for batch in dataloader:
        outputs = model(**batch, output_attentions=True)
        attentions = outputs.attentions

        for layer_idx, layer_att in enumerate(attentions):
            layer_mean = layer_att.mean(dim=0)
            for head_idx in range(layer_mean.size(0)):
                head_importance[layer_idx][head_idx] += layer_mean[head_idx].mean().item()

    for layer in head_importance:
        total = sum(head_importance[layer])
        head_importance[layer] = [head / total for head in head_importance[layer]]

    # Prune heads based on importance
    heads_to_prune = {}
    for layer, importance_scores in head_importance.items():
        heads_to_prune[layer] = [head_idx for head_idx, score in enumerate(importance_scores) if score < 0.2]
        if len(heads_to_prune[layer]) >= model.config.n_head - 1:
            heads_to_prune[layer] = []  # Skip pruning if too many heads would be removed

    model.prune_heads(heads_to_prune)

    # Train the pruned model
    trainer.train()
    trainer.save_model(output_dir)

    # Evaluate the pruned model
    eval_results = trainer.evaluate(eval_dataset=validation_dataset)
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss'])).item()
    print(f"GPT-2 Automated Pruning Perplexity: {perplexity:.2f}")

    # Text generation
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

# Main execution for GPT-2
if __name__ == "__main__":
    generated = gpt2_pipeline("cnn_dailymail", "3.0.0", 27000, 10000, "./results-gpt2-automated-pruning", "The future of AI in healthcare is")
    for i, text in enumerate(generated):
        print(f"Generated Text {i + 1}: {text}")


In [None]:
GPT2 model: Perplexity: 25.96

BERT Model: Accuracy: 85.95
RoBERTa Model: Accuracy: 81.69
T5Model: Perplexity: 155.25
DistilBERT Model: Accuracy: 89.25
ALBERT Model: Accuracy: 75.69

In [None]:
# Model Evaluation and Results

This repository contains the implementation and evaluation of various transformer models for natural language processing tasks. The primary models evaluated include GPT-2, BERT, RoBERTa, T5, DistilBERT, and ALBERT.

## Models and Evaluation Metrics

The following models have been evaluated with their respective metrics:

| Model                | Evaluation Accuracy | Perplexity | Model Size After Pruning |
|----------------------|---------------------|------------|--------------------------|
| ALBERT               | 75.69%              | -          | 11 MB                    |
| BERT                 | 85.95%              | -          | 420 MB                   |
| DistilBERT           | 89.25%              | -          | 66 MB                    |
| RoBERTa              | 81.69%              | -          | 500 MB                   |
| GPT-2                | -                   | 25.96      | 345 MB                   |
| T5                   | -                   | 155.25     | 220 MB                   |

## Dependencies

To run the models in this repository, you will need the following Python packages:

- `torch`
- `transformers`
- `datasets`
- `evaluate`
- `numpy`

You can install the required packages using pip:

```bash
pip install torch transformers datasets evaluate numpy


In [None]:
# Using Unsloth for fine-tuning the LLM-transformer model
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
def fine_tune_llm_qwen2_5_or_llama3_or_phi3_5(model_name = "unsloth/Qwen2.5-0.5B",dataset_name = "mlabonne/FineTome-100k",epochs = 5):
    try:
        if "Qwen2" in model_name:
            chat_template = "qwen-2.5"
        elif "Llama-3.2" in model_name:
            chat_template = "llama-3.1"
        elif "Llama" in model_name:
            chat_template = "llama-3.1"
        elif "Phi" in model_name:
            chat_template = "phi-3"


        from unsloth import FastLanguageModel
        import torch
        max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
        dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
        load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

        # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
        fourbit_models = [
            "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
            "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
            "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
            "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
            "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
            "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
            "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
            "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
            "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
            "unsloth/Phi-3-medium-4k-instruct",
            "unsloth/gemma-2-9b-bnb-4bit",
            "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
        ] # More models at https://huggingface.co/unsloth

        model, tokenizer = FastLanguageModel.from_pretrained(
            # Can select any from the below:
            # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
            # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
            # And also all Instruct versions and Math. Coding verisons!
            model_name = model_name,
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
            # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
        )

        model = FastLanguageModel.get_peft_model(
                    model,
                    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                                    "gate_proj", "up_proj", "down_proj",],
                    lora_alpha = 16,
                    lora_dropout = 0, # Supports any, but = 0 is optimized
                    bias = "none",    # Supports any, but = "none" is optimized
                    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
                    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
                    random_state = 3407,
                    use_rslora = False,  # We support rank stabilized LoRA
                    loftq_config = None, # And LoftQ
                )


        def formatting_prompts_func(examples):
            convos = examples["conversations"]
            texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
            return { "text" : texts, }
        pass

        from datasets import load_dataset
        dataset = load_dataset(dataset_name, split = "train")
        from unsloth.chat_templates import standardize_sharegpt
        dataset = standardize_sharegpt(dataset)
        dataset = dataset.map(formatting_prompts_func, batched = True,)
        from rich import print
        print(dataset)
        print(dataset[5]["conversations"])
        print(dataset[5]["text"])
        from trl import SFTTrainer
        from transformers import TrainingArguments, DataCollatorForSeq2Seq
        from unsloth import is_bfloat16_supported

        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = dataset,
            dataset_text_field = "text",
            max_seq_length = max_seq_length,
            data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
            dataset_num_proc = 2,
            packing = False, # Can make training 5x faster for short sequences.
            args = TrainingArguments(
                per_device_train_batch_size = 2,
                gradient_accumulation_steps = 4,
                warmup_steps = epochs,
                num_train_epochs = 10, # Set this for 1 full training run.
                # max_steps = 60,
                learning_rate = 2e-4,
                fp16 = not is_bfloat16_supported(),
                bf16 = is_bfloat16_supported(),
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.01,
                lr_scheduler_type = "linear",
                seed = 3407,
                output_dir = "outputs",
            ),
        )
        from unsloth.chat_templates import train_on_responses_only
        trainer = train_on_responses_only(
            trainer,
            instruction_part = "<|im_start|>user\n",
            response_part = "<|im_start|>assistant\n",
        )
        print(tokenizer.decode(trainer.train_dataset[5]["input_ids"]))
        space = tokenizer(" ", add_special_tokens = False).input_ids[0]
        print(tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]))
        #@title Show current memory stats
        gpu_stats = torch.cuda.get_device_properties(0)
        start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
        max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
        print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
        print(f"{start_gpu_memory} GB of memory reserved.")
        trainer_stats = trainer.train()
        #@title Show final memory and time stats
        used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
        used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
        used_percentage = round(used_memory         /max_memory*100, 3)
        lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
        print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
        print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
        print(f"Peak reserved memory = {used_memory} GB.")
        print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
        print(f"Peak reserved memory % of max memory = {used_percentage} %.")
        print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
        from unsloth.chat_templates import get_chat_template

        tokenizer = get_chat_template(
            tokenizer,
            chat_template = chat_template,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference

        messages = [
            {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
        ]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize = True,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")

        outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                                temperature = 1.5, min_p = 0.1)
        print(tokenizer.batch_decode(outputs))

        FastLanguageModel.for_inference(model) # Enable native 2x faster inference
        messages = [
            {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
        ]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize = True,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")

        from transformers import TextStreamer
        text_streamer = TextStreamer(tokenizer, skip_prompt = True)
        _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                        use_cache = True, temperature = 1.5, min_p = 0.1)
        
        model.save_pretrained("lora_model") # Local saving
        tokenizer.save_pretrained("lora_model")
    except Exception as e:
        print(e)
