# In [None]:
import math
from typing import Callable

import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, TaskType, PeftModel, get_peft_model
from transformers import TrainingArguments, Trainer
from evaluate import load

def load_data(input: str) -> datasets.DatasetDict:
    """
    Load a dataset from the Hugging Face Hub or a local file in CSV or JSON format.

    The input is first handed to ``datasets.load_dataset`` unchanged. If that
    fails with ``ValueError`` or ``FileNotFoundError``, the input is treated as
    the path of a local file and loaded with the builder matching its extension.

    Args:
        input (str): The name of the dataset on the Hub or the path of the local file.

    Returns:
        datasets.DatasetDict: A dictionary of datasets with splits as keys.

    Raises:
        ValueError: If the direct load raised ``ValueError`` and the extension
            is neither ``csv`` nor ``json``.
        FileNotFoundError: If the direct load could not find the dataset and the
            extension is neither ``csv`` nor ``json`` (re-raised unchanged).
    """
    try:
        return datasets.load_dataset(input)
    # An unknown name or path surfaces as FileNotFoundError, which is not a
    # ValueError; without catching it the local-file fallback is never reached.
    except (ValueError, FileNotFoundError) as err:
        ext = input.split(".")[-1]
        builder = ext.lower()  # accept "data.CSV" as well as "data.csv"
        if builder in ("csv", "json"):
            return datasets.load_dataset(builder, data_files=input)
        if isinstance(err, ValueError):
            raise ValueError(f"Unsupported file format: {ext}") from err
        # Keep the library's own "not found" error for names that are not files.
        raise

def process_data(dataset: datasets.DatasetDict, func: Callable, batched: bool = True) -> datasets.DatasetDict:
    """
    Process a dataset using a custom function.

    Args:
        dataset (datasets.DatasetDict): A dictionary of datasets with splits as keys.
        func (Callable): A function that takes a dictionary of features as input and
            returns a modified dictionary of features as output. With ``batched=True``
            each feature value is a list covering a batch of rows.
        batched (bool): Forwarded to ``DatasetDict.map``. Defaults to ``True``,
            which is what this function always used before the parameter existed.

    Returns:
        datasets.DatasetDict: A dictionary of processed datasets with splits as keys.
    """
    return dataset.map(func, batched=batched)

def load_tokenizer(input: str) -> AutoTokenizer:
    """
    Load a tokenizer from a model name or a local path.

    Args:
        input (str): The model name on the Hub or the local path of the model/tokenizer.

    Returns:
        AutoTokenizer: An instance of the tokenizer that corresponds to the model.

    Raises:
        ValueError: If ``AutoTokenizer.from_pretrained`` raises ``OSError`` for
            the given name or path. The original error is attached as the cause.
    """
    try:
        return AutoTokenizer.from_pretrained(input)
    except OSError as err:
        # Chain explicitly so the underlying loading error stays visible.
        raise ValueError(f"Invalid model name or path: {input}") from err

def generate_text(input: str, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, max_length: int = 200) -> str:
    """
    Generate text from a given input using the model and tokenizer.

    Args:
        input (str): The input text to generate text from.
        model (AutoModelForCausalLM): The model to use for generating text.
        tokenizer (AutoTokenizer): The tokenizer to use for encoding and decoding the input and output text.
        max_length (int): Upper bound on the total sequence length (prompt plus
            generated tokens) passed to ``model.generate``. Defaults to 200.

    Returns:
        str: The decoded first generated sequence, with special tokens removed.
    """
    # Tensors must live on the same device as the model, which is typically a
    # GPU after training; encoding alone leaves them on the CPU.
    encoded = tokenizer(input, return_tensors="pt").to(model.device)
    # Keyword arguments only: PEFT-wrapped models may not accept a positional
    # input_ids, and an explicit attention mask avoids ambiguity when the pad
    # and end-of-sequence tokens coincide.
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# LoRA adapter settings. target_modules must name layers that exist in the
# base model chosen below.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# "t5-base" is an encoder-decoder checkpoint: AutoModelForCausalLM rejects it
# and it has no q_proj/v_proj layers. Use a decoder-only checkpoint whose
# attention layers match target_modules.
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Padding below needs a pad token; fall back to end-of-sequence.
    tokenizer.pad_token = tokenizer.eos_token

# wikitext ships several configurations, so one has to be named explicitly.
dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1")
# Drop blank rows so they are neither trained on nor used as empty prompts.
dataset = dataset.filter(lambda example: bool(example["text"].strip()))
tokenized_dataset = process_data(
    dataset,
    lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512),
)

# get_peft_model attaches fresh adapters described by a config;
# PeftModel.from_pretrained instead expects the location of saved adapters.
model = get_peft_model(model, lora_config)

training_args = transformers.TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    fp16=True,  # mixed precision; the backend is left to the library default
    save_total_limit=1,
    save_steps=500,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    # No compute_metrics is supplied, so the only evaluation metric is the
    # loss. Perplexity is exp(loss), so the ranking is the same.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Builds next-token labels from input_ids; without labels the model
    # returns no loss and training cannot start.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()

# Perplexity of the fine-tuned model on the test split, from the mean loss.
# The evaluate "perplexity" metric scores texts under a separately loaded
# model and takes no references, so it is not used here.
eval_metrics = trainer.evaluate()
perplexity_score = math.exp(eval_metrics["eval_loss"])

accuracy = load("accuracy")
bleu = load("bleu")

# Prompt with the opening words of each test row and compare the generation
# with the full row. A short prefix keeps the prompt inside generate_text's
# length limit.
prompt_words = 32
reference_texts = tokenized_dataset["test"]["text"]
generated_texts = []

model.eval()  # disable dropout for generation
for reference_text in reference_texts:
    prompt = " ".join(reference_text.split()[:prompt_words])
    generated_texts.append(generate_text(prompt, model, tokenizer))

# The accuracy metric compares integer labels, so score exact string matches
# as 1/0 against an all-ones reference.
exact_matches = [
    int(generated.strip() == reference_text.strip())
    for generated, reference_text in zip(generated_texts, reference_texts)
]
accuracy_score = accuracy.compute(references=[1] * len(exact_matches), predictions=exact_matches)
bleu_score = bleu.compute(
    references=[[reference_text] for reference_text in reference_texts],
    predictions=generated_texts,
)

# In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, PeftModel
from transformers import TrainingArguments
from transformers import Trainer
from evaluate import load
def load_data(input: str) -> datasets.DatasetDict:
    """
    Load a dataset from the Hugging Face Hub or a local file in CSV or JSON format.

    The input is first handed to ``load_dataset`` unchanged. If that fails with
    ``ValueError`` or ``FileNotFoundError``, the input is treated as the path of
    a local file and loaded with the builder matching its extension.

    Args:
        input (str): The name of the dataset on the Hub or the path of the local file.

    Returns:
        datasets.DatasetDict: A dictionary of datasets with splits as keys.

    Raises:
        ValueError: If the direct load raised ``ValueError`` and the extension
            is neither ``csv`` nor ``json``.
        FileNotFoundError: If the direct load could not find the dataset and the
            extension is neither ``csv`` nor ``json`` (re-raised unchanged).
    """
    # Try to load the dataset as given
    try:
        return load_dataset(input)
    # An unknown name or path surfaces as FileNotFoundError, which is not a
    # ValueError; without catching it the local-file fallback is never reached.
    except (ValueError, FileNotFoundError) as err:
        # Check the file extension, ignoring case ("data.CSV" is still CSV)
        ext = input.split(".")[-1]
        builder = ext.lower()
        # CSV and JSON files are read by the builder named after the extension
        if builder in ("csv", "json"):
            return load_dataset(builder, data_files=input)
        # A ValueError from the direct load is reported as an unsupported format
        if isinstance(err, ValueError):
            raise ValueError(f"Unsupported file format: {ext}") from err
        # Otherwise keep the library's own "not found" error
        raise

# Define a function to process a dataset using a custom function
def process_data(dataset: datasets.DatasetDict, func: Callable, batched: bool = False) -> datasets.DatasetDict:
    """
    Process a dataset using a custom function.

    Args:
        dataset (datasets.DatasetDict): A dictionary of datasets with splits as keys.
        func (Callable): A function that takes a dictionary of features as input and
            returns a modified dictionary of features as output.
        batched (bool): Forwarded to ``DatasetDict.map``. Defaults to ``False``
            (one row per call), which is how this function behaved before the
            parameter existed.

    Returns:
        datasets.DatasetDict: A dictionary of processed datasets with splits as keys.
    """
    # Apply the function to each split of the dataset and return the result
    return dataset.map(func, batched=batched)


# Define a function to load a tokenizer from a model name or a local path
def load_tokenizer(input: str) -> AutoTokenizer:
    """
    Load a tokenizer from a model name or a local path.

    Args:
        input (str): The model name on the Hub or the local path of the model/tokenizer.

    Returns:
        AutoTokenizer: An instance of the tokenizer that corresponds to the model.

    Raises:
        ValueError: If ``AutoTokenizer.from_pretrained`` raises ``OSError`` for
            the given name or path. The original error is attached as the cause.
    """
    # Try to load the tokenizer using the AutoTokenizer class
    try:
        return AutoTokenizer.from_pretrained(input)
    # Report an unusable name or path as ValueError, keeping the cause attached
    except OSError as err:
        raise ValueError(f"Invalid model name or path: {input}") from err

# Define a function to generate text from a given input
def generate_text(input, model=None, tokenizer=None):
    """
    Generate text from a given input.

    Args:
        input (str): The prompt to continue.
        model: The model to generate with. When omitted, the module-level
            ``model`` is used so that one-argument calls keep working.
        tokenizer: The tokenizer used for encoding and decoding. When omitted,
            the module-level ``tokenizer`` is used.

    Returns:
        str: The decoded first generated sequence, with special tokens removed.
    """
    # Models carry no ``tokenizer`` attribute, so the tokenizer is a separate
    # object; fall back to the module-level names when none is passed in.
    if model is None:
        model = globals()["model"]
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    # Encode the input on the model's device and generate the output
    encoded = tokenizer(input, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded.get("attention_mask"),
    )
    # Decode the output and return it
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# # Example usage
# # Load the tokenizer from a model name on the Hub
# tokenizer = load_tokenizer("bert-base-cased")
# # Load the tokenizer from a local path
# tokenizer = load_tokenizer("./saved_model/")




# Define a function to load a model from a model name or a local path
def load_model(input: str) -> AutoModelForCausalLM:
    """
    Load a model from a model name or a local path.

    Args:
        input (str): The model name on the Hub or the local path of the model.

    Returns:
        AutoModelForCausalLM: An instance of the model that corresponds to the input.

    Raises:
        ValueError: If ``AutoModelForCausalLM.from_pretrained`` raises ``OSError``
            for the given name or path. The original error is attached as the cause.
    """
    # Try to load the model using the AutoModelForCausalLM class
    try:
        return AutoModelForCausalLM.from_pretrained(input)
    # Report an unusable name or path as ValueError, keeping the cause attached
    except OSError as err:
        raise ValueError(f"Invalid model name or path: {input}") from err
# Placeholders: point these at the checkpoint and the dataset to fine-tune on.
# The dataset is expected to provide "input" and "target" text columns and
# "train"/"test" splits, as used by the training and evaluation code below.
model_name_or_path = "databricks/dolly-v2-3b"
dataset_name_or_path = "path/to/dataset"

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["query_key_value"], # fused attention projection used by GPT-NeoX models such as Dolly v2
    task_type=TaskType.CAUSAL_LM # specify the task type as causal language modeling
)

# Build everything the trainer needs; none of it was defined at this point.
tokenizer = load_tokenizer(model_name_or_path)
if tokenizer.pad_token is None:
    # Batching needs a pad token; fall back to end-of-sequence.
    tokenizer.pad_token = tokenizer.eos_token
lora_model = get_peft_model(load_model(model_name_or_path), lora_config)
# generate_text reads the module-level name `model`; point it at the model being trained.
model = lora_model

dataset = load_data(dataset_name_or_path)
# Train on prompt and answer as one sequence, one row per call.
tokenized_dataset = process_data(
    dataset,
    lambda example: tokenizer(example["input"] + "\n" + example["target"], truncation=True, max_length=512),
)

training_args = TrainingArguments(
    output_dir="output", # specify the output directory
    num_train_epochs=1, # specify the number of training epochs
    per_device_train_batch_size=1, # specify the batch size per device
    gradient_accumulation_steps=16, # specify the number of steps to accumulate gradients
    learning_rate=1e-4, # specify the learning rate
    fp16=True, # use mixed precision training; the backend is left to the library default
    save_total_limit=1, # limit the number of saved checkpoints
    save_steps=500, # save a checkpoint every 500 steps
    logging_steps=100, # log the training metrics every 100 steps
    evaluation_strategy="steps", # evaluate the model every eval_steps
    eval_steps=500, # evaluate the model every 500 steps
    load_best_model_at_end=True, # load the best model at the end of training
    metric_for_best_model="eval_loss", # the only metric produced without compute_metrics; perplexity is exp(loss)
    greater_is_better=False # lower loss is better
)

trainer = Trainer(
    model=lora_model, # use the LoRA model
    args=training_args, # use the training arguments
    train_dataset=tokenized_dataset["train"], # use the tokenized train split
    eval_dataset=tokenized_dataset["test"], # use the tokenized test split
    # pad each batch and derive next-token labels; without labels there is no loss
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()

# Load the accuracy, perplexity and BLEU metrics from the evaluate module
accuracy = load("accuracy")
perplexity = load("perplexity")
bleu = load("bleu")

# Define a function to generate text from a given input
def generate_text(input, model=None, tokenizer=None):
    """
    Generate text from a given input.

    Args:
        input (str): The prompt to continue.
        model: The model to generate with. When omitted, the module-level
            ``model`` is used so that one-argument calls keep working.
        tokenizer: The tokenizer used for encoding and decoding. When omitted,
            the module-level ``tokenizer`` is used.

    Returns:
        str: The decoded first generated sequence, with special tokens removed.
    """
    # Models carry no ``tokenizer`` attribute, so the tokenizer is a separate
    # object; fall back to the module-level names when none is passed in.
    if model is None:
        model = globals()["model"]
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    # Encode the input on the model's device and generate the output
    encoded = tokenizer(input, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded.get("attention_mask"),
    )
    # Decode the output and return it
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Define a list to store the generated texts
generated_texts = []
reference_texts = dataset["test"]["target"]

# Loop through the test split of the dataset
for example in dataset["test"]:
    prompt = example["input"]
    # Generate the output text
    output = generate_text(prompt)
    # A causal model echoes the prompt before continuing; score only the continuation
    if output.startswith(prompt):
        output = output[len(prompt):]
    # Append the output text to the list
    generated_texts.append(output.strip())

# The accuracy metric compares integer labels, so score exact string matches
# as 1/0 against an all-ones reference.
exact_matches = [
    int(generated == reference_text.strip())
    for generated, reference_text in zip(generated_texts, reference_texts)
]
accuracy_score = accuracy.compute(references=[1] * len(exact_matches), predictions=exact_matches)
# Perplexity of the trained model on the evaluation split, from its mean loss.
# The evaluate "perplexity" metric takes no references, so it cannot be called
# with references and predictions.
perplexity_score = math.exp(trainer.evaluate()["eval_loss"])
bleu_score = bleu.compute(
    references=[[reference_text] for reference_text in reference_texts],
    predictions=generated_texts,
)



# In [None]:
# Fine-tune a causal language model with LoRA adapters and save the result.
from datasets import load_dataset
from peft import LoraConfig, TaskType, PeftModel, get_peft_model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import TrainingArguments

# Define the model name or path and the dataset name or path
model_name_or_path = "databricks/dolly-v2-3b"
dataset_name_or_path = "path/to/dataset"
# Placeholder: name of the dataset column that holds the training text
text_column = "text"

# Define the LoRA arguments.
# target_modules must name layers that exist in the base model: Dolly v2 is a
# GPT-NeoX model whose attention uses one fused "query_key_value" projection
# rather than separate q_proj/v_proj layers.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["query_key_value"], # apply LoRA to the fused attention projection
    task_type=TaskType.CAUSAL_LM # specify the task type as causal language modeling
)

# Load the tokenizer and the base model, then attach the LoRA adapters.
# get_peft_model selects the wrapper class that matches the task type.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    # Batching needs a pad token; fall back to end-of-sequence.
    tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
lora_model = get_peft_model(base_model, lora_config)

# Load the dataset using the datasets library and tokenize it. The trainer
# consumes token ids, not raw text.
dataset = load_dataset(dataset_name_or_path)
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch[text_column], truncation=True, max_length=512),
    batched=True,
)

# Define the training arguments using the TrainingArguments class from transformers
training_args = TrainingArguments(
    output_dir="output", # specify the output directory
    num_train_epochs=1, # specify the number of training epochs
    per_device_train_batch_size=1, # specify the batch size per device
    gradient_accumulation_steps=16, # specify the number of steps to accumulate gradients
    learning_rate=1e-4, # specify the learning rate
    fp16=True, # use mixed precision training; the backend is left to the library default
    save_total_limit=1, # limit the number of saved checkpoints
    save_steps=500, # save a checkpoint every 500 steps
    logging_steps=100, # log the training metrics every 100 steps
    evaluation_strategy="steps", # evaluate the model every eval_steps
    eval_steps=500, # evaluate the model every 500 steps
    load_best_model_at_end=True, # load the best model at the end of training
    metric_for_best_model="eval_loss", # the only metric produced without compute_metrics; perplexity is exp(loss)
    greater_is_better=False # lower loss is better
)

# Define the trainer using the Trainer class from transformers
trainer = Trainer(
    model=lora_model, # use the LoRA model
    args=training_args, # use the training arguments
    train_dataset=tokenized_dataset["train"], # use the tokenized train split
    eval_dataset=tokenized_dataset["test"], # use the tokenized test split
    # pad each batch and derive next-token labels; without labels there is no loss
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Train the LoRA model
trainer.train()

# Save the LoRA model
trainer.save_model("lora_model")
