In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import LoraConfig
import platform
import pandas as pd

EXT_DATA_ROOT = "../external/" 

# Define the LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./lora_output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
    use_mps_device=False if platform.system() == "Linux" else True
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Add LoRA to the model
model = model.apply_lora(lora_config)

ValueError: Unrecognized configuration class <class 'transformers.models.mistral.configuration_mistral.MistralConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.

In [None]:
# Prepare the training data
nbroad_dataset = pd.read_csv(f"{EXT_DATA_ROOT}/nbroad_mistral.csv")
original_sentences = nbroad_dataset['original_text'].tolist()  # List of original sentences
generated_sentences = nbroad_dataset['rewritten_text'].tolist()  # List of corresponding generated sentences
prompts = nbroad_dataset['rewrite_prompt'].tolist() # List of corresponding prompts

In [None]:

# Tokenize the training data
original_encodings = tokenizer(original_sentences, padding=True, truncation=True, return_tensors="pt")
generated_encodings = tokenizer(generated_sentences, padding=True, truncation=True, return_tensors="pt")
prompt_encodings = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")

# Define the data collator
class DataCollatorForPromptRecovery:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, examples):
        original_batch = self.tokenizer.pad([{"input_ids": e["original_input_ids"], "attention_mask": e["original_attention_mask"]} for e in examples], return_tensors="pt")
        generated_batch = self.tokenizer.pad([{"input_ids": e["generated_input_ids"], "attention_mask": e["generated_attention_mask"]} for e in examples], return_tensors="pt")
        prompt_batch = self.tokenizer.pad([{"input_ids": e["prompt_input_ids"], "attention_mask": e["prompt_attention_mask"]} for e in examples], return_tensors="pt")

        batch = {
            "input_ids": original_batch["input_ids"],
            "attention_mask": original_batch["attention_mask"],
            "labels": prompt_batch["input_ids"],
            "decoder_input_ids": generated_batch["input_ids"],
            "decoder_attention_mask": generated_batch["attention_mask"],
        }
        return batch

data_collator = DataCollatorForPromptRecovery(tokenizer)

In [None]:
# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset={"original": original_encodings, "generated": generated_encodings, "prompt": prompt_encodings},
    data_collator=data_collator,
)

# Fine-tune the model with LoRA
trainer.train()

# Save the fine-tuned model
model.save_pretrained("lora_prompt_recovery")