In [1]:
from datasets import load_dataset,load_from_disk

# Load dataset from the hub
train_dataset = load_from_disk("../data/orgl")
val_dataset = load_from_disk("../data/all_val")

print(f"Train dataset size: {len(train_dataset['train'])}")
print(f"Validation dataset size: {len(val_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


Train dataset size: 24845
Validation dataset size: 8


In [2]:
val_dataset

DatasetDict({
    copa-ck: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 100
    })
    copa-en: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx'],
        num_rows: 100
    })
    copa-hr: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
        num_rows: 100
    })
    copa-mk: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
        num_rows: 100
    })
    copa-sl: Dataset({
        features: ['choice1', 'choice2', 'idx', 'label', 'premise', 'question'],
        num_rows: 100
    })
    copa-sl-cer: Dataset({
        features: ['choice1', 'choice2', 'idx', 'label', 'premise', 'question'],
        num_rows: 100
    })
    copa-sr: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'label', 'idx', 'changed'],
        num_rows: 100
    })
    copa-sr-tor: D

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="../models/aya-101"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
all_lang_data={
    'english':'copa-en',
    'croatian':'copa-hr',
    'Slovenian':'copa-sl',
    'Cerkno-dialect-of-Slovenian':'copa-sl-cer',
    'Serbian':'copa-sr',
    'Torlak-dialect':'copa-sr-tor',
    'Macedonian':'copa-mk' ,
    'Chakavian':'copa-ck' 
}
prompt_template="""Instruction: Given the premise, ""{premise}"", What is the most plausible {question}?
    A: {choice1}
    B: {choice2}
    Plausible {question}:"""

choices=["A","B"]

def construct_prompt(row):
    prompt=(prompt_template.format(**row, correct_answer="")).strip()
    return {'inputs':prompt,'labels':choices[row['label']]}
    
train_dataset_p=train_dataset.map(construct_prompt)
val_dataset_p=val_dataset.map(construct_prompt)

100%|██████████| 24845/24845 [00:02<00:00, 12388.82ex/s]
100%|██████████| 100/100 [00:00<00:00, 13823.88ex/s]
100%|██████████| 100/100 [00:00<00:00, 14223.77ex/s]
100%|██████████| 100/100 [00:00<00:00, 12930.62ex/s]
100%|██████████| 100/100 [00:00<00:00, 12807.04ex/s]
100%|██████████| 100/100 [00:00<00:00, 13454.06ex/s]
100%|██████████| 100/100 [00:00<00:00, 13613.45ex/s]
100%|██████████| 100/100 [00:00<00:00, 12855.32ex/s]
100%|██████████| 100/100 [00:00<00:00, 12796.88ex/s]


In [5]:
from datasets import concatenate_datasets
import numpy as np
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
def tokenize_input(dataset):
    tokenized_inputs = dataset.map(lambda x: tokenizer(x["inputs"], truncation=True), batched=True, remove_columns=["inputs", "labels"])
    input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
    # take 85 percentile of max length for better utilization
    max_source_length = int(np.percentile(input_lenghts, 85))
    print(f"Max source length: {max_source_length}")

    # The maximum total sequence length for target text after tokenization.
    # Sequences longer than this will be truncated, sequences shorter will be padded."
    tokenized_targets = dataset.map(lambda x: tokenizer(x["labels"], truncation=True), batched=True, remove_columns=["inputs", "labels"])
    target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
    # take 90 percentile of max length for better utilization
    max_target_length = int(np.percentile(target_lenghts, 90))
    print(f"Max target length: {max_target_length}")
    return max_source_length,max_target_length
    
max_source_length,max_target_length=tokenize_input(train_dataset_p['train'])

  0%|          | 0/25 [00:00<?, ?ba/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 25/25 [00:02<00:00,  9.61ba/s]


Max source length: 68


100%|██████████| 25/25 [00:00<00:00, 65.27ba/s]


Max target length: 2


In [6]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [item for item in sample["inputs"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["labels"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = train_dataset_p.map(preprocess_function, batched=True, remove_columns=["premise", "choice1","choice2","question","idx","label","changed","inputs"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

# save datasets to disk for later easy loading
# tokenized_dataset.save_to_disk("data/train")
# tokenized_dataset["test"].save_to_disk("data/eval")

100%|██████████| 25/25 [00:02<00:00,  8.46ba/s]


AttributeError: 'DatasetDict' object has no attribute 'features'

In [52]:
from datasets import load_from_disk

tokenized_dataset=load_from_disk('data/train')

In [53]:
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 24845
})

In [5]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id = "../models/aya-101"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RuntimeError: Failed to import transformers.models.t5.modeling_t5 because of the following error (look up to see its traceback):
[Errno 13] Permission denied: '/root/orc-open-ondemand/www-ood/ood/apps/sys/dashboard'

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="../output_models/lora"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    max_steps=10
)

In [None]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
# Save our LoRA model & tokenizer results
peft_model_id="temp"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc.
peft_model_id = "temp"
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,   device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

Loading checkpoint shards: 100%|██████████| 11/11 [02:16<00:00, 12.44s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


False
'CUDASetup' object has no attribute 'cuda_available'


  warn("The installed version of bitsandbytes was compiled without GPU support. "
