In [1]:
# install Hugging Face Libraries
!pip install "peft==0.2.0"
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score tensorboard py7zr




In [2]:
from datasets import load_dataset

# Load dataset from the hub
dataset = load_dataset("egalize/legal_summarization")

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

# Train dataset size: 14732
# Test dataset size: 819


Using custom data configuration egalize--legal_summarization-454cda284ab7f119
Found cached dataset csv (C:/Users/Khanh/.cache/huggingface/datasets/egalize___csv/egalize--legal_summarization-454cda284ab7f119/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

Train dataset size: 356
Test dataset size: 90


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="nsi319/legal-pegasus"

# Load tokenizer of Legal-pegasus
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [4]:
from datasets import concatenate_datasets
import numpy as np
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["original_text"], truncation=True), batched=True, remove_columns=["original_text", "reference_summary"])
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
# take 85 percentile of max length for better utilization
max_source_length = int(np.percentile(input_lengths, 85))
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["reference_summary"], truncation=True), batched=True, remove_columns=["original_text", "reference_summary"])
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
# take 90 percentile of max length for better utilization
max_target_length = int(np.percentile(target_lengths, 90))
print(f"Max target length: {max_target_length}")


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\Khanh\.cache\huggingface\datasets\egalize___csv\egalize--legal_summarization-454cda284ab7f119\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-5491b9397dda3d8b.arrow


Max source length: 152
Max target length: 35


In [6]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["original_text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["reference_summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["original_text", "reference_summary"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train")
tokenized_dataset["test"].save_to_disk("data/eval")


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/356 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/90 [00:00<?, ? examples/s]

In [18]:
pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\khanh\appdata\local\temp\pip-req-build-nc0mtij2
  Resolved https://github.com/huggingface/transformers.git to commit 63864e057fd4ecbf54c77599702873f7be871e65
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting tokenizers<0.15,>=0.14
  Downloading tokenizers-0.14.0-cp39-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 5.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.16.4
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     -------------------------------------- 268.8/268.8 kB 8.1 MB/s eta 0:00:00
Building wheels for

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\Khanh\AppData\Local\Temp\pip-req-build-nc0mtij2'
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Khanh\\anaconda3\\Lib\\site-packages\\~okenizers\\tokenizers.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [22]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map = {"shared": 0, "encoder": 0, "decoder": 1, "lm_head": 1})



ValueError: final_logits_bias doesn't have any device set.

In [16]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType



model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


ValueError: Target modules ['q', 'v'] not found in the base model. Please check the target modules and try again.