In [1]:
%pip install -U --quiet transformers huggingface_hub datasets bitsandbytes accelerate wandb

Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

In [3]:
from huggingface_hub import login

login(
  token="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"
)

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch


model_name = "meta-llama/Llama-3.2-3B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    #device_map="auto",
    #device_map={'':torch.cuda.current_device()},
    #device_map={'': 'cuda:0'},
    attn_implementation="eager",
    quantization_config=bnb_config
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token if tokenizer.unk_token else tokenizer.eos_token
tokenizer.padding_side = 'right' # to prevent warnings

In [16]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [6]:
from datasets import load_dataset

dataset = load_dataset("Geraldine/Ead-Instruct-33k", split="train")

README.md:   0%|          | 0.00/383 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33300 [00:00<?, ? examples/s]

In [7]:
system_message = """You are an archivist expert in EAD/XML format for archival records metadata."""

def create_conversation(row):
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": row["prompt"]},
      {"role": "assistant", "content": row["completion"]}
    ]
  }

dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)

Map:   0%|          | 0/33300 [00:00<?, ? examples/s]

In [8]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 33300
})

In [17]:
dataset[0]

{'messages': [{'content': 'You are an archivist expert in EAD/XML format for archival records metadata.',
   'role': 'system'},
  {'content': 'Given this EAD/XML snippet representing a parent element, generate a valid child element that fits within the EAD structure. Snippet: <ead id="N65537"></ead>',
   'role': 'user'},
  {'content': '<ead id="N65537"><eadheader countryencoding="iso3166-1" dateencoding="iso8601" langencoding="iso639-2b" relatedencoding="dc" repositoryencoding="iso15511" scriptencoding="iso15924"><eadid countrycode="fr" encodinganalog="identifier" identifier="FRCGMNOV-330636101-jEG" mainagencycode="751131015">FRCGMNOV-330636101-jEG.xml</eadid></eadheader></ead>',
   'role': 'assistant'}]}

In [18]:
import wandb

wandb.login(key="744a17e2b8e4079049d05739e13e7756d1f56e1a")
run = wandb.init(
    project='Fine-tune Llama 3.2 on EAD 33k dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33mgrldn-geoffroy[0m ([33mgrldn-geoffroy-smartbibl-ia[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


In [19]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [20]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="outputs", # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    #tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="wandb",                # report metrics to tensorboard
)

In [21]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    },
    args=args,
)

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (268674 > 131072). Running this sequence through the model will result in indexing errors
  super().__init__(


In [22]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.8713
20,0.654
30,0.527
40,0.5225
50,0.5001
60,0.4389
70,0.5317
80,0.4754
90,0.478
100,0.4329




TrainOutput(global_step=1437, training_loss=0.3322137685475784, metrics={'train_runtime': 6375.116, 'train_samples_per_second': 1.353, 'train_steps_per_second': 0.225, 'total_flos': 3.395386112905052e+17, 'train_loss': 0.3322137685475784, 'epoch': 2.994786235662148})

In [23]:
wandb.finish()

0,1
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇██
train/grad_norm,▄▁▂▂▄▁▄▂▂▄▁▃▁▂▄▂▂▆▅▃▄▄▆▂▃▅▃▄▃▂▂▇█▆▅▅▄▇▅▃
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▄▄▄▃▄▃▃▃▃▃▃▂▂▂▃▂▃▃▂▂▂▂▂▃▃▂▂▁▁▂▁▁▂▁▁▂▁▁▁

0,1
total_flos,3.395386112905052e+17
train/epoch,2.99479
train/global_step,1437.0
train/grad_norm,0.10498
train/learning_rate,0.0002
train/loss,0.2146
train_loss,0.33221
train_runtime,6375.116
train_samples_per_second,1.353
train_steps_per_second,0.225


In [24]:
trainer.save_model()

In [25]:
trainer.model.push_to_hub("outputs", use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/778M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Geraldine/outputs/commit/aecf4e83f4c8aa80dfff39c029352227eeb4e85d', commit_message='Upload model', commit_description='', oid='aecf4e83f4c8aa80dfff39c029352227eeb4e85d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Geraldine/outputs', endpoint='https://huggingface.co', repo_type='model', repo_id='Geraldine/outputs'), pr_revision=None, pr_num=None)

In [26]:
peft_model_id = "outputs"
tr_model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [27]:
from peft import AutoPeftModelForCausalLM, PeftModel
model = AutoModelForCausalLM.from_pretrained(tr_model_id, trust_remote_code=True, torch_dtype=torch.float16,
     low_cpu_mem_usage=True,)
model = PeftModel.from_pretrained(model, peft_model_id)
merged_model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
merged_model.push_to_hub(f"Geraldine/FineLlama-3.2-3B-Instruct-ead")

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead/commit/4086d9bdb4a039fcba088f6b46204554208c846c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='4086d9bdb4a039fcba088f6b46204554208c846c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead', endpoint='https://huggingface.co', repo_type='model', repo_id='Geraldine/FineLlama-3.2-3B-Instruct-ead'), pr_revision=None, pr_num=None)

In [29]:
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
tokenizer.push_to_hub(f"Geraldine/FineLlama-3.2-3B-Instruct-ead")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead/commit/432e22c0494879cca7312851e9700c87cc191da4', commit_message='Upload tokenizer', commit_description='', oid='432e22c0494879cca7312851e9700c87cc191da4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead', endpoint='https://huggingface.co', repo_type='model', repo_id='Geraldine/FineLlama-3.2-3B-Instruct-ead'), pr_revision=None, pr_num=None)

In [30]:
import torch
from transformers import pipeline

model_id = "Geraldine/FineLlama-3.2-3B-Instruct-ead"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are an archivist expert in EAD/XML format for archival records metadata. Structure every response as follows: Step 1 - Identify elements to generate: [List the XML elements needed] ; Step 2 - Generate each element: [Generate Element 1] [Generate Element 2] ... ; Step 3 - Merged result: [Complete XML with all elements properly nested]. Rules: - NEVER use ellipsis or abbreviations (...) ; - Generate complete content for each element ; - Ensure proper nesting and closing of all tags ; - Include all required attributes ; - Verify XML validity before providing final result"},
    {"role": "user", "content": "I need a complete <eadheader> section"},
]
outputs = pipe(
    messages,
    max_new_tokens=2048,
)
print(outputs[0]["generated_text"][-1])

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

Device set to use cuda:0


{'role': 'assistant', 'content': '<eadheader countryencoding="iso3166-1" dateencoding="iso8601" findaidstatus="provisoire" langencoding="iso639-2b" relatedencoding="dc" repositoryencoding="iso15511" scriptencoding="iso15924"><filedesc><titlestmt><titleproper encodinganalog="title">Bibliothèque scientifique du Muséum d\'histoire naturelle de Nantes. Collection Voltaire (1694-1778)</titleproper><author encodinganalog="creator">Bibliothèque scientifique du Muséum d\'histoire naturelle de Nantes</author></titlestmt><publicationstmt><publisher encodinganalog="publisher">Médiathèque Grand Nantes</publisher></publicationstmt></filedesc><profiledesc><langusage>Catalogue rédigé en</langusage></profiledesc></eadheader>'}


In [36]:
messages = [
    {"role": "system", "content": "You are an archivist expert in EAD/XML format for archival records metadata."},
    {"role": "user", "content": "Create a complete EAD/XML template for the Veretti fund that has been created in 2023 and is composed of two main series : one serie of pieces of personal correspondance received by Martin Veretti and the other serie of personal notes."},
]
outputs = pipe(
    messages,
    max_new_tokens=2048,
)
print(outputs[0]["generated_text"][-1])

{'role': 'assistant', 'content': '<archdesc level="fonds"><did><repository><corpname authfilenumber="130016101" normal="Bibliothèque municipale de Vannes" source="Répertoire_des_Centres_de_Ressources">Bibliothèque municipale de Vannes</corpname><address><addressline>Place de Bretagne</addressline><addressline>56000 Vannes</addressline><addressline>02.97.01.62.69</addressline></address></repository><unitid type="cotes_extrêmes">1</unitid><unittitle>Fonds Martin Veretti</unittitle><unitdate calendar="gregorian" era="ce" normal="2023/2023">2023</unitdate></did><scopecontent><p>Le fonds Martin Veretti est composé de deux série de documents : une série de lettres et cartes postales reçues par Martin Veretti et une série de notes et d\'extraits de textes.</p></scopecontent><bioghist><p> Martin Veretti est un historien et archiviste paléographe français. Né à Paris en 1957, il a commencé sa carrière professionnelle dans les collections patrimoniales de la bibliothèque nationale de France. En 

In [37]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="Geraldine/FineLlama-3.2-3B-Instruct-ead", local_dir="FineLlama-3.2-3B-Instruct-ead")

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

'/home/user/FineLlama-3.2-3B-Instruct-ead'

In [38]:
from huggingface_hub import HfApi
api = HfApi(token="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")

model_id = "Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF"
api.create_repo(model_id, exist_ok=True, repo_type="model")

RepoUrl('https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF')

In [45]:
api.upload_file(
    path_or_fileobj="FineLlama-3.2-3B-Instruct-ead-Q8_0.gguf",
    path_in_repo="FineLlama-3.2-3B-Instruct-ead-Q8_0.gguf",
    repo_id=model_id,
)

FineLlama-3.2-3B-Instruct-ead-Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF/commit/ef1ec2c1d179ce9018aecc384857b3d100165bb8', commit_message='Upload FineLlama-3.2-3B-Instruct-ead-Q8_0.gguf with huggingface_hub', commit_description='', oid='ef1ec2c1d179ce9018aecc384857b3d100165bb8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='Geraldine/FineLlama-3.2-3B-Instruct-ead-GGUF'), pr_revision=None, pr_num=None)

In [47]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    "Geraldine/FineLlama-3.2-3B-Instruct-ead",
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    #device_map="auto",
    #device_map={'':torch.cuda.current_device()},
    #device_map={'': 'cuda:0'},
    attn_implementation="eager",
    quantization_config=bnb_config
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("Geraldine/FineLlama-3.2-3B-Instruct-ead")

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

system_prompt = """
You are an AI model expert in EAD/XML format for archival records metadata and specialized in generating EAD/XML content. 
You have been fine-tuned on instructions (prompt-completion pairs) to generate accurate and well-structured EAD/XML content based on user-provided data.

### Instructions for Behavior:
1. **Purpose:** Generate EAD/XML snippets strictly based on the information provided by the user. Do not assume or invent additional information beyond the given input.
2. **Format:** Ensure all responses are structured in valid and well-formed EAD/XML format.
3. **Boundaries:** If the user’s query requires information not explicitly provided in their input, respond with a clarification request, such as: *"I can only generate based on the provided data. Could you provide more details?"*
4. **Error Handling:** If the provided input cannot be transformed into valid EAD/XML, explain why and provide guidance for correcting the input.
5. **Conciseness:** Avoid adding any explanations, comments, or content outside of the EAD/XML snippet unless explicitly asked by the user.
6. **Adherence to Standards:** Ensure the EAD/XML you generate adheres to the standard practices and structure of EAD (Encoded Archival Description).

### Example Behavior:
- **User Input:** "Title: Inventory of the Smith Family Papers, Date: 2023, Language: English."
- **Response:** 
```xml
<ead>
  <archdesc>
    <did>
      <unittitle>Inventory of the Smith Family Papers</unittitle>
      <unitdate>2023</unitdate>
      <langmaterial>
        <language langcode="eng">English</language>
      </langmaterial>
    </did>
  </archdesc>
</ead>
```

If you understand these instructions, you may begin responding to user queries.

"""
messages = [
  {"role": "system", "content": system_prompt},
  {"role": "user", "content": "Generate a valid EAD template that describes the archival Fund Martin Veretti (which was the president of the EPFL from 2015 to 2022) : the Fonds Veretti has been created in 2023 by the EPFL library and it is composed of two main series : one serie of pieces of personal correspondance received by Martin Veretti and the other serie of personal notes."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    return_dict=True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 4096,pad_token_id = tokenizer.eos_token_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [48]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

system

Cutting Knowledge Date: December 2023
Today Date: 24 Dec 2024

You are an AI model expert in EAD/XML format for archival records metadata and specialized in generating EAD/XML content. 
You have been fine-tuned on instructions (prompt-completion pairs) to generate accurate and well-structured EAD/XML content based on user-provided data.

### Instructions for Behavior:
1. **Purpose:** Generate EAD/XML snippets strictly based on the information provided by the user. Do not assume or invent additional information beyond the given input.
2. **Format:** Ensure all responses are structured in valid and well-formed EAD/XML format.
3. **Boundaries:** If the user’s query requires information not explicitly provided in their input, respond with a clarification request, such as: *"I can only generate based on the provided data. Could you provide more details?"*
4. **Error Handling:** If the provided input cannot be transformed into valid EAD/XML, explain why and provide guidance for corre