In [1]:
import torch
import transformers
import trl
import json
import os
from tqdm import tqdm

from typing import Optional
import re
# os.environ["WANDB_DISABLED"] = "true"

print(f"üì¶ PyTorch version: {torch.__version__}")
print(f"ü§ó Transformers version: {transformers.__version__}")
print(f"üìä TRL version: {trl.__version__}")
print(f"Is cuda available: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


üì¶ PyTorch version: 2.7.1+cu128
ü§ó Transformers version: 4.54.0
üìä TRL version: 0.20.0
Is cuda available: True


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# https://huggingface.co/LiquidAI/LFM2-1.2B
# model_id = "LiquidAI/LFM2-1.2B"


# https://huggingface.co/LiquidAI/LFM2-1.2B-Extract
model_id = "LiquidAI/LFM2-1.2B-Extract"


print("üìö Loading tokenizer...")
# Load the tokenizer for `model_id`. It is a notebook-wide global: the generation,
# collation and training cells below all read `tokenizer` directly.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# IMPORTANT: Set left padding for decoder-only models
tokenizer.padding_side = 'left'
# Fall back to the EOS token when the tokenizer has no pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("üß† Loading model...")
# Load the causal-LM weights for `model_id`, requesting bfloat16 and automatic device
# placement. `model` is a notebook-wide global used by the evaluation and LoRA cells.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16",
    # attn_implementation="flash_attention_2" # <- uncomment on compatible GPU
)

print("‚úÖ Local model loaded successfully!")
print(f"üî¢ Parameters: {model.num_parameters():,}")
print(f"üìñ Vocab size: {len(tokenizer)}")
print(f"üíæ Model size: ~{model.num_parameters() * 2 / 1e9:.1f} GB (bfloat16)")
print(f"Model device: {model.device}")
print(f"model.training = {model.training}")
print(f"Padding side: {tokenizer.padding_side}")

üìö Loading tokenizer...
üß† Loading model...
‚úÖ Local model loaded successfully!
üî¢ Parameters: 1,170,340,608
üìñ Vocab size: 64400
üíæ Model size: ~2.3 GB (bfloat16)
Model device: cuda:0
model.training = False
Padding side: left


In [4]:
import wandb

# Run identifier; '/' in the model id is replaced so the name can also be used as a
# directory name under ./models/ (see the cells that write into ./models/{RUN_NAME}).
RUN_NAME = f"{model_id.replace('/', '-')}-run-przm043"

# Start the Weights & Biases run. The `config` values are read back through
# `wandb.config[...]` when the SFTConfig is built later in the notebook.
# NOTE(review): both `warmup_steps` and `warmup_ratio` are set and both are forwarded
# to SFTConfig; presumably only one of them takes effect -- confirm which is intended.
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="kainoj2-none",
    # Set the wandb project where this run will be logged.
    project="liquid-ai",
    name=RUN_NAME,
    config={
        "learning_rate": 5e-5,
        "lr_scheduler_type": "linear",
        "warmup_steps": 100,
        "warmup_ratio": 0.2,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33mkainoj2[0m ([33mkainoj2-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Dataset

In [5]:
# System prompt placed in front of every example by `promptify`, and written to
# ./models/{RUN_NAME}/system_prompt.txt further down in this cell.
# NOTE(review): "plance" in the schema below is presumably a typo for "place". It is
# left untouched because this text is sent to the model verbatim; change it only
# together with a re-run of training/evaluation.
SYSTEM_PROMPT = """Identify and extract information matching the following schema.
Return data as a JSON object. 
For each field, select most suitable value from text
If provided text does not contain sufficient information to fill out the field, make the field empty string.
Output only JSON, and output only four fields. 

{
    "full_name": "name of the person",
    "company_name": "name of the company",
    "address": "address of the plance",
    "phone_number": "phone number"
}
"""

def promptify(example):
    """Turn one dataset row into chat-style `prompt` / `completion` columns.

    Reads `example['text']` (used as the user message) and `example['json']`
    (used as the assistant message) and returns a dict with:
      - "prompt": the system message (SYSTEM_PROMPT) followed by the user message
      - "completion": a single assistant message holding the JSON label
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": example['text']}
    assistant_turn = {"role": "assistant", "content": example['json']}

    return {
        "prompt": [system_turn, user_turn],
        "completion": [assistant_turn],
    }

# Persist the exact system prompt used for this run next to the run's model artifacts.
os.makedirs(f"./models/{RUN_NAME}", exist_ok=True)
with open(f"./models/{RUN_NAME}/system_prompt.txt", "w+") as f:
    f.write(SYSTEM_PROMPT)

### Their dataset 
https://github.com/stockmarkteam/ner-wikipedia-dataset/

In [7]:
from their_dataset import load_their_dataset, add_json_label

# Full-size variant (currently disabled): hold out 1000 examples for evaluation.
# ds = load_their_dataset().map(add_json_label).map(promptify)
# ds_splits = ds["train"].train_test_split(test_size=1000, seed=42) 

# For testing
# Only the first 1000 rows are used, split 80/20 with a fixed seed
# (800 train / 200 eval, as shown in the cell output).
ds = load_their_dataset().map(add_json_label).map(promptify)
ds_splits = ds["train"].select(range(1000)).train_test_split(test_size=0.2, seed=42) 

ds_train = ds_splits['train']
ds_eval = ds_splits['test']

# Display both splits as the cell output.
ds_train, ds_eval

(Dataset({
     features: ['entities', 'text', 'curid', 'address', 'full_name', 'company_name', 'json', 'prompt', 'completion'],
     num_rows: 800
 }),
 Dataset({
     features: ['entities', 'text', 'curid', 'address', 'full_name', 'company_name', 'json', 'prompt', 'completion'],
     num_rows: 200
 }))

In [8]:
from our_dataset import load_our_dataset

# Load our own dataset and give it the same `json` / `prompt` / `completion` columns
# as the other dataset (same `add_json_label` and `promptify` mapping).
our_ds = load_our_dataset().map(add_json_label).map(promptify)
# our_ds_splits = our_ds["train"].train_test_split(test_size=32, seed=42)


#our_ds_train = our_ds_splits['train']
# No split is made: the whole 'train' split is used for evaluation only.
our_ds_eval = our_ds['train']

our_ds_eval

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 10100.29 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:00<00:00, 9068.46 examples/s]


Dataset({
    features: ['file_name', 'full_name', 'company_name', 'address', 'phone_number', 'template_id', 'text', 'json', 'prompt', 'completion'],
    num_rows: 64
})

# Helpers

In [9]:
def get_json(llm_output: str) -> Optional[dict]:
    """Parse raw model output as a JSON object.

    Returns the decoded dict, or None when the input is None, is not valid
    JSON, or decodes to anything other than a dict (e.g. a list or a number).
    """
    if llm_output is None:
        return None

    try:
        parsed = json.loads(llm_output.strip())
    except (json.JSONDecodeError, ValueError, TypeError):
        return None

    if not isinstance(parsed, dict):
        return None
    return parsed


def has_same_fields(generated: dict, golden: dict) -> int:
    """Compare only the key sets of two dicts.

    Returns 1 when `generated` and `golden` have exactly the same keys
    (values are ignored), and 0 otherwise or when either argument is None.
    """
    if generated is not None and golden is not None:
        generated_keys = set(generated.keys())
        golden_keys = set(golden.keys())
        return 1 if generated_keys == golden_keys else 0

    return 0


def get_gold(ex):
    """Build the gold label dict for one sample.

    Picks the four target fields from `ex` via `.get`, substituting an empty
    string for any field the sample does not have.
    """
    gold_fields = ("full_name", "company_name", "address", "phone_number")
    return {field: ex.get(field, '') for field in gold_fields}


def extract_assistant_response(decoded_text: str) -> Optional[str]:
    """
    Extract the assistant's reply from text rendered in the chat template format.

    Expected format:
    <|im_start|>assistant
    {"field1": "value1", "field2": "value2"}<|im_end|>

    Args:
        decoded_text: Text decoded with special tokens kept, so that the
            `<|im_start|>` / `<|im_end|>` markers are present.

    Returns:
        The whitespace-stripped text between the first `<|im_start|>assistant`
        marker and the next `<|im_end|>`, or None when no such pair exists
        (for example when the reply was cut off before `<|im_end|>`).
        The content is returned as a raw string; it is not parsed as JSON here.
    """
    # Non-greedy match with DOTALL so multi-line replies are captured up to the
    # first closing tag.
    pattern = r'<\|im_start\|>assistant\s*(.*?)<\|im_end\|>'
    match = re.search(pattern, decoded_text, re.DOTALL)

    if match is None:
        return None

    return match.group(1).strip()


# Process one

In [None]:
def process_one(sample):
    """Generate the model's reply for a single dataset sample.

    Renders `sample['prompt']` with the chat template (generation prompt
    appended), samples a completion from the global `model`, and returns only
    the newly generated part decoded to text with special tokens removed.
    """
    # sanity check
    assert isinstance(sample, dict)

    prompt_ids = tokenizer.apply_chat_template(
        sample['prompt'],
        tokenize=True,  # return token ids rather than rendered text
        add_generation_prompt=True,  # set to False for training
        return_tensors="pt",
    )
    prompt_length = prompt_ids.shape[-1]

    generated = model.generate(
        prompt_ids.to(model.device),
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        min_p=0.15,
        repetition_penalty=1.05,
    )

    # Single-sequence batch: take row 0 and drop the echoed prompt tokens.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


# Sample-by-sample evaluation over the whole (shuffled) 'train' split: count how
# often the model's output is a JSON object with exactly the gold keys.
total = 0
correct = 0

for sample in ds['train'].shuffle(seed=43):
    total += 1
    decoded = process_one(sample)
    ans = has_same_fields(get_json(decoded), get_gold(sample))
    if ans == 0:
        # Show the gold label next to the raw output for every failure.
        print(get_gold(sample), decoded)
        print()
    correct += ans
    # Report running accuracy every 10 samples. The previous `if total % 10:`
    # was truthy for every count that is NOT a multiple of 10.
    if total % 10 == 0:
        print(correct / total)

# Eval batched

In [10]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Collate function for DataLoader that:
    - Takes a list of dataset samples
    - Applies chat template to each
    - Pads to same length
    - Returns batch dict

    Returns a dict with:
    - 'input_ids': padded token ids for the whole batch
    - 'attention_mask': mask matching 'input_ids' that marks the padding
    - 'samples': the original samples, kept so gold labels can be read later
    """
    prompts = [sample['prompt'] for sample in batch]

    # Apply chat template with padding to entire batch. `return_dict=True` makes
    # the tokenizer return the attention mask together with the ids; without an
    # explicit mask, generation has to guess the padding from the pad token id,
    # which is not possible when the pad token was set to the EOS token.
    encoded = tokenizer.apply_chat_template(
        prompts,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=True,  # Pad to longest sequence in batch
        tokenize=True,
        return_dict=True,
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'samples': batch  # Keep original samples for gold labels
    }

@torch.no_grad()
def eval_batched(model_eval, evaluation_dataset, batch_size = 128):
    """Measure how often the model outputs JSON with exactly the gold keys.

    Batches `evaluation_dataset` through `collate_fn`, samples a completion for
    every prompt, extracts the assistant reply, parses it as JSON and compares
    its key set with the gold label's key set (values are not compared).

    Args:
        model_eval: Model to evaluate; it is switched to eval mode here.
        evaluation_dataset: Dataset whose samples carry a 'prompt' plus the
            fields read by `get_gold`.
        batch_size: Number of samples generated per batch.

    Returns:
        (acc, failed_stuff): the fraction of samples whose parsed output has the
        same keys as the gold label (0.0 for an empty dataset), and a list of
        {"gold", "pred", "pred_raw"} dicts for the samples that did not.
    """
    # Create DataLoader with custom collate function
    dataloader = DataLoader(
        evaluation_dataset,
        batch_size=batch_size,
        shuffle=False,  # Keep order for evaluation
        collate_fn=collate_fn,
        pin_memory=True,  # Faster GPU transfer
    )

    # Set model to eval mode
    model_eval.eval()

    correct_answs = 0
    total = 0

    failed_stuff = []

    for batch_dict in tqdm(dataloader, desc="Evaluating batches"):
        # Move inputs to the device of the model being evaluated (previously
        # this used the global `model`, ignoring the `model_eval` argument).
        device = model_eval.device
        input_ids = batch_dict['input_ids'].to(device)

        # Forward the attention mask when the collate function provides one;
        # None keeps the library's default handling.
        attention_mask = batch_dict.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # Generate for entire batch at once
        outputs = model_eval.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.3,
            min_p=0.15,
            repetition_penalty=1.05,
            max_new_tokens=512,
        )

        # Decode each sequence with special tokens kept, because the extraction
        # regex relies on the <|im_start|> / <|im_end|> markers.
        predictions = [
            extract_assistant_response(
                tokenizer.decode(output_seq, skip_special_tokens=False)
            )
            for output_seq in outputs
        ]

        # Score each prediction against gold label
        for pred_raw, sample in zip(predictions, batch_dict['samples']):
            gold = get_gold(sample)
            pred = get_json(pred_raw)
            has_same = has_same_fields(pred, gold)
            if has_same == 0:
                failed_stuff.append({
                    "gold": gold,
                    "pred": pred,
                    "pred_raw": pred_raw
                })

            correct_answs += has_same
            total += 1

        # Print progress whenever the running total is a multiple of batch_size
        if total % batch_size == 0:
            print(f"Progress: {total} samples, Accuracy so far: {correct_answs / total:.4f}")

    # Guard against an empty dataset instead of raising ZeroDivisionError.
    acc = correct_answs / total if total else 0.0
    print(f"\n‚úÖ Final Accuracy: {acc:.4f}")
    print(f"üìä Correct: {correct_answs}/{total}")

    return acc, failed_stuff

In [11]:
# Baseline: evaluate the model on the held-out split of their dataset before any
# fine-tuning, and log the accuracy to the wandb run.
acc_raw_model_their, failed_stuff_theirs = eval_batched(model, evaluation_dataset=ds_eval, batch_size=128)
wandb.log({"accuracy_raw_model/theirs_simple_dataset": acc_raw_model_their})

Evaluating batches:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [00:11<00:11, 11.91s/it]

Progress: 128 samples, Accuracy so far: 0.8906


Evaluating batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:21<00:00, 10.73s/it]


‚úÖ Final Accuracy: 0.9100
üìä Correct: 182/200





In [12]:
# Baseline: same evaluation on our own dataset before any fine-tuning.
acc_raw_model_ours, failed_stuff_ours = eval_batched(model, evaluation_dataset=our_ds_eval, batch_size=128)
wandb.log({"accuracy_raw_model/our_complex_dataset": acc_raw_model_ours})

Evaluating batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:10<00:00, 10.69s/it]


‚úÖ Final Accuracy: 0.5781
üìä Correct: 37/64





In [20]:
# Inspect one failed prediction. `sample_idx` indexes the list of failures returned
# by eval_batched (not the dataset), so it must be smaller than the failure count.
sample_idx = 6
print(failed_stuff_ours[sample_idx]['gold'])
print(failed_stuff_ours[sample_idx]['pred_raw'])

{'full_name': 'Áü≥Â∑ù Â§ßÂú∞', 'company_name': 'Èñ¢Ë•ø„Çπ„ÉÅ„Éº„É´Ê†™Âºè‰ºöÁ§æ', 'address': '„Äí542-0076 Â§ßÈò™Â∫úÂ§ßÈò™Â∏Ç‰∏≠Â§ÆÂå∫Èõ£Ê≥¢5-1-60 „Å™„Çì„Å∞„Éë„Éº„ÇØ„Çπ', 'phone_number': '06-6644-7102'}
{
  "Â•ëÁ¥ÑÁ∑†ÁµêÂπ¥ÊúàÊó•": "‰ª§Âíå7Âπ¥10Êúà11Êó•",
  "Âä¥ÂÉçÊù°‰ª∂ÈÄöÁü•Êõ∏": "Ê†™Âºè‰ºöÁ§æÁü≥Â∑ùÂ§ßÂú∞",
  "‰ΩèÊâÄ": "Â§ßÈò™Â∫úÂ§ßÈò™Â∏Ç‰∏≠Â§ÆÂå∫Èõ£Ê≥¢ 5-1-60„Å™„Çì„Å∞„Éë„Éº„ÇØ„Çπ",
  "ÈõªË©±Áï™Âè∑": "06-6644-7102",
  "Â•ëÁ¥ÑÊúüÈñì": "ÂÆö„ÇÅÁÑ°„Åó",
  "Ë©¶Áî®ÊúüÈñì": "Ôºì„ÅãÊúà",
  "Â∞±Ê•≠„ÅÆÂ†¥ÊâÄ": "Êú¨Á§æ",
  "Â∞ÜÊù•Êã†ÁÇπÁï∞Âãï„ÅÆÂèØËÉΩÊÄß„ÅÇ„Çä": "Âæì‰∫ã„Åô„Åπ„ÅçÊ•≠Âãô",
  "Ê•≠ÂãôÂÜÖÂÆπ": "Áî£Ê•≠Áî®„É≠„Éú„ÉÉ„Éà„ÅÆÁµÑÁ´ã„ÉªÊ§úÊüª„ÉªÂ∑•Á®ãÊîπÂñÑË£úÂä©",
  "‰ºëÊÜ©ÊôÇÈñì": "9:00",
  "‰ºëÊÜ©ÂàÜ": "60ÂàÜ",
  "‰ºëÊó•": "Âúü„ÉªÊó•„ÉªÁ•ù„ÉªÂπ¥Êú´Âπ¥Âßã12/29„Äú1/3",
  "Âπ¥Èñì‰ºëÊöá": "120Êó•",
  "Âπ¥‰ºëÊ≥ïÂÆö": "ÂàùÂπ¥Â∫¶10Êó•Á∂ôÁ∂öÂã§Âãô Ôºñ„ÅãÊúà‰ª•ÂÜÖ„ÅÆÂπ¥‰ºëÊúâÂü∫Êú¨Ë≥ÉÈáëÊúàÁµ¶330,000ÂÜÜË´∏ÊâãÂΩìÈÄöÂã§ÂÆüË≤ªC‰∏äÈôê30,000ÂÜÜ„ÄÅ‰ΩèÂÆÖÊâãÂΩì 10,000ÂÜÜÂâ≤Â¢óË≥ÉÈáëÁéáÊ≥ïÂÆöË∂Ö 25ÔºÖÊ≥ïÂÆö‰

# LoRA SFT

In [21]:
from peft import LoraConfig, get_peft_model, TaskType

# Names of the sub-modules that receive LoRA adapters, grouped by the kind of
# block they are named after. "out_proj" appears in both the MHA and CONV lists;
# the printed target modules in the cell output show it only once.
GLU_MODULES = ["w1", "w2", "w3"]
MHA_MODULES = ["q_proj", "k_proj", "v_proj", "out_proj"]
CONV_MODULES = ["in_proj", "out_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # <- lower values = fewer parameters
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=GLU_MODULES + MHA_MODULES + CONV_MODULES,
    bias="none",
    modules_to_save=None,
)

# Wrap the already-loaded global `model` with the adapters and report how many
# parameters are trainable.
# NOTE(review): PEFT presumably injects the adapters into `model` in place, so
# re-running the "raw model" evaluation cells after this point would no longer
# measure the untouched base model -- confirm.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

print("‚úÖ LoRA configuration applied!")
print(f"üéõÔ∏è  LoRA rank: {lora_config.r}")
print(f"üìä LoRA alpha: {lora_config.lora_alpha}")
print(f"üéØ Target modules: {lora_config.target_modules}")

trainable params: 5,554,176 || all params: 1,175,894,784 || trainable%: 0.4723
‚úÖ LoRA configuration applied!
üéõÔ∏è  LoRA rank: 8
üìä LoRA alpha: 16
üéØ Target modules: {'w2', 'in_proj', 'q_proj', 'w1', 'w3', 'v_proj', 'out_proj', 'k_proj'}


In [22]:
from trl import SFTConfig, SFTTrainer


# Training configuration. Learning-rate settings are read back from the wandb run
# config so that the logged hyper-parameters and the ones actually used stay in sync.
# NOTE(review): `warmup_steps` and `warmup_ratio` are both passed; per the
# transformers TrainingArguments documentation a positive `warmup_steps` presumably
# overrides `warmup_ratio` -- confirm which one is intended and drop the other.
# NOTE(review): `report_to=None` may not disable reporting (the explicit value for
# that is presumably "none") -- confirm the intended reporting target.
lora_sft_config = SFTConfig(
    output_dir=f"./models/{RUN_NAME}/lfm2-sft-lora/",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=wandb.config['learning_rate'],
    lr_scheduler_type=wandb.config['lr_scheduler_type'],
    warmup_steps=wandb.config['warmup_steps'],
    warmup_ratio=wandb.config['warmup_ratio'],
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None,
)

print("üèóÔ∏è  Creating LoRA SFT trainer...")
# Build the trainer on the LoRA-wrapped model. Note that `ds_eval` is used here as
# the trainer's evaluation set and is also the split on which the accuracy numbers
# before and after fine-tuning are computed.
lora_sft_trainer = SFTTrainer(
    model=lora_model,
    args=lora_sft_config,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    processing_class=tokenizer,
)

print("\nüöÄ Starting LoRA + SFT training...")
lora_sft_trainer.train()

print("üéâ LoRA + SFT training completed!")

lora_sft_trainer.save_model()
print(f"üíæ LoRA model saved to: {lora_sft_config.output_dir}")

üèóÔ∏è  Creating LoRA SFT trainer...


Tokenizing train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:00<00:00, 1175.38 examples/s]
Truncating train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:00<00:00, 121087.05 examples/s]
Tokenizing eval dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:00<00:00, 1111.89 examples/s]
Truncating eval dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:00<00:00, 37429.09 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



üöÄ Starting LoRA + SFT training...


Epoch,Training Loss,Validation Loss
1,0.0717,0.059719


üéâ LoRA + SFT training completed!
üíæ LoRA model saved to: ./models/LiquidAI-LFM2-1.2B-Extract-run-przm043/lfm2-sft-lora/


In [23]:
print("\nüîÑ Merging LoRA weights...")
# Merge the LoRA adapters into the base weights and save the resulting model
# together with the tokenizer, so the output directory can be loaded on its own.
merged_model = lora_model.merge_and_unload()
merged_output_dir = f"./models/{RUN_NAME}/lfm2-lora-merged"
merged_model.save_pretrained(merged_output_dir)
tokenizer.save_pretrained(merged_output_dir)
print(f"üíæ Merged model saved to: {merged_output_dir}")


üîÑ Merging LoRA weights...
üíæ Merged model saved to: ./models/LiquidAI-LFM2-1.2B-Extract-run-przm043/lfm2-lora-merged


In [24]:
# Re-run the evaluation on their held-out split with the merged fine-tuned model.
acc_finetuned_theirs, failed_examples_post_theirs = eval_batched(merged_model, ds_eval, 128)
wandb.log({"accuracy_finetuned_model/theirs_simple_dataset": acc_finetuned_theirs})

Evaluating batches:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1/2 [00:00<00:00,  1.08it/s]

Progress: 128 samples, Accuracy so far: 1.0000


Evaluating batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.14it/s]


‚úÖ Final Accuracy: 1.0000
üìä Correct: 200/200





In [25]:
# Re-run the evaluation on our own dataset with the merged fine-tuned model.
# The logged value must be the accuracy from THIS evaluation; previously the
# accuracy on their dataset (`acc_finetuned_theirs`) was logged under this key.
acc_finetuned_ours, failed_examples_post_ours = eval_batched(merged_model, our_ds_eval, 128)
wandb.log({"accuracy_finetuned_model/our_complex_dataset": acc_finetuned_ours})

Evaluating batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:09<00:00, 10.00s/it]


‚úÖ Final Accuracy: 0.9688
üìä Correct: 62/64





In [28]:
# Show every remaining failure on our dataset: gold label vs. parsed prediction
# (`pred` is None when the raw output could not be parsed as a JSON object).
for f in failed_examples_post_ours:
    print(f['gold'])
    print(f['pred'])
    print("=====")

{'full_name': 'Â§ßÂ°ö „Åï„Åè„Çâ', 'company_name': 'Ê†™Âºè‰ºöÁ§æ„Éç„Ç™„Éª„Ç≥„Éº„Éù„É¨„Éº„Ç∑„Éß„É≥', 'address': '„Äí060-0005 ÂåóÊµ∑ÈÅìÊú≠ÂπåÂ∏Ç‰∏≠Â§ÆÂå∫Âåó5Êù°Ë•ø2-5 JR„Çø„ÉØ„Éº', 'phone_number': '011-209-5100'}
None
=====
{'full_name': 'Á¶èÁî∞ Èöº‰∫∫', 'company_name': '„Ç™„Éº„Ç∑„É£„É≥„Éª„Ç≠„É£„Éî„Çø„É´„Éª„Éë„Éº„Éà„Éä„Éº„Ç∫', 'address': '„Äí100-6990 Êù±‰∫¨ÈÉΩÂçÉ‰ª£Áî∞Âå∫‰∏∏„ÅÆÂÜÖ2-6-1 ‰∏∏„ÅÆÂÜÖ„Éñ„É™„ÉÉ„ÇØ„Çπ„ÇØ„Ç®„Ç¢', 'phone_number': '03-3211-8800'}
{'„Éï„É´„Éç„Éº„É†': 'Á¶èÁî∞Èöº‰∫∫', '‰ºöÁ§æÂêç': '„Ç™„Éº„Ç∑„É£„É≥„Éª„Ç≠„É£„Éî„Çø„É´„Éª„Éë„Éº„Éà„Éä„Éº„Ç∫', '‰ΩèÊâÄ': 'Êù±‰∫¨ÈÉΩÂçÉ‰ª£Áî∞Âå∫‰∏∏„ÅÆÂÜÖ2-6-1‰∏∏„ÅÆÂÜÖ„Éñ„É™„ÉÉ„ÇØ„Çπ„ÇØ„Ç®„Ç¢', 'ÈõªË©±Áï™Âè∑': '03-3211-8800'}
=====


In [29]:
# Target repository on the Hugging Face Hub for the merged model and tokenizer.
repo_id = "kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned"

# One line each:
merged_model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Processing Files (1 / 1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.34GB / 2.34GB,  273MB/s  
New Data Upload: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.09GB / 2.09GB,  248MB/s  


CommitInfo(commit_url='https://huggingface.co/kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned/commit/df47251bde078eb7a48763d4ef5e511b8730db13', commit_message='Upload tokenizer', commit_description='', oid='df47251bde078eb7a48763d4ef5e511b8730db13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned'), pr_revision=None, pr_num=None)

In [30]:
wandb.log({"system_prompt": SYSTEM_PROMPT})

In [31]:
# Put the system prompt into a one-column wandb table.
# NOTE(review): the table is built but never passed to `wandb.log` in the visible
# cells, so it presumably never reaches the run -- confirm and log it if intended.
text_table = wandb.Table(columns=["System Prompt"])
text_table.add_data(SYSTEM_PROMPT)

In [32]:
from huggingface_hub import HfApi, upload_file

# Upload the local LICENSE file to the model repository created above.
# NOTE(review): `api` is not used by the `upload_file` call below or anywhere else
# in the visible cells.
api = HfApi()
upload_file(
    path_or_fileobj="models/LICENSE",        # local file path
    path_in_repo="LICENSE",           # destination name in repo
    repo_id=repo_id,
)

CommitInfo(commit_url='https://huggingface.co/kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned/commit/7957179faff0048bde2f853f50004e3a633c4de8', commit_message='Upload LICENSE with huggingface_hub', commit_description='', oid='7957179faff0048bde2f853f50004e3a633c4de8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='kainoj/LiquidAI-LFM2-1.2B-Extract-ja-pii-finetuned'), pr_revision=None, pr_num=None)