In [1]:
import os
import json
import numpy as np
from rouge_score import rouge_scorer, scoring

In [2]:
from datasets import load_dataset
dataset = load_dataset("lawcompany/KLAID", 'ljp')

Downloading data:   0%|          | 0.00/60.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/161192 [00:00<?, ? examples/s]

In [3]:
full_dataset = dataset["train"]
shuffled_dataset = full_dataset.shuffle(seed=42)

train_testvalid = shuffled_dataset.train_test_split(test_size=0.1, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
valid_dataset = test_valid['train']
test_dataset = test_valid['test']

assert len(full_dataset) == len(train_dataset) + len(valid_dataset) + len(test_dataset)


In [4]:
print("Number of data: {}".format(len(full_dataset)))

Number of data: 161192


In [6]:
DATA_DIR = os.path.join("./curated-data")
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)
TRAIN_DS = os.path.join(DATA_DIR, "law-kr-train.jsonl")
VAL_DS = os.path.join(DATA_DIR, "law-kr-val.jsonl")
TEST_DS = os.path.join(DATA_DIR, "law-kr-test.jsonl")


In [7]:

def create_dataset(dataset, output_file):
    with open(output_file, 'w', encoding='utf-8') as outfile:
        prompt = "다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n"
        for data in dataset:
            
            input_text = prompt + data['fact']
            output_text = data['laws_service']
            new_data = {
            "input": input_text,
            "output": output_text
            }
            json.dump(new_data, outfile)
            outfile.write('\n')  # Add a newline after each JSON object

create_dataset(train_dataset,TRAIN_DS)
create_dataset(valid_dataset,VAL_DS)
create_dataset(test_dataset,TEST_DS)

In [8]:
import time
with open(TRAIN_DS, 'r') as infile:
    for line in infile:
        data = json.loads(line)
        print(data)
        break

{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n피고인은 2019. 6. 29. 03:35경 김제시 B 앞 도로에서부터 같은 시 C에 있는 ‘D’ 중식당 주차장 앞 도로에 이르기까지 약 50m 구간에서 혈중알코올농도 0.067%의 술에 취한 상태로 E ‘봉고’ 화물차를 운전하였다. 이로써 피고인은 음주운전금지규정을 2회 이상 위반하였다.', 'output': '도로교통법 제148조의2 제1항,도로교통법 제44조 제1항'}


In [9]:
import time
with open(TEST_DS, 'r') as infile:
    for line in infile:
        data = json.loads(line)
        print(data)
        break

{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n1. 주거침입 피고인은 2018. 7. 12. 18:30경 대구 동구 B건물 C호 피해자 D의 주거지인 원룸에 탑차와 사다리를 이용하여 원룸 창문을 열고 들어 가 피해자의 주거에 침입하였다. 2. 절도 피고인은 위\n 1.항의 일시 및 장소에서 위 피해자의 주민등록증과 운전면허증을 가져가 절취하였다.', 'output': '형법 제319조 제1항,형법 제329조'}


Download llama3.1 .nemo checkpoint

```python
!/usr/local/ngc-cli/ngc registry model download-version "nvidia/nemo/llama-3_1-8b-instruct-nemo:1.0" 
```

In [15]:
%%bash

# Set paths to the model, train, validation and test sets.
MODEL="llama-3-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo"

TRAIN_DS="[./curated-data/law-kr-train.jsonl]"
VALID_DS="[./curated-data/law-kr-val.jsonl]"
TEST_DS="[./curated-data/law-kr-test.jsonl]"
TEST_NAMES="[law]"

SCHEME="lora"
TP_SIZE=1
PP_SIZE=1

rm -rf results
OUTPUT_DIR="./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora"

torchrun --nproc_per_node=1 \
/opt/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
    exp_manager.exp_dir=${OUTPUT_DIR} \
    exp_manager.explicit_log_dir=${OUTPUT_DIR} \
    trainer.devices=1 \
    trainer.num_nodes=1 \
    trainer.precision=bf16-mixed \
    trainer.val_check_interval=20 \
    trainer.max_steps=500 \
    model.megatron_amp_O2=True \
    ++model.mcore_gpt=True \
    model.tensor_model_parallel_size=${TP_SIZE} \
    model.pipeline_model_parallel_size=${PP_SIZE} \
    model.micro_batch_size=1 \
    model.global_batch_size=8 \
    model.restore_from_path=${MODEL} \
    model.data.train_ds.num_workers=0 \
    model.data.validation_ds.num_workers=0 \
    model.data.train_ds.file_names=${TRAIN_DS} \
    model.data.train_ds.concat_sampling_probabilities=[1.0] \
    model.data.validation_ds.file_names=${VALID_DS} \
    model.peft.peft_scheme=${SCHEME}

    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    


[NeMo I 2024-11-13 15:26:31 megatron_gpt_finetuning:56] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-11-13 15:26:31 megatron_gpt_finetuning:57] 
    name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning
    trainer:
      devices: 1
      accelerator: gpu
      num_nodes: 1
      precision: bf16-mixed
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: 9999
      max_steps: 500
      log_every_n_steps: 10
      val_check_interval: 20
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: ./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora
      exp_dir: ./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora
      name: ${name}
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: null
        name: null
      resume_if_exists: true
      resume_ignore_no_checkpoint: true
      create_checkpoint_callback: true
      checkpoint_callback_params:
        monitor: validation_

[NeMo W 2024-11-13 15:26:31 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    
GPU available: True (cuda), used: True


[NeMo I 2024-11-13 15:26:31 dist_ckpt_io:95] Using ('zarr', 1) dist-ckpt save strategy.


TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[NeMo E 2024-11-13 15:26:32 exp_manager:703] exp_manager received explicit_log_dir: ./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora and at least one of exp_dir: ./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora, or version: None. Please note that exp_dir, name, and version will be ignored.
[NeMo W 2024-11-13 15:26:32 exp_manager:630] There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints. Training from scratch.


[NeMo I 2024-11-13 15:26:32 exp_manager:396] Experiments will be logged at results/Meta-llama3.1-8B-Instruct-Kr-law-Lora
[NeMo I 2024-11-13 15:26:32 exp_manager:856] TensorboardLogger has been set up


[NeMo W 2024-11-13 15:26:32 exp_manager:966] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 500. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() doe

[NeMo I 2024-11-13 15:28:10 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-11-13 15:28:10 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-11-13 15:28:10 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-11-13 15:28:10 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-11-13 15:28:10 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-11-13 15:28:10 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-11-13 15:28:10 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-11-13 15:28:10 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-11-13 15:28:10 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-11-13 15:28:10 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-11-13 15:28:10 megatron_init:310] All tensor model parallel group ranks: 

24-11-13 15:28:10 - PID:6393 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 8
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:10 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.


[NeMo I 2024-11-13 15:28:11 megatron_base_model:584] Padded vocab_size: 128256, original vocab_size: 128256, dummy tokens: 0.


[NeMo W 2024-11-13 15:28:11 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:11 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:11 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:11 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 15:28:11 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: use_te_rng_t

[NeMo I 2024-11-13 15:28:32 dist_ckpt_io:95] Using ('zarr', 1) dist-ckpt save strategy.
Loading distributed checkpoint with TensorStoreLoadShardedStrategy
Loading distributed checkpoint directly on the GPU
[NeMo I 2024-11-13 15:29:31 nlp_overrides:1180] Model MegatronGPTSFTModel was successfully restored from /workspace/llama-3-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo.
[NeMo I 2024-11-13 15:29:31 megatron_gpt_finetuning:72] Adding adapter weights to the model for PEFT
[NeMo I 2024-11-13 15:29:31 nlp_adapter_mixins:203] Before adding PEFT params:
      | Name  | Type          | Params | Mode 
    ------------------------------------------------
    0 | model | Float16Module | 8.0 B  | train
    ------------------------------------------------
    0         Trainable params
    8.0 B     Non-trainable params
    8.0 B     Total params
    32,121.045Total estimated model params size (MB)
[NeMo I 2024-11-13 15:29:34 nlp_adapter_mixins:208] After adding PEFT params:
      | Name  | T

[NeMo W 2024-11-13 15:29:34 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:161: You have overridden `MegatronGPTSFTModel.configure_sharded_model` which is deprecated. Please override the `configure_model` hook instead. Instantiation with the newer hook will be created on the device right away and have the right data type depending on the precision setting in the Trainer.
    
[NeMo W 2024-11-13 15:29:34 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:143: You are using the `dataloader_iter` step flavor. If you consume the iterator more than once per step, the `batch_idx` argument in any hook that takes it will not match with the batch index of the last batch consumed. This might have unforeseen effects on callbacks or code that expects to get the correct index. This will also not work well with gradient accumulation. This feature is very experimental and subjec

[NeMo I 2024-11-13 15:29:34 megatron_gpt_sft_model:811] Building GPT SFT validation datasets.
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:116] Building data files
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:525] Processing 1 data files using 2 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:495] Building indexing for fn = ./curated-data/law-kr-val.jsonl
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:507] Saving idx file = ./curated-data/law-kr-val.jsonl.idx.npy
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:509] Saving metadata file = ./curated-data/law-kr-val.jsonl.idx.info
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:535] Time building 1 / 1 mem-mapped files: 0:00:00.272295
[NeMo I 2024-11-13 15:29:34 text_memmap_dataset:525] Processing 1 data files using 2 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:535] Time building 0 / 1 mem-mapped files: 0:00:00.171666
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:158] Loading data files
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:249] Loading ./curated-data/law-kr-val.jsonl
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:161] Time loading 1 mem-mapped files: 0:00:00.004199
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:165] Computing global indices
[NeMo I 2024-11-13 15:29:35 megatron_gpt_sft_model:815] Length of val dataset: 8060
[NeMo I 2024-11-13 15:29:35 megatron_gpt_sft_model:822] Building GPT SFT traing datasets.
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:116] Building data files
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:525] Processing 1 data files using 2 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:495] Building indexing for fn = ./curated-data/law-kr-train.jsonl
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:507] Saving idx file = ./curated-data/law-kr-train.jsonl.idx.npy
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:509] Saving metadata file = ./curated-data/law-kr-train.jsonl.idx.info
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:535] Time building 1 / 1 mem-mapped files: 0:00:00.412288
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:525] Processing 1 data files using 2 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:535] Time building 0 / 1 mem-mapped files: 0:00:00.165531
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:158] Loading data files
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:249] Loading ./curated-data/law-kr-train.jsonl
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:161] Time loading 1 mem-mapped files: 0:00:00.005729
[NeMo I 2024-11-13 15:29:35 text_memmap_dataset:165] Computing global indices


      counts = torch.cuda.LongTensor([1])
    


make: Entering directory '/opt/NeMo/nemo/collections/nlp/data/language_modeling/megatron'
make: Nothing to be done for 'default'.
make: Leaving directory '/opt/NeMo/nemo/collections/nlp/data/language_modeling/megatron'
> building indices for blendable datasets ...
 > sample ratios:
   dataset 0, input: 1, achieved: 1
[NeMo I 2024-11-13 15:29:36 blendable_dataset:67] > elapsed time for building blendable dataset indices: 0.07 (sec)
[NeMo I 2024-11-13 15:29:36 megatron_gpt_sft_model:824] Length of train dataset: 4020
[NeMo I 2024-11-13 15:29:36 megatron_gpt_sft_model:829] Building dataloader with consumed samples: 0
[NeMo I 2024-11-13 15:29:36 megatron_gpt_sft_model:829] Building dataloader with consumed samples: 0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
[NeMo W 2024-11-13 15:29:36 megatron_base_model:1199] Ignoring `trainer.max_epochs` when computing `max_steps` because `trainer.max_steps` is already set to 500.


[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_adapter
[NeMo I 2024-11-13 15:29:36 adapter_mixins:435] Unfrozen adapter : lora_kqv_


  | Name  | Type          | Params | Mode 
------------------------------------------------
0 | model | Float16Module | 8.0 B  | train
------------------------------------------------
10.5 M    Trainable params
8.0 B     Non-trainable params
8.0 B     Total params
32,162.988Total estimated model params size (MB)
[NeMo W 2024-11-13 15:29:36 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=91` in the `DataLoader` to improve performance.
    
[NeMo W 2024-11-13 15:29:36 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `validation_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
    
[NeMo W 2024-11-13 15:29:38 nemo_logging:

Epoch 0: :   4%|▍         | 20/500 [00:26<10:45, reduced_train_loss=1.650, global_step=19.00, consumed_samples=160.0, train_step_timing in s=1.260]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/1008 [00:00<14:36,  1.15it/s][A
Validation DataLoader 0:   0%|          | 2/1008 [00:01<13:10,  1.27it/s][A
Validation DataLoader 0:   0%|          | 3/1008 [00:02<12:39,  1.32it/s][A
Validation DataLoader 0:   0%|          | 4/1008 [00:02<12:21,  1.35it/s][A
Validation DataLoader 0:   0%|          | 5/1008 [00:04<13:24,  1.25it/s][A
Validation DataLoader 0:   1%|          | 6/1008 [00:04<13:12,  1.26it/s][A
Validation DataLoader 0:   1%|          | 7/1008 [00:05<12:58,  1.29it/s][A
Validation DataLoader 0:   1%|          | 8/1008 [00:06<12:57,  1.29it/s][A
Validation DataLoader 0:   1%|          | 9/1008 [00:06<12:45,

Metric val_loss improved. New best score: 1.618
Epoch 0, global step 20: 'validation_loss' reached 1.61843 (best 1.61843), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=1.618-step=20-consumed_samples=160.0.ckpt' as top 1
[NeMo W 2024-11-13 15:42:34 nlp_overrides:480] DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO


Epoch 0: :   8%|▊         | 40/500 [13:24<2:34:07, reduced_train_loss=0.796, global_step=39.00, consumed_samples=320.0, train_step_timing in s=1.220, val_loss=1.620]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/1008 [00:00<14:31,  1.16it/s][A
Validation DataLoader 0:   0%|          | 2/1008 [00:01<12:58,  1.29it/s][A
Validation DataLoader 0:   0%|          | 3/1008 [00:02<12:27,  1.34it/s][A
Validation DataLoader 0:   0%|          | 4/1008 [00:02<12:10,  1.37it/s][A
Validation DataLoader 0:   0%|          | 5/1008 [00:03<13:07,  1.27it/s][A
Validation DataLoader 0:   1%|          | 6/1008 [00:04<12:49,  1.30it/s][A
Validation DataLoader 0:   1%|          | 7/1008 [00:05<12:36,  1.32it/s][A
Validation DataLoader 0:   1%|          | 8/1008 [00:05<12:26,  1.34it/s][A
Validation DataLoader 0:   1%|          | 9/

Metric val_loss improved by 0.975 >= min_delta = 0.001. New best score: 0.643
Epoch 0, global step 40: 'validation_loss' reached 0.64329 (best 0.64329), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.643-step=40-consumed_samples=320.0.ckpt' as top 1


Epoch 0: :   8%|▊         | 40/500 [25:57<4:58:29, reduced_train_loss=0.796, global_step=39.00, consumed_samples=320.0, train_step_timing in s=1.220, val_loss=0.643][NeMo I 2024-11-13 15:55:37 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=1.618-step=20-consumed_samples=160.0.ckpt
[NeMo I 2024-11-13 15:55:38 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=1.618-step=20-consumed_samples=160.0-last.ckpt
Epoch 0: :  12%|█▏        | 60/500 [26:26<3:13:53, reduced_train_loss=0.508, global_step=59.00, consumed_samples=480.0, train_step_timing in s=1.350, val_loss=0.643]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|        

Metric val_loss improved by 0.210 >= min_delta = 0.001. New best score: 0.433
Epoch 0, global step 60: 'validation_loss' reached 0.43287 (best 0.43287), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.433-step=60-consumed_samples=480.0.ckpt' as top 1


Epoch 0: :  12%|█▏        | 60/500 [38:57<4:45:39, reduced_train_loss=0.508, global_step=59.00, consumed_samples=480.0, train_step_timing in s=1.350, val_loss=0.433][NeMo I 2024-11-13 16:08:37 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.643-step=40-consumed_samples=320.0.ckpt
[NeMo I 2024-11-13 16:08:38 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.643-step=40-consumed_samples=320.0-last.ckpt
Epoch 0: :  16%|█▌        | 80/500 [39:26<3:27:02, reduced_train_loss=0.273, global_step=79.00, consumed_samples=640.0, train_step_timing in s=1.230, val_loss=0.433]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|        

Metric val_loss improved by 0.076 >= min_delta = 0.001. New best score: 0.357
Epoch 0, global step 80: 'validation_loss' reached 0.35730 (best 0.35730), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.357-step=80-consumed_samples=640.0.ckpt' as top 1


Epoch 0: :  16%|█▌        | 80/500 [51:57<4:32:48, reduced_train_loss=0.273, global_step=79.00, consumed_samples=640.0, train_step_timing in s=1.230, val_loss=0.357][NeMo I 2024-11-13 16:21:37 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.433-step=60-consumed_samples=480.0.ckpt
[NeMo I 2024-11-13 16:21:38 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.433-step=60-consumed_samples=480.0-last.ckpt
Epoch 0: :  20%|██        | 100/500 [52:26<3:29:47, reduced_train_loss=0.122, global_step=99.00, consumed_samples=800.0, train_step_timing in s=1.280, val_loss=0.357]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|       

Metric val_loss improved by 0.123 >= min_delta = 0.001. New best score: 0.234
Epoch 0, global step 100: 'validation_loss' reached 0.23444 (best 0.23444), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.234-step=100-consumed_samples=800.0.ckpt' as top 1


Epoch 0: :  20%|██        | 100/500 [1:04:57<4:19:48, reduced_train_loss=0.122, global_step=99.00, consumed_samples=800.0, train_step_timing in s=1.280, val_loss=0.234][NeMo I 2024-11-13 16:34:37 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.357-step=80-consumed_samples=640.0.ckpt
[NeMo I 2024-11-13 16:34:38 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.357-step=80-consumed_samples=640.0-last.ckpt
Epoch 0: :  24%|██▍       | 120/500 [1:05:26<3:27:12, reduced_train_loss=0.154, global_step=119.0, consumed_samples=960.0, train_step_timing in s=1.360, val_loss=0.234] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%| 

Metric val_loss improved by 0.038 >= min_delta = 0.001. New best score: 0.197
Epoch 0, global step 120: 'validation_loss' reached 0.19683 (best 0.19683), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.197-step=120-consumed_samples=960.0.ckpt' as top 1


Epoch 0: :  24%|██▍       | 120/500 [1:17:57<4:06:52, reduced_train_loss=0.154, global_step=119.0, consumed_samples=960.0, train_step_timing in s=1.360, val_loss=0.197][NeMo I 2024-11-13 16:47:37 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.234-step=100-consumed_samples=800.0.ckpt
[NeMo I 2024-11-13 16:47:38 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.234-step=100-consumed_samples=800.0-last.ckpt
Epoch 0: :  28%|██▊       | 140/500 [1:18:26<3:21:41, reduced_train_loss=0.147, global_step=139.0, consumed_samples=1120.0, train_step_timing in s=1.300, val_loss=0.197] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0

Metric val_loss improved by 0.026 >= min_delta = 0.001. New best score: 0.171
Epoch 0, global step 140: 'validation_loss' reached 0.17054 (best 0.17054), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.171-step=140-consumed_samples=1120.0.ckpt' as top 1


Epoch 0: :  28%|██▊       | 140/500 [1:31:03<3:54:09, reduced_train_loss=0.147, global_step=139.0, consumed_samples=1120.0, train_step_timing in s=1.300, val_loss=0.171][NeMo I 2024-11-13 17:00:43 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.197-step=120-consumed_samples=960.0.ckpt
[NeMo I 2024-11-13 17:00:44 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.197-step=120-consumed_samples=960.0-last.ckpt
Epoch 0: :  32%|███▏      | 160/500 [1:31:32<3:14:32, reduced_train_loss=0.123, global_step=159.0, consumed_samples=1280.0, train_step_timing in s=1.270, val_loss=0.171] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   

Metric val_loss improved by 0.023 >= min_delta = 0.001. New best score: 0.147
Epoch 0, global step 160: 'validation_loss' reached 0.14738 (best 0.14738), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.147-step=160-consumed_samples=1280.0.ckpt' as top 1


Epoch 0: :  32%|███▏      | 160/500 [1:44:05<3:41:11, reduced_train_loss=0.123, global_step=159.0, consumed_samples=1280.0, train_step_timing in s=1.270, val_loss=0.147][NeMo I 2024-11-13 17:13:45 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.171-step=140-consumed_samples=1120.0.ckpt
[NeMo I 2024-11-13 17:13:47 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.171-step=140-consumed_samples=1120.0-last.ckpt
Epoch 0: :  36%|███▌      | 180/500 [1:44:34<3:05:54, reduced_train_loss=0.101, global_step=179.0, consumed_samples=1440.0, train_step_timing in s=1.260, val_loss=0.147] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0: 

Metric val_loss improved by 0.020 >= min_delta = 0.001. New best score: 0.128
Epoch 0, global step 180: 'validation_loss' reached 0.12773 (best 0.12773), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.128-step=180-consumed_samples=1440.0.ckpt' as top 1


Epoch 0: :  36%|███▌      | 180/500 [1:57:07<3:28:13, reduced_train_loss=0.101, global_step=179.0, consumed_samples=1440.0, train_step_timing in s=1.260, val_loss=0.128][NeMo I 2024-11-13 17:26:47 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.147-step=160-consumed_samples=1280.0.ckpt
[NeMo I 2024-11-13 17:26:48 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.147-step=160-consumed_samples=1280.0-last.ckpt
Epoch 0: :  40%|████      | 200/500 [1:57:35<2:56:23, reduced_train_loss=0.145, global_step=199.0, consumed_samples=1600.0, train_step_timing in s=1.200, val_loss=0.128] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0: 

Metric val_loss improved by 0.009 >= min_delta = 0.001. New best score: 0.119
Epoch 0, global step 200: 'validation_loss' reached 0.11861 (best 0.11861), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.119-step=200-consumed_samples=1600.0.ckpt' as top 1


Epoch 0: :  40%|████      | 200/500 [2:10:05<3:15:08, reduced_train_loss=0.145, global_step=199.0, consumed_samples=1600.0, train_step_timing in s=1.200, val_loss=0.119][NeMo I 2024-11-13 17:39:46 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.128-step=180-consumed_samples=1440.0.ckpt
[NeMo I 2024-11-13 17:39:47 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.128-step=180-consumed_samples=1440.0-last.ckpt
Epoch 0: :  44%|████▍     | 220/500 [2:10:36<2:46:13, reduced_train_loss=0.111, global_step=219.0, consumed_samples=1760.0, train_step_timing in s=1.460, val_loss=0.119] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0: 

Metric val_loss improved by 0.016 >= min_delta = 0.001. New best score: 0.102
Epoch 0, global step 220: 'validation_loss' reached 0.10217 (best 0.10217), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.102-step=220-consumed_samples=1760.0.ckpt' as top 1


Epoch 0: :  44%|████▍     | 220/500 [2:23:11<3:02:14, reduced_train_loss=0.111, global_step=219.0, consumed_samples=1760.0, train_step_timing in s=1.460, val_loss=0.102][NeMo I 2024-11-13 17:52:51 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.119-step=200-consumed_samples=1600.0.ckpt
[NeMo I 2024-11-13 17:52:52 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.119-step=200-consumed_samples=1600.0-last.ckpt
Epoch 0: :  48%|████▊     | 240/500 [2:23:41<2:35:40, reduced_train_loss=0.0694, global_step=239.0, consumed_samples=1920.0, train_step_timing in s=1.300, val_loss=0.102]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0: 

Metric val_loss improved by 0.005 >= min_delta = 0.001. New best score: 0.097
Epoch 0, global step 240: 'validation_loss' reached 0.09727 (best 0.09727), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.097-step=240-consumed_samples=1920.0.ckpt' as top 1


Epoch 0: :  48%|████▊     | 240/500 [2:36:14<2:49:15, reduced_train_loss=0.0694, global_step=239.0, consumed_samples=1920.0, train_step_timing in s=1.300, val_loss=0.0973][NeMo I 2024-11-13 18:05:54 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.102-step=220-consumed_samples=1760.0.ckpt
[NeMo I 2024-11-13 18:05:55 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.102-step=220-consumed_samples=1760.0-last.ckpt
Epoch 0: :  52%|█████▏    | 260/500 [2:36:45<2:24:41, reduced_train_loss=0.0308, global_step=259.0, consumed_samples=2080.0, train_step_timing in s=1.280, val_loss=0.0973]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 

Metric val_loss improved by 0.007 >= min_delta = 0.001. New best score: 0.090
Epoch 0, global step 260: 'validation_loss' reached 0.09008 (best 0.09008), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.090-step=260-consumed_samples=2080.0.ckpt' as top 1


Epoch 0: :  52%|█████▏    | 260/500 [2:51:02<2:37:53, reduced_train_loss=0.0308, global_step=259.0, consumed_samples=2080.0, train_step_timing in s=1.280, val_loss=0.0901][NeMo I 2024-11-13 18:20:42 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.097-step=240-consumed_samples=1920.0.ckpt
[NeMo I 2024-11-13 18:20:43 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.097-step=240-consumed_samples=1920.0-last.ckpt
Epoch 0: :  56%|█████▌    | 280/500 [2:51:32<2:14:47, reduced_train_loss=0.0489, global_step=279.0, consumed_samples=2240.0, train_step_timing in s=1.260, val_loss=0.0901]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.086
Epoch 0, global step 280: 'validation_loss' reached 0.08571 (best 0.08571), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.086-step=280-consumed_samples=2240.0.ckpt' as top 1


Epoch 0: :  56%|█████▌    | 280/500 [3:04:08<2:24:41, reduced_train_loss=0.0489, global_step=279.0, consumed_samples=2240.0, train_step_timing in s=1.260, val_loss=0.0857][NeMo I 2024-11-13 18:33:48 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.090-step=260-consumed_samples=2080.0.ckpt
[NeMo I 2024-11-13 18:33:49 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.090-step=260-consumed_samples=2080.0-last.ckpt
Epoch 0: :  60%|██████    | 300/500 [3:04:38<2:03:05, reduced_train_loss=0.134, global_step=299.0, consumed_samples=2400.0, train_step_timing in s=1.170, val_loss=0.0857] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 

Metric val_loss improved by 0.003 >= min_delta = 0.001. New best score: 0.083
Epoch 0, global step 300: 'validation_loss' reached 0.08294 (best 0.08294), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.083-step=300-consumed_samples=2400.0.ckpt' as top 1


Epoch 0: :  60%|██████    | 300/500 [3:17:09<2:11:26, reduced_train_loss=0.134, global_step=299.0, consumed_samples=2400.0, train_step_timing in s=1.170, val_loss=0.0829][NeMo I 2024-11-13 18:46:49 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.086-step=280-consumed_samples=2240.0.ckpt
[NeMo I 2024-11-13 18:46:50 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.086-step=280-consumed_samples=2240.0-last.ckpt
Epoch 0: :  64%|██████▍   | 320/500 [3:17:38<1:51:10, reduced_train_loss=0.0901, global_step=319.0, consumed_samples=2560.0, train_step_timing in s=1.520, val_loss=0.0829]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0

Metric val_loss improved by 0.005 >= min_delta = 0.001. New best score: 0.078
Epoch 0, global step 320: 'validation_loss' reached 0.07844 (best 0.07844), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.078-step=320-consumed_samples=2560.0.ckpt' as top 1


Epoch 0: :  64%|██████▍   | 320/500 [3:30:10<1:58:13, reduced_train_loss=0.0901, global_step=319.0, consumed_samples=2560.0, train_step_timing in s=1.520, val_loss=0.0784][NeMo I 2024-11-13 18:59:50 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.083-step=300-consumed_samples=2400.0.ckpt
[NeMo I 2024-11-13 18:59:51 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.083-step=300-consumed_samples=2400.0-last.ckpt
Epoch 0: :  68%|██████▊   | 340/500 [3:30:38<1:39:07, reduced_train_loss=0.0893, global_step=339.0, consumed_samples=2720.0, train_step_timing in s=1.270, val_loss=0.0784]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 

Metric val_loss improved by 0.007 >= min_delta = 0.001. New best score: 0.072
Epoch 0, global step 340: 'validation_loss' reached 0.07171 (best 0.07171), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.072-step=340-consumed_samples=2720.0.ckpt' as top 1


Epoch 0: :  68%|██████▊   | 340/500 [3:43:10<1:45:01, reduced_train_loss=0.0893, global_step=339.0, consumed_samples=2720.0, train_step_timing in s=1.270, val_loss=0.0717][NeMo I 2024-11-13 19:12:50 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.078-step=320-consumed_samples=2560.0.ckpt
[NeMo I 2024-11-13 19:12:51 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.078-step=320-consumed_samples=2560.0-last.ckpt
Epoch 0: :  72%|███████▏  | 360/500 [3:43:38<1:26:58, reduced_train_loss=0.0177, global_step=359.0, consumed_samples=2880.0, train_step_timing in s=1.370, val_loss=0.0717] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.067
Epoch 0, global step 360: 'validation_loss' reached 0.06733 (best 0.06733), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.067-step=360-consumed_samples=2880.0.ckpt' as top 1


Epoch 0: :  72%|███████▏  | 360/500 [3:56:07<1:31:49, reduced_train_loss=0.0177, global_step=359.0, consumed_samples=2880.0, train_step_timing in s=1.370, val_loss=0.0673][NeMo I 2024-11-13 19:25:47 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.072-step=340-consumed_samples=2720.0.ckpt
[NeMo I 2024-11-13 19:25:48 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.072-step=340-consumed_samples=2720.0-last.ckpt
Epoch 0: :  76%|███████▌  | 380/500 [3:56:38<1:14:43, reduced_train_loss=0.0278, global_step=379.0, consumed_samples=3040.0, train_step_timing in s=1.270, val_loss=0.0673] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader

Epoch 0, global step 380: 'validation_loss' was not in top 1


Epoch 0: :  76%|███████▌  | 380/500 [4:09:12<1:18:41, reduced_train_loss=0.0278, global_step=379.0, consumed_samples=3040.0, train_step_timing in s=1.270, val_loss=0.0675][NeMo I 2024-11-13 19:38:51 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.067-step=360-consumed_samples=2880.0-last.ckpt
Epoch 0: :  80%|████████  | 400/500 [4:09:40<1:02:25, reduced_train_loss=0.0129, global_step=399.0, consumed_samples=3200.0, train_step_timing in s=1.500, val_loss=0.0675]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/1008 [00:01<18:11,  0.92it/s][A
Validation DataLoader 0:   0%|          | 2/1008 [00:01<14:48,  1.13it/s][A
Validation DataLoader 0:   0%|          | 3/1008 [00:02<13:41,  1.22it/s][A
Validation DataLoad

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.064
Epoch 0, global step 400: 'validation_loss' reached 0.06379 (best 0.06379), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.064-step=400-consumed_samples=3200.0.ckpt' as top 1


Epoch 0: :  80%|████████  | 400/500 [4:22:09<1:05:32, reduced_train_loss=0.0129, global_step=399.0, consumed_samples=3200.0, train_step_timing in s=1.500, val_loss=0.0638][NeMo I 2024-11-13 19:51:49 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.067-step=360-consumed_samples=2880.0.ckpt
[NeMo I 2024-11-13 19:51:51 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.067-step=380-consumed_samples=3040.0-last.ckpt
Epoch 0: :  84%|████████▍ | 420/500 [4:22:40<50:01, reduced_train_loss=0.0341, global_step=419.0, consumed_samples=3360.0, train_step_timing in s=2.670, val_loss=0.0638]  
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 0.062
Epoch 0, global step 420: 'validation_loss' reached 0.06169 (best 0.06169), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.062-step=420-consumed_samples=3360.0.ckpt' as top 1


Epoch 0: :  84%|████████▍ | 420/500 [4:35:12<52:25, reduced_train_loss=0.0341, global_step=419.0, consumed_samples=3360.0, train_step_timing in s=2.670, val_loss=0.0617][NeMo I 2024-11-13 20:04:52 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.064-step=400-consumed_samples=3200.0.ckpt
[NeMo I 2024-11-13 20:04:53 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.064-step=400-consumed_samples=3200.0-last.ckpt
Epoch 0: :  88%|████████▊ | 440/500 [4:35:41<37:35, reduced_train_loss=0.073, global_step=439.0, consumed_samples=3520.0, train_step_timing in s=1.630, val_loss=0.0617]  
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0: 

Epoch 0, global step 440: 'validation_loss' reached 0.06096 (best 0.06096), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=440-consumed_samples=3520.0.ckpt' as top 1


Epoch 0: :  88%|████████▊ | 440/500 [4:48:15<39:18, reduced_train_loss=0.073, global_step=439.0, consumed_samples=3520.0, train_step_timing in s=1.630, val_loss=0.061] [NeMo I 2024-11-13 20:17:55 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.062-step=420-consumed_samples=3360.0.ckpt
[NeMo I 2024-11-13 20:17:56 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.062-step=420-consumed_samples=3360.0-last.ckpt
Epoch 0: :  92%|█████████▏| 460/500 [4:48:44<25:06, reduced_train_loss=0.534, global_step=459.0, consumed_samples=3680.0, train_step_timing in s=1.380, val_loss=0.061]  
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   

Metric val_loss improved by 0.001 >= min_delta = 0.001. New best score: 0.061
Epoch 0, global step 460: 'validation_loss' reached 0.06061 (best 0.06061), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=460-consumed_samples=3680.0.ckpt' as top 1


Epoch 0: :  92%|█████████▏| 460/500 [5:01:15<26:11, reduced_train_loss=0.534, global_step=459.0, consumed_samples=3680.0, train_step_timing in s=1.380, val_loss=0.0606][NeMo I 2024-11-13 20:30:55 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=440-consumed_samples=3520.0.ckpt
[NeMo I 2024-11-13 20:30:56 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=440-consumed_samples=3520.0-last.ckpt
Epoch 0: :  96%|█████████▌| 480/500 [5:01:44<12:34, reduced_train_loss=0.0376, global_step=479.0, consumed_samples=3840.0, train_step_timing in s=1.280, val_loss=0.0606] 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:  

Epoch 0, global step 480: 'validation_loss' reached 0.06033 (best 0.06033), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=480-consumed_samples=3840.0.ckpt' as top 1


Epoch 0: :  96%|█████████▌| 480/500 [5:14:14<13:05, reduced_train_loss=0.0376, global_step=479.0, consumed_samples=3840.0, train_step_timing in s=1.280, val_loss=0.0603][NeMo I 2024-11-13 20:43:53 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=460-consumed_samples=3680.0.ckpt
[NeMo I 2024-11-13 20:43:55 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.061-step=460-consumed_samples=3680.0-last.ckpt
Epoch 0: : 100%|██████████| 500/500 [5:14:44<00:00, reduced_train_loss=0.0137, global_step=499.0, consumed_samples=4e+3, train_step_timing in s=1.530, val_loss=0.0603]  
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1008 [00:00<?, ?it/s][A
Validation DataLoader 0:  

Epoch 0, global step 500: 'validation_loss' reached 0.06030 (best 0.06030), saving model to '/workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=500-consumed_samples=4000.0.ckpt' as top 1


Epoch 0: : 100%|██████████| 500/500 [5:27:16<00:00, reduced_train_loss=0.0137, global_step=499.0, consumed_samples=4e+3, train_step_timing in s=1.530, val_loss=0.0603][NeMo I 2024-11-13 20:56:56 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=480-consumed_samples=3840.0.ckpt
[NeMo I 2024-11-13 20:56:57 nlp_overrides:464] Removing checkpoint: /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=480-consumed_samples=3840.0-last.ckpt


`Trainer.fit` stopped: `max_steps=500` reached.


Epoch 0: : 100%|██████████| 500/500 [5:27:18<00:00, reduced_train_loss=0.0137, global_step=499.0, consumed_samples=4e+3, train_step_timing in s=1.530, val_loss=0.0603]


Restoring states from the checkpoint path at /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=500-consumed_samples=4000.0.ckpt
Restored all states from the checkpoint at /workspace/results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning--validation_loss=0.060-step=500-consumed_samples=4000.0.ckpt


In [16]:
!head -n 128 ./curated-data/law-kr-test.jsonl > ./curated-data/law-kr-test-n128.jsonl

In [18]:
%%bash
MODEL="llama-3-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo"

TEST_DS="[./curated-data/law-kr-test-n128.jsonl]" # Smaller test split
# TEST_DS="[./curated-data/law-qa-test_preprocessed.jsonl]" # Full test set
TEST_NAMES="[law]"

TP_SIZE=1
PP_SIZE=1

# This is where your LoRA checkpoint was saved
PATH_TO_TRAINED_MODEL="./results/Meta-llama3.1-8B-Instruct-Kr-law-Lora/checkpoints/megatron_gpt_peft_lora_tuning.nemo"

# The generation run will save the generated outputs over the test dataset in a file prefixed like so
OUTPUT_PREFIX="law_kr_lora"
torchrun --nproc_per_node=1 \
 /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
    model.restore_from_path=${MODEL} \
    model.peft.restore_from_path=${PATH_TO_TRAINED_MODEL} \
    trainer.devices=1\
    trainer.num_nodes=1 \
    model.data.test_ds.file_names=${TEST_DS} \
    model.data.test_ds.names=${TEST_NAMES} \
    model.data.test_ds.global_batch_size=8 \
    model.data.test_ds.micro_batch_size=1 \
    model.data.test_ds.tokens_to_generate=50 \
    model.tensor_model_parallel_size=${TP_SIZE} \
    model.pipeline_model_parallel_size=${PP_SIZE} \
    inference.greedy=True  \
    model.data.test_ds.output_file_path_prefix=${OUTPUT_PREFIX} \
    model.data.test_ds.write_predictions_to_file=True \
    model.data.test_ds.add_bos=False \
    model.data.test_ds.add_eos=True \
    model.data.test_ds.add_sep=False \
    model.data.test_ds.label_key="output" \
    model.data.test_ds.prompt_template="\{input\}\ \{output\}"

    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    


[NeMo I 2024-11-13 21:43:05 megatron_gpt_generate:125] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-11-13 21:43:05 megatron_gpt_generate:126] 
    name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning
    trainer:
      devices: 1
      accelerator: gpu
      num_nodes: 1
      precision: 16
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_epochs: 9999
      max_steps: 20000
      log_every_n_steps: 10
      val_check_interval: 200
      gradient_clip_val: 1.0
    exp_manager:
      explicit_log_dir: null
      exp_dir: null
      name: ${name}
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: null
        name: null
      resume_if_exists: true
      resume_ignore_no_checkpoint: true
      create_checkpoint_callback: true
      checkpoint_callback_params:
        monitor: validation_${model.data.test_ds.metric.name}
        save_top_k: 1
        mode: max
        save_nemo_o

[NeMo W 2024-11-13 21:43:05 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/_graveyard/precision.py:49: The `MixedPrecisionPlugin` is deprecated. Use `pytorch_lightning.plugins.precision.MixedPrecision` instead.
    
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it 

[NeMo I 2024-11-13 21:43:26 megatron_init:263] Rank 0 has data parallel group : [0]
[NeMo I 2024-11-13 21:43:26 megatron_init:269] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-11-13 21:43:26 megatron_init:274] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-11-13 21:43:26 megatron_init:277] Ranks 0 has data parallel rank: 0
[NeMo I 2024-11-13 21:43:26 megatron_init:285] Rank 0 has context parallel group: [0]
[NeMo I 2024-11-13 21:43:26 megatron_init:288] All context parallel group ranks: [[0]]
[NeMo I 2024-11-13 21:43:26 megatron_init:289] Ranks 0 has context parallel rank: 0
[NeMo I 2024-11-13 21:43:26 megatron_init:296] Rank 0 has model parallel group: [0]
[NeMo I 2024-11-13 21:43:26 megatron_init:297] All model parallel group ranks: [[0]]
[NeMo I 2024-11-13 21:43:26 megatron_init:306] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-11-13 21:43:26 megatron_init:310] All tensor model parallel group ranks: 

[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: use_te_rng_t

[NeMo I 2024-11-13 21:43:26 tokenizer_utils:178] Getting HuggingFace AutoTokenizer with pretrained_model_name: meta-llama/Meta-Llama-3-8B


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[NeMo I 2024-11-13 21:43:26 megatron_base_model:584] Padded vocab_size: 128256, original vocab_size: 128256, dummy tokens: 0.


[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: moe_extended_tp in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-13 21:43:26 megatron_base_model:1158] The model: MegatronGPTSFTModel() does not have field.name: use_te_rng_t

[NeMo I 2024-11-13 21:43:52 dist_ckpt_io:95] Using ('zarr', 1) dist-ckpt save strategy.
Loading distributed checkpoint with TensorStoreLoadShardedStrategy
Loading distributed checkpoint directly on the GPU
[NeMo I 2024-11-13 21:44:57 nlp_overrides:1180] Model MegatronGPTSFTModel was successfully restored from /workspace/llama-3-8b-instruct-nemo_v1.0/llama3_1_8b_instruct.nemo.
[NeMo I 2024-11-13 21:44:57 nlp_adapter_mixins:203] Before adding PEFT params:
      | Name  | Type     | Params | Mode 
    -------------------------------------------
    0 | model | GPTModel | 8.0 B  | train
    -------------------------------------------
    0         Trainable params
    8.0 B     Non-trainable params
    8.0 B     Total params
    32,121.045Total estimated model params size (MB)
[NeMo I 2024-11-13 21:45:00 nlp_adapter_mixins:208] After adding PEFT params:
      | Name  | Type     | Params | Mode 
    -------------------------------------------
    0 | model | GPTModel | 8.0 B  | train
    --

[NeMo W 2024-11-13 21:45:00 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:161: You have overridden `MegatronGPTSFTModel.configure_sharded_model` which is deprecated. Please override the `configure_model` hook instead. Instantiation with the newer hook will be created on the device right away and have the right data type depending on the precision setting in the Trainer.
    
[NeMo W 2024-11-13 21:45:00 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:143: You are using the `dataloader_iter` step flavor. If you consume the iterator more than once per step, the `batch_idx` argument in any hook that takes it will not match with the batch index of the last batch consumed. This might have unforeseen effects on callbacks or code that expects to get the correct index. This will also not work well with gradient accumulation. This feature is very experimental and subjec

[NeMo I 2024-11-13 21:45:00 megatron_gpt_sft_model:803] Building GPT SFT test datasets.
[NeMo I 2024-11-13 21:45:00 text_memmap_dataset:116] Building data files
[NeMo I 2024-11-13 21:45:00 text_memmap_dataset:525] Processing 1 data files using 46 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[NeMo I 2024-11-13 21:45:02 text_memmap_dataset:495] Building indexing for fn = ./curated-data/law-kr-test-n128.jsonl
[NeMo I 2024-11-13 21:45:02 text_memmap_dataset:507] Saving idx file = ./curated-data/law-kr-test-n128.jsonl.idx.npy
[NeMo I 2024-11-13 21:45:02 text_memmap_dataset:509] Saving metadata file = ./curated-data/law-kr-test-n128.jsonl.idx.info
[NeMo I 2024-11-13 21:45:02 text_memmap_dataset:535] Time building 1 / 1 mem-mapped files: 0:00:01.804854
[NeMo I 2024-11-13 21:45:02 text_memmap_dataset:525] Processing 1 data files using 46 workers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[NeMo I 2024-11-13 21:45:04 text_memmap_dataset:535] Time building 0 / 1 mem-mapped files: 0:00:01.893652
[NeMo I 2024-11-13 21:45:04 text_memmap_dataset:158] Loading data files
[NeMo I 2024-11-13 21:45:04 text_memmap_dataset:249] Loading ./curated-data/law-kr-test-n128.jsonl
[NeMo I 2024-11-13 21:45:04 text_memmap_dataset:161] Time loading 1 mem-mapped files: 0:00:00.004577
[NeMo I 2024-11-13 21:45:04 text_memmap_dataset:165] Computing global indices
[NeMo I 2024-11-13 21:45:04 megatron_gpt_sft_model:806] Length of test dataset: 128
[NeMo I 2024-11-13 21:45:04 megatron_gpt_sft_model:829] Building dataloader with consumed samples: 0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
[NeMo W 2024-11-13 21:45:04 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=91` in the `DataLoader` to improve performance.
    
[NeMo W 2024-11-13 21:45:04 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:149: Found `dataloader_iter` argument in the `test_step`. Note that the support for this signature is experimental and the behavior is subject to change.
    
    
      input_info_tensor = torch.cuda.FloatTensor(input_info)
    
      string_tensor = torch.as_tensor(
    


Testing DataLoader 0: 100%|██████████| 16/16 [09:44<00:00,  0.03it/s][NeMo I 2024-11-13 21:54:48 megatron_gpt_sft_model:561] Total deduplicated inference data size: 128 to 128
[NeMo I 2024-11-13 21:54:48 megatron_gpt_sft_model:712] Predictions saved to law_kr_lora_test_law_inputs_preds_labels.jsonl


[NeMo W 2024-11-13 21:54:48 megatron_gpt_sft_model:652] No training data found, reconfiguring microbatches based on validation batch sizes.
[NeMo W 2024-11-13 21:54:48 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
    
[NeMo W 2024-11-13 21:54:48 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_loss_law', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
    
[NeMo W 2024-11-13 21:54:48 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_loss', ..., sync_

Testing DataLoader 0: 100%|██████████| 16/16 [09:44<00:00,  0.03it/s]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m       Test metric       [0m[1m [0m┃[1m [0m[1m      DataLoader 0       [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│[36m [0m[36m        test_loss        [0m[36m [0m│[35m [0m[35m   0.05774552375078201   [0m[35m [0m│
│[36m [0m[36m      test_loss_law      [0m[36m [0m│[35m [0m[35m   0.05774552375078201   [0m[35m [0m│
│[36m [0m[36m        val_loss         [0m[36m [0m│[35m [0m[35m   0.05774552375078201   [0m[35m [0m│
└───────────────────────────┴───────────────────────────┘


In [19]:
!ls

00_NIMs.ipynb
00_NeMo_finetuning.ipynb
00_NeMo_finetuning.zip
01_NVIDIAAISolutions소개.pdf
01_RAG.ipynb
02_Advacned_RAG.ipynb
02_H100TensorCore를활용한학습가속화.pdf
03_분산학습을통한학습가속화.pdf
04_프로그램병목개선을위한GPU프로파일링활용.pdf
05_GenerativeAI학습플랫폼NVIDIANeMo.pdf
06_LLM추론최적화를위한TensorRT-LLM.pdf
07_NVIDIAAIEnterprise및NIM활용.pdf
Dockerfile
Dockerfile.addpackages
Dockerfile.custompytorch
NIMs.zip
NeMo
NeMo_finetuning.ipynb
TRT-LLM-AICA.zip
curated-data
law_kr_lora_test_law_inputs_preds_labels.jsonl
llama-3-8b-instruct-nemo_v1.0
llama3-lora-nemofw.ipynb
llama3_1_8b_instruct.nemo
ngc-cli
ngc-cli.md5
ngccli_linux.zip
pubmedqa
results
workspace


In [20]:
Pred_label_path = "law_kr_lora_test_law_inputs_preds_labels.jsonl"
with open(Pred_label_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        data = json.loads(line)
        print(data)

{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n1. 주거침입 피고인은 2018. 7. 12. 18:30경 대구 동구 B건물 C호 피해자 D의 주거지인 원룸에 탑차와 사다리를 이용하여 원룸 창문을 열고 들어 가 피해자의 주거에 침입하였다. 2. 절도 피고인은 위\n 1.항의 일시 및 장소에서 위 피해자의 주민등록증과 운전면허증을 가져가 절취하였다.', 'pred': ' 형법 제319조 제1항,형법 제329조', 'label': ' 형법 제319조 제1항,형법 제329조'}
{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n피고인은 2022. 3. 30. 02:09경 경남 김해시 B모텔 앞 도로에서부터 같은 시 C에 있는 D 앞 도로에 이르기까지 약 1.5km 구간에서 혈중알코올농도 0.124%의 술에 취한 상태로 (차량번호 1 생략) 피아트 승용차를 운전하였다.', 'pred': ' 도로교통법 제148조의2 제3항 제2호,도로교통법 제44조 제1항', 'label': ' 도로교통법 제148조의2 제3항 제2호,도로교통법 제44조 제1항'}
{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n피고인은 2019. 6. 21. 23:20경 김해시 B 앞 도로부터 C에 있는 D 부근 도로에 이르기까지 약 200미터 구간에서 혈중알콜농도 0.121%의 술에 취한 상태로 E 그랜저 승용차를 운전하였다.', 'pred': ' 도로교통법 제148조의2 제3항 제2호,도로교통법 제44조 제1항', 'label': ' 도로교통법 제148조의2 제1항,도로교통법 제44조 제1항'}
{'input': '다음 법률 사실에 관련된 법률 조항을 알려주세요:\n\n피고인은 서울 강동구 B 3층에서 ‘C’라는 상호의 업소를 운영하는 자이다. 피고인은 2014. 10. 29.부터 같은 달 30.까지 위 ‘C’ 업소에서 성매매 여성 D을 고용한 후, 그곳을 찾은 남자 손님들로부터 8만 원에서 12만 원의 대금을 받고 그

In [21]:
def compute_rouge(input_file: str) -> dict:
    ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()
    lines = [json.loads(line) for line in open(input_file)]
    num_response_words = []
    num_ref_words = []
    for idx, line in enumerate(lines):
        prompt = line['input']
        response = line['pred']
        answer = line['label']
        scores = scorer.score(response, answer)
        aggregator.add_scores(scores)
        num_response_words.append(len(response.split()))
        num_ref_words.append(len(answer.split()))

    result = aggregator.aggregate()
    rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
    print(rouge_scores)
    print(f"Average and stddev of response length: {np.mean(num_response_words):.2f}, {np.std(num_response_words):.2f}")
    print(f"Average and stddev of ref length: {np.mean(num_ref_words):.2f}, {np.std(num_ref_words):.2f}")

    return rouge_scores

In [22]:
compute_rouge(Pred_label_path)

{'rouge1': 90.3895, 'rouge2': 74.0378, 'rougeL': 89.7584, 'rougeLsum': 89.7422}
Average and stddev of response length: 4.20, 1.71
Average and stddev of ref length: 4.36, 1.91


{'rouge1': 90.3895, 'rouge2': 74.0378, 'rougeL': 89.7584, 'rougeLsum': 89.7422}