In [1]:
# Environment sanity check: both imports raise ImportError if NeMo or its ASR
# collection is not installed, so reaching the print means the setup is usable.
# NOTE(review): nemo_asr is not referenced by any cell visible here — confirm it
# is still needed.
import nemo
import nemo.collections.asr as nemo_asr

print("NeMo and ASR module imported successfully!")

NeMo and ASR module imported successfully!


In [None]:
# Base model to convert; used as the `hf://` checkpoint source and as the
# Hugging Face tokenizer name in the cells below.
HF_MODEL_NAME_OR_PATH = "meta-llama/Llama-3.1-8B"

# Every artifact produced by this notebook is written under ROOT_DIR.
ROOT_DIR = "/workspace"
# Destination of the converted NeMo checkpoint.
NEMO_OUTPUT_PATH = ROOT_DIR + "/Llama-3.1-8B-nemo"
# Directory holding the exported JSONL splits and the tokenized outputs.
DATA_PATH = ROOT_DIR + "/SAT_DATA-data"

### Step 1: Convert the Hugging Face model to NeMo checkpoint format

In [None]:
# Run the checkpoint conversion in a separate Python process via the shell.
# IPython substitutes {HF_MODEL_NAME_OR_PATH} and {NEMO_OUTPUT_PATH} with the
# notebook variables before the command is executed, so the config cell must
# have been run first. The one-liner calls llm.import_ckpt on a LlamaModel built
# from Llama31Config8B, reading from the "hf://" source and writing to output_path.
!python -c 'from nemo.collections import llm; llm.import_ckpt(llm.LlamaModel(llm.Llama31Config8B()), source="hf://{HF_MODEL_NAME_OR_PATH}", output_path="{NEMO_OUTPUT_PATH}")'

### Step 2: Prepare the dataset

In [None]:
import json
import os

from datasets import load_dataset

# Load the "SAT_DATA-250" configuration of the "SAT_DATA" dataset
# (the second argument to load_dataset selects the configuration name).
dataset = load_dataset("SAT_DATA", "SAT_DATA-250")

# Create the destination folder; exist_ok=True makes this cell safe to re-run
os.makedirs(DATA_PATH, exist_ok=True)


def save_to_jsonl(file_path, data):
    """Write every item of ``data`` to ``file_path`` as JSON Lines.

    Args:
        file_path: Path of the file to create; an existing file is overwritten.
        data: Iterable of JSON-serializable items, one per output line.
    """
    with open(file_path, "w") as handle:
        # One JSON document per line, each terminated by a newline.
        handle.writelines(json.dumps(record) + "\n" for record in data)


# Splits to export, and the JSONL file each one is written to under DATA_PATH
splits = ["train", "validation", "test"]
file_paths = dict(
    (split, os.path.join(DATA_PATH, f"SAT_DATA-{split}.jsonl")) for split in splits
)

# Write each split that the dataset provides; report the ones it lacks
for split in splits:
    if split not in dataset:
        print(f"Split {split} not found in the dataset.")
        continue
    print(f"Saving {split} split to {file_paths[split]}")
    save_to_jsonl(file_paths[split], dataset[split])

print("Dataset saved to JSONL files.")

In [None]:
# Tokenize the train split with NeMo's Megatron preprocessing script.
# IPython substitutes {DATA_PATH} and {HF_MODEL_NAME_OR_PATH} before the shell runs.
# Input:  {DATA_PATH}/SAT_DATA-train.jsonl
# Output: files prefixed {DATA_PATH}/SAT_DATA_tokenized_train
# NOTE(review): --workers=32 is hard-coded; adjust to the CPU cores available.
!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
    --input="{DATA_PATH}/SAT_DATA-train.jsonl" \
    --tokenizer-library=huggingface \
    --tokenizer-type="{HF_MODEL_NAME_OR_PATH}" \
    --output-prefix="{DATA_PATH}/SAT_DATA_tokenized_train" \
    --append-eod \
    --workers=32

In [None]:
# Tokenize the validation split with NeMo's Megatron preprocessing script.
# IPython substitutes {DATA_PATH} and {HF_MODEL_NAME_OR_PATH} before the shell runs.
# Input:  {DATA_PATH}/SAT_DATA-validation.jsonl
# Output: files prefixed {DATA_PATH}/SAT_DATA_tokenized_val
# NOTE(review): --workers=32 is hard-coded; adjust to the CPU cores available.
!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
    --input="{DATA_PATH}/SAT_DATA-validation.jsonl" \
    --tokenizer-library=huggingface \
    --tokenizer-type="{HF_MODEL_NAME_OR_PATH}" \
    --output-prefix="{DATA_PATH}/SAT_DATA_tokenized_val" \
    --append-eod \
    --workers=32

In [None]:
# Tokenize the test split with NeMo's Megatron preprocessing script.
# IPython substitutes {DATA_PATH} and {HF_MODEL_NAME_OR_PATH} before the shell runs.
# Input:  {DATA_PATH}/SAT_DATA-test.jsonl
# Output: files prefixed {DATA_PATH}/SAT_DATA_tokenized_test
# NOTE(review): --workers=32 is hard-coded; adjust to the CPU cores available.
!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
    --input="{DATA_PATH}/SAT_DATA-test.jsonl" \
    --tokenizer-library=huggingface \
    --tokenizer-type="{HF_MODEL_NAME_OR_PATH}" \
    --output-prefix="{DATA_PATH}/SAT_DATA_tokenized_test" \
    --append-eod \
    --workers=32

After running the above scripts, you will see the preprocessed `/workspace/SAT_DATA-data/SAT_DATA_tokenized_{train/val/test}_text_document.{idx/bin}` files. These output files will be used in the next step.