In [1]:
from typing import Optional, Type
from src.settings import HyperparameterConfig
import yaml
import os


def load_yaml_config(config_file: str) -> dict:
    """
    Read a YAML file from disk and return its parsed content.

    Args:
        config_file: Path to the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file. The result is not
        validated here: an empty file parses to ``None`` and a top-level
        sequence parses to a list, so callers that need a mapping should
        check the type themselves.

    Raises:
        FileNotFoundError: If ``config_file`` is not an existing regular file
            (this includes directories).
    """
    if not os.path.isfile(config_file):
        raise FileNotFoundError(f"Config file not found: {config_file}")

    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # so non-ASCII values load the same way on every machine.
    with open(config_file, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def load_hyperparameters(
    config_file: Optional[str] = "/teamspace/studios/this_studio/legal-chatbot/configs/training.yaml",
    config_class: Type[HyperparameterConfig] = HyperparameterConfig
) -> HyperparameterConfig:
    """
    Build a hyperparameter config object from the ``hyperparameters`` section of a YAML file.

    Args:
        config_file: Path to the YAML config. Pass ``None`` to skip the file
            and instantiate ``config_class`` with no arguments.
        config_class: Class to instantiate. Defaults to HyperparameterConfig,
            but a subclass (e.g., LlamaHyperparameterConfig) can be passed.

    Returns:
        ``config_class()`` when ``config_file`` is ``None``; otherwise
        ``config_class(**section)`` where ``section`` is the value stored under
        the top-level ``hyperparameters`` key.

    Raises:
        FileNotFoundError: Propagated from ``load_yaml_config`` when the file
            does not exist.
        ValueError: If the top level of the YAML document is not a mapping.
        KeyError: If the document has no ``hyperparameters`` key.
        TypeError: If the ``hyperparameters`` value is not a mapping (for
            example the key is present but left empty).
    """
    if config_file is None:
        print(f"No config file provided. Using default {config_class.__name__} settings.")
        return config_class()

    yaml_config = load_yaml_config(config_file)

    if not isinstance(yaml_config, dict):
        raise ValueError("Configuration file content must be a dictionary at the top level.")

    if 'hyperparameters' not in yaml_config:
        raise KeyError("'hyperparameters' section not found in configuration file.")

    section = yaml_config['hyperparameters']

    # ``**`` unpacking of a non-mapping already raises TypeError, but with a
    # message that never mentions the config file. Fail with the same
    # exception type and an explanation that points at the offending section.
    if not isinstance(section, dict):
        raise TypeError(
            "'hyperparameters' section must be a mapping of option names to values, "
            f"got {type(section).__name__}."
        )

    return config_class(**section)


from src.settings import QwenHyperparameterConfig


# Load the training hyperparameters into the Qwen-specific config class and
# display the resulting object as the cell output.
hparams = load_hyperparameters(
    "/teamspace/studios/this_studio/legal-chatbot/configs/training.yaml",
    config_class=QwenHyperparameterConfig,
)
hparams

QwenHyperparameterConfig(per_device_train_batch_size=1, gradient_accumulation_steps=8, max_steps=120, warmup_steps=10, learning_rate=2e-05, embedding_learning_rate=1e-05, fp16=False, bf16=True, logging_steps=1, optim='adamw_8bit', weight_decay=0.01, lr_scheduler_type='linear', seed=42, output_dir='outputs', report_to='none')

In [2]:
from datasets import load_dataset
from dotenv import load_dotenv
import os

# load_dotenv() runs first so that HF_TOKEN can come from a local .env file
# (assumption: the token is defined there or already exported — confirm).
load_dotenv()
ds = load_dataset(
    "DuongTrongChi/luatvn-split-v_0.2.0",
    "split",
    split="train",
    token=os.environ.get("HF_TOKEN"),
)
ds

Resolving data files:   0%|          | 0/36 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/36 [00:00<?, ?files/s]

train-00000-of-00036.parquet:   0%|          | 0.00/129M [00:00<?, ?B/s]

train-00001-of-00036.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00002-of-00036.parquet:   0%|          | 0.00/92.0M [00:00<?, ?B/s]

train-00003-of-00036.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

train-00004-of-00036.parquet:   0%|          | 0.00/41.1M [00:00<?, ?B/s]

train-00005-of-00036.parquet:   0%|          | 0.00/78.1M [00:00<?, ?B/s]

train-00006-of-00036.parquet:   0%|          | 0.00/155M [00:00<?, ?B/s]

train-00007-of-00036.parquet:   0%|          | 0.00/75.6M [00:00<?, ?B/s]

train-00008-of-00036.parquet:   0%|          | 0.00/114M [00:00<?, ?B/s]

train-00009-of-00036.parquet:   0%|          | 0.00/145M [00:00<?, ?B/s]

train-00010-of-00036.parquet:   0%|          | 0.00/97.1M [00:00<?, ?B/s]

train-00011-of-00036.parquet:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

train-00012-of-00036.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

train-00013-of-00036.parquet:   0%|          | 0.00/50.5M [00:00<?, ?B/s]

train-00014-of-00036.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

train-00015-of-00036.parquet:   0%|          | 0.00/139M [00:00<?, ?B/s]

train-00016-of-00036.parquet:   0%|          | 0.00/92.8M [00:00<?, ?B/s]

train-00017-of-00036.parquet:   0%|          | 0.00/77.4M [00:00<?, ?B/s]

train-00018-of-00036.parquet:   0%|          | 0.00/128M [00:00<?, ?B/s]

train-00019-of-00036.parquet:   0%|          | 0.00/90.6M [00:00<?, ?B/s]

train-00020-of-00036.parquet:   0%|          | 0.00/143M [00:00<?, ?B/s]

train-00021-of-00036.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00022-of-00036.parquet:   0%|          | 0.00/82.6M [00:00<?, ?B/s]

train-00023-of-00036.parquet:   0%|          | 0.00/124M [00:00<?, ?B/s]

train-00024-of-00036.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

train-00025-of-00036.parquet:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

train-00026-of-00036.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

train-00027-of-00036.parquet:   0%|          | 0.00/92.1M [00:00<?, ?B/s]

train-00028-of-00036.parquet:   0%|          | 0.00/111M [00:00<?, ?B/s]

train-00029-of-00036.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

train-00030-of-00036.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00031-of-00036.parquet:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

train-00032-of-00036.parquet:   0%|          | 0.00/116M [00:00<?, ?B/s]

train-00033-of-00036.parquet:   0%|          | 0.00/127M [00:00<?, ?B/s]

train-00034-of-00036.parquet:   0%|          | 0.00/148M [00:00<?, ?B/s]

train-00035-of-00036.parquet:   0%|          | 0.00/165M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/152823 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

Dataset({
    features: ['doc_property', 'doc_relate', 'doc_relate_diagram', 'id', 'text', 'html', 'property', 'extract'],
    num_rows: 152823
})

In [14]:
def f1(example):
    """
    Reduce one raw record to its text plus a small metadata dict.

    The metadata dict always has the keys DocName, DocIdentity, OrganName and
    EffectDate (in that order), each read from ``example['doc_property']``
    with ``None`` as the fallback when the key is absent.

    Returns:
        ``{'text': <example['text']>, 'metadata': <dict described above>}``
    """
    doc_property = example['doc_property']
    wanted = ('DocName', 'DocIdentity', 'OrganName', 'EffectDate')
    metadata = {field: doc_property.get(field, None) for field in wanted}

    return {'text': example['text'], 'metadata': metadata}

# Replace every original column with the two fields produced by f1.
# `remove_columns` is documented as taking column names (str or list of str),
# so pass the list of names rather than the Features mapping.
_ds = ds.map(f1, remove_columns=ds.column_names)
_ds

Dataset({
    features: ['text', 'metadata'],
    num_rows: 152823
})

In [15]:
# Keep only rows whose text is not the empty string.
_ds = _ds.filter(lambda row: row['text'] != '')
_ds

Filter:   0%|          | 0/152823 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'metadata'],
    num_rows: 152769
})

In [16]:
# Keep only rows whose text is not the literal string 'None'
# (the placeholder string, not a Python None value).
_ds = _ds.filter(lambda row: row['text'] != 'None')
_ds

Filter:   0%|          | 0/152769 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'metadata'],
    num_rows: 96716
})

In [19]:
# Keep only rows whose text is not the HTML-wrapped placeholder "<p>None</p>".
_ds = _ds.filter(lambda row: row['text'] != "<p>None</p>")
_ds

Filter:   0%|          | 0/96716 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'metadata'],
    num_rows: 96716
})

In [None]:
def format_metadata(metadata: dict) -> str:
    """
    Render a metadata mapping as a ``<metadata>`` block.

    The result starts with ``<metadata>`` on its own line, followed by one
    ``key: value`` line per entry in the mapping's iteration order, and ends
    with ``</metadata>`` (no trailing newline). Values are formatted with
    their default string conversion, so ``None`` is written as ``None``.
    """
    entries = "".join(f"{key}: {value}\n" for key, value in metadata.items())
    return "<metadata>\n" + entries + "</metadata>"


def f2(example):
    """
    Append the formatted metadata block to a record's text.

    Returns:
        ``{'text': ...}`` where the value is ``example['text']`` immediately
        followed by ``format_metadata(example['metadata'])`` (no separator is
        inserted between the two).
    """
    body = example['text']
    metadata_block = format_metadata(example['metadata'])

    return {'text': body + metadata_block}

# Rewrite each row's text so it ends with its formatted metadata block.
temp_ds = _ds.map(function=f2)
temp_ds

Map:   0%|          | 0/96716 [00:00<?, ? examples/s]

In [21]:
# Read the access token from the HF_TOKEN environment variable instead of
# embedding it in the notebook; a token literal in a shared notebook is a
# leaked credential. Any token previously committed here should be revoked.
# NOTE(review): HF_TOKEN must have write access to the target repo — confirm.
temp_ds.push_to_hub("DuongTrongChi/legal-pretrain", "processed", token=os.getenv("HF_TOKEN"))

NameError: name 'temp_ds' is not defined