# Host a Lilac instance of a HuggingFace Dataset

This notebook will show you how to host a Lilac instance of a HuggingFace dataset with just a few lines of Python.


In [6]:
from typing import ClassVar, Iterable, Optional, Any
from huggingface_hub import duplicate_space, SpaceHardware, SpaceStorage, HfApi, delete_folder
import lilac as ll
import yaml


In [21]:
class IndentDumper(yaml.Dumper):
  """A YAML dumper that indents list items."""

  def increase_indent(self, flow: bool = False, indentless: bool = False) -> Any:
    """Delegate to the base dumper while always passing `indentless=False`.

    The caller-supplied `indentless` value is accepted for signature compatibility but is not
    forwarded.
    """
    return super().increase_indent(flow=flow, indentless=False)


def to_yaml(input: dict) -> str:
  """Serialize a dictionary to YAML text using `IndentDumper`, without sorting keys."""
  dump_options = {'Dumper': IndentDumper, 'sort_keys': False}
  return yaml.dump(input, **dump_options)


def deploy_dataset(dataset_name: str,
                   target_space: str,
                   config_name: Optional[str] = None,
                   split: Optional[str] = None,
                   private: Optional[bool] = None,
                   token: Optional[str] = None,
                   exist_ok: Optional[bool] = None,
                   hardware: Optional[SpaceHardware] = None,
                   storage: Optional[SpaceStorage] = None,
                   sleep_time: Optional[int] = None) -> None:
  """Duplicate the `lilacai/lilac` space and configure it to load a HuggingFace dataset.

  The function duplicates the template space into `target_space`, uploads a Lilac project config
  that points at the requested HuggingFace dataset, rewrites the space `README.md` metadata, and
  deletes the `data/.cache` folder from the new space. The resulting repo URL is printed.

  Args:
    dataset_name: Name of the HuggingFace dataset to load (passed to `ll.HuggingFaceSource`).
    target_space: Id of the space to create, either `namespace/name` or just `name`.
    config_name: Optional dataset config name, passed to `ll.HuggingFaceSource`.
    split: Optional dataset split, passed to `ll.HuggingFaceSource`.
    private: Forwarded to `duplicate_space`.
    token: HuggingFace token, used for the duplication and for all subsequent repo operations.
    exist_ok: Forwarded to `duplicate_space`.
    hardware: Forwarded to `duplicate_space`.
    storage: Forwarded to `duplicate_space`. When set, the Lilac project directory and the cache
      directories of the space are pointed at `/data`.
    sleep_time: Forwarded to `duplicate_space`.
  """
  variables: dict[str, str] = {'LILAC_LOAD_ON_START_SERVER': 'true'}
  if storage:
    variables['LILAC_PROJECT_DIR'] = '/data'
    variables['TRANSFORMERS_CACHE'] = '/data/.cache'
    variables['XDG_CACHE_HOME'] = '/data/.cache'
    variables['HF_HOME'] = '/data/.huggingface'

  repo_url = duplicate_space(
    from_id='lilacai/lilac',
    to_id=target_space,
    private=private,
    token=token,
    exist_ok=exist_ok,
    hardware=hardware,
    storage=storage,
    sleep_time=sleep_time,
    variables=[{
      'key': key,
      'value': value
    } for key, value in variables.items()])

  # Derive the Lilac namespace/name from the space id. These are kept in their own variables so
  # the `dataset_name` argument (the HuggingFace dataset to load) is not overwritten.
  target_space_splits = target_space.split('/')
  if len(target_space_splits) == 2:
    namespace, lilac_dataset_name = target_space_splits
  else:
    namespace = 'local'
    lilac_dataset_name = target_space_splits[0]

  config = ll.Config(datasets=[
    ll.DatasetConfig(
      namespace=namespace,
      name=lilac_dataset_name,
      source=ll.HuggingFaceSource(dataset_name=dataset_name, config_name=config_name, split=split))
  ])

  # Use the caller's token for every follow-up operation, not only for the duplication.
  hf_api = HfApi(token=token)
  hf_api.upload_file(
    path_or_fileobj=to_yaml(
      config.model_dump(exclude_defaults=True, exclude_none=True, exclude_unset=True)).encode(),
    path_in_repo=f'data/{ll.project.PROJECT_CONFIG_FILENAME}',
    repo_id=target_space,
    repo_type='space',
  )
  hf_api.upload_file(
    path_or_fileobj=('---\n' + to_yaml({
      'title': f'Lilac: {dataset_name}',
      'emoji': '🌷',
      'colorFrom': 'purple',
      'colorTo': 'purple',
      'sdk': 'docker',
      'app_port': 5432,
      'datasets': []
    }) + '\n---').encode(),
    path_in_repo='README.md',
    repo_id=target_space,
    repo_type='space',
  )
  # `repo_type` must be given explicitly: without it the call targets a model repo, not the space.
  hf_api.delete_folder(path_in_repo='data/.cache', repo_id=target_space, repo_type='space')

  print('repo_url=', repo_url)


In [26]:
# Deploy the `ax` config of the `glue` dataset to the `nsthorat-lilac/glue` space.
# `exist_ok` is forwarded to `duplicate_space` by `deploy_dataset`.
deploy_dataset(
  target_space='nsthorat-lilac/glue',
  dataset_name='glue',
  config_name='ax',
  exist_ok=True,
)


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/spaces/lilacai/lilac/duplicate (Request ID: Root=1-65123a4c-207419e666c3596c624d0672;ad4671ff-adfb-44f4-aa07-a1049466989b)

You have exceeded our daily quotas for action: duplicate. We invite you to retry later.