In [4]:
import lilac as ll

# Describe a dataset called "glaive" in the "local" namespace whose rows come from the
# `glaiveai/glaive-code-assistant` HuggingFace dataset.
config = ll.DatasetConfig(
  source=ll.HuggingFaceSource(dataset_name='glaiveai/glaive-code-assistant'),
  namespace='local',
  name='glaive',
)

# Materialize the dataset on disk.
# NOTE(review): overwrite=True presumably replaces an existing "local/glaive" dataset
# instead of raising -- confirm against the Lilac docs.
dataset = ll.create_dataset(config, overwrite=True)


Dataset "glaive" written to ./data/datasets/local/glaive


In [2]:
import lilac as ll

# Start the Lilac web server; the captured output below shows Uvicorn serving on
# http://127.0.0.1:5432.
ll.start_server()


  from .autonotebook import tqdm as notebook_tqdm


INFO:     Started server process [52411]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


In [5]:
import re
import subprocess
import lilac as ll

# Matches one fenced Python code block: group 1 is the fence language tag, group 2 is the
# code between the fences. DOTALL lets `.` span newlines, and the lazy `.*?` stops at the
# first closing fence so every code block in an answer is matched on its own.
code_block_re = re.compile('```(py|python)\n(.*?)\n```', re.DOTALL)


def format_code(item):
  """Format the Python code blocks of the "answer" column using the `ruff` formatter.

  Args:
    item: A row of the dataset; only `item['answer']` is read.

  Returns:
    None when the answer is empty or missing a value. Otherwise a dict with:
      'answer': the answer text with each fenced ```py / ```python block replaced by its
        `ruff format` output. Text outside the fences is left untouched.
      'has_edit': True if at least one code block was actually changed.

  Code blocks that `ruff` rejects (non-zero exit status, e.g. a syntax error) are kept
  verbatim. Any other failure, such as the `ruff` binary not being installed, propagates
  to the caller.
  """
  text = item['answer']
  if not text:
    return None

  pieces = []
  cursor = 0
  has_edit = False
  for match in code_block_re.finditer(text):
    code_block = match.group(2)
    if not code_block:
      continue
    try:
      # Call the ruff binary to format the current code block, passing it over stdin.
      formatted_code_block = subprocess.check_output(
        ['ruff', 'format', '-'], input=code_block, encoding='utf-8', stderr=subprocess.DEVNULL
      )
    except subprocess.CalledProcessError:
      # ruff could not format this block; keep the original text for it.
      continue

    # The formatter output ends with a newline, while the captured block excludes the
    # newline that precedes the closing fence. Drop it to avoid inserting a blank line.
    formatted_code_block = formatted_code_block.rstrip('\n')
    if formatted_code_block == code_block:
      continue

    # Splice at the exact span of this block so that identical snippets elsewhere in the
    # answer (e.g. in the surrounding prose) are not rewritten.
    start, end = match.span(2)
    pieces.append(text[cursor:start])
    pieces.append(formatted_code_block)
    cursor = end
    has_edit = True

  pieces.append(text[cursor:])
  return {'answer': ''.join(pieces), 'has_edit': has_edit}


# Open the "local/glaive" dataset and apply `format_code` to its rows, storing the
# returned values in a new "answer_formatted" column.
ds = ll.get_dataset('local', 'glaive')
# NOTE(review): num_jobs=-1 presumably means "use all available cores" (the captured log
# shows the work split into 8 shards) -- confirm against the Lilac docs.
ds.map(format_code, output_column='answer_formatted', num_jobs=-1)
# NOTE(review): markdown=True presumably makes the UI render this field as markdown.
# `format_code` returns a dict, so the formatted text sits under the 'answer' key of
# "answer_formatted" -- confirm whether the media field should point at that nested path.
ds.add_media_field('answer_formatted', markdown=True)


Scheduling task "64c60e46150e4ccab0b40e89ce8872e8": "[local/glaive][shard 0/8] map "format_code" to "answer_formatted"".
Scheduling task "6ffa24216e7245889315d5d9145bc070": "[local/glaive][shard 1/8] map "format_code" to "answer_formatted"".
Scheduling task "806bf7c2e9494ec7a5a84c0bb2d56d41": "[local/glaive][shard 2/8] map "format_code" to "answer_formatted"".
Scheduling task "2bc4872d31604dce87ebba9306788dc0": "[local/glaive][shard 3/8] map "format_code" to "answer_formatted"".
Scheduling task "2e7d2d29a4794cce9589b8b4b6017d7d": "[local/glaive][shard 4/8] map "format_code" to "answer_formatted"".
Scheduling task "6baa0c3fe3844800b8ca61bbb551372c": "[local/glaive][shard 5/8] map "format_code" to "answer_formatted"".
Scheduling task "4d00334e79b4447693ca4159f5e317b7": "[local/glaive][shard 6/8] map "format_code" to "answer_formatted"".
Scheduling task "50f67bc965c3417ebf0f8f921d5101a2": "[local/glaive][shard 7/8] map "format_code" to "answer_formatted"".


[local/glaive][shard 0/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:17<16:00, 124.01it/s]
[local/glaive][shard 4/8] map "format_code" to "answer_formatted":  11%|█         | 15289/136109 [02:14<14:15, 141.21it/s]

Task finished "64c60e46150e4ccab0b40e89ce8872e8": "[local/glaive][shard 0/8] map "format_code" to "answer_formatted"" in 2m24s.


[local/glaive][shard 2/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:25<16:58, 116.91it/s]
[local/glaive][shard 1/8] map "format_code" to "answer_formatted":  12%|█▏        | 16282/136109 [02:23<15:34, 128.26it/s]

Task finished "806bf7c2e9494ec7a5a84c0bb2d56d41": "[local/glaive][shard 2/8] map "format_code" to "answer_formatted"" in 2m32s.


[local/glaive][shard 5/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:24<16:50, 117.90it/s]


Task finished "6baa0c3fe3844800b8ca61bbb551372c": "[local/glaive][shard 5/8] map "format_code" to "answer_formatted"" in 2m33s.


[local/glaive][shard 3/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:24<16:50, 117.91it/s]
[local/glaive][shard 1/8] map "format_code" to "answer_formatted":  12%|█▏        | 16473/136109 [02:24<10:12, 195.35it/s]

Task finished "2bc4872d31604dce87ebba9306788dc0": "[local/glaive][shard 3/8] map "format_code" to "answer_formatted"" in 2m33s.


[local/glaive][shard 6/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:24<16:49, 117.98it/s]
[local/glaive][shard 1/8] map "format_code" to "answer_formatted":  12%|█▏        | 16575/136109 [02:25<08:22, 237.71it/s]

Task finished "4d00334e79b4447693ca4159f5e317b7": "[local/glaive][shard 6/8] map "format_code" to "answer_formatted"" in 2m34s.


[local/glaive][shard 4/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:25<16:58, 116.96it/s]
[local/glaive][shard 1/8] map "format_code" to "answer_formatted":  12%|█▏        | 16850/136109 [02:26<07:50, 253.56it/s]

Task finished "2e7d2d29a4794cce9589b8b4b6017d7d": "[local/glaive][shard 4/8] map "format_code" to "answer_formatted"" in 2m35s.


[local/glaive][shard 1/8] map "format_code" to "answer_formatted":  13%|█▎        | 17014/136109 [02:26<17:08, 115.75it/s]


Task finished "6ffa24216e7245889315d5d9145bc070": "[local/glaive][shard 1/8] map "format_code" to "answer_formatted"" in 2m36s.


[local/glaive][shard 7/8] map "format_code" to "answer_formatted":  12%|█▏        | 17011/136109 [02:26<17:05, 116.16it/s]


Wrote map output to ./data/datasets/local/glaive/./data/datasets/local/glaive/answer_formatted-00000-of-00001.parquet


<lilac.data.dataset_duckdb.DuckDBMapOutput at 0x2bcc694d0>

Task finished "50f67bc965c3417ebf0f8f921d5101a2": "[local/glaive][shard 7/8] map "format_code" to "answer_formatted"" in 2m36s.
