In [4]:
import lilac as ll

# Describe where the data comes from: the Glaive code-assistant dataset on the HuggingFace hub.
glaive_source = ll.HuggingFaceSource(dataset_name='glaiveai/glaive-code-assistant')

# Register it as the "local/glaive" dataset.
config = ll.DatasetConfig(namespace='local', name='glaive', source=glaive_source)

# Load the dataset; `overwrite=True` is passed so re-running this cell replaces an existing copy.
dataset = ll.create_dataset(config, overwrite=True)


Dataset "glaive" written to ./data/datasets/local/glaive


In [None]:
import lilac as ll

# Start the Lilac web server so the dataset can be browsed in the UI. In the recorded run below
# it was served by Uvicorn at http://127.0.0.1:5432.
ll.start_server()


INFO:     Started server process [63957]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


In [3]:
import re
import subprocess
import lilac as ll

# Matches fenced Python code blocks (```py or ```python). Group 1 is the language tag and group 2
# is the code between the fences. The code group is non-greedy so that every fenced block is
# matched on its own; a greedy group would produce a single match running from the first opening
# fence to the last closing fence of the text.
code_block_re = re.compile('```(py|python)\n(.*?)\n```', re.DOTALL)


# Format the code blocks of the "answer" column using the `ruff` formatter.
def format_code(item):
  """Format every fenced Python code block in `item['answer']` with `ruff format`.

  Args:
    item: A row of the dataset; only the 'answer' key is read.

  Returns:
    None when the answer is empty or missing a value. Otherwise a dict with:
      'answer': the answer text with each fenced Python block replaced by its formatted version.
      'has_edit': True only if the returned text differs from the original answer.

  Raises:
    FileNotFoundError: if the `ruff` executable cannot be found (raised by `subprocess`).

  Blocks that ruff rejects (non-zero exit status, e.g. a syntax error) are left untouched.
  """
  text = item['answer']
  if not text:
    return None

  def _format_match(match):
    """Return the matched fenced block with its code formatted, or unchanged on failure."""
    fenced_block = match.group(0)
    code_block = match.group(2)
    if not code_block:
      return fenced_block
    try:
      # Call the ruff binary to format the current code block, feeding it through stdin.
      formatted_code_block = subprocess.check_output(
        ['ruff', 'format', '-'], input=code_block, encoding='utf-8', stderr=subprocess.DEVNULL
      )
    except subprocess.CalledProcessError:
      # Best effort: keep the original block when ruff cannot format it.
      return fenced_block

    # The captured code excludes the newline before the closing fence, while the formatter output
    # ends with one; drop it so no blank line is inserted before the closing fence.
    formatted_code_block = formatted_code_block.rstrip('\n')

    # Splice the formatted code between the original opening and closing fences. Offsets of
    # group 2 are relative to the whole text, so shift them to be relative to this match.
    offset = match.start()
    return (
      fenced_block[: match.start(2) - offset]
      + formatted_code_block
      + fenced_block[match.end(2) - offset :]
    )

  # Replace each match at its own position instead of using `str.replace`, which would also
  # rewrite identical snippets that appear outside of the fenced block.
  new_text = code_block_re.sub(_format_match, text)
  return {'answer': new_text, 'has_edit': new_text != text}


# Load the "local/glaive" dataset.
ds = ll.get_dataset('local', 'glaive')
# Apply `format_code` to the rows and store the results under the new "answer_formatted2" column.
# NOTE(review): `num_jobs=-1` presumably means "use all available cores" (the recorded run was
# split into 8 shards) -- confirm against the Lilac docs.
ds.map(format_code, output_column='answer_formatted2', num_jobs=-1)


Scheduling task "9cde93f95d014498944ca8b496bc0de9": "[local/glaive][shard 0/8] map "format_code" to "answer_formatted2"".
Scheduling task "d16fd9b65a174bef9a997471143903cb": "[local/glaive][shard 1/8] map "format_code" to "answer_formatted2"".
Scheduling task "98d1d909bd304749b4f43272b913767b": "[local/glaive][shard 2/8] map "format_code" to "answer_formatted2"".
Scheduling task "67ea8effc3da423488040d054d4c1183": "[local/glaive][shard 3/8] map "format_code" to "answer_formatted2"".
Scheduling task "1f962dddedb749e19822e91c7e7fad4e": "[local/glaive][shard 4/8] map "format_code" to "answer_formatted2"".
Scheduling task "e651db937e9c442cb31e2426033f63cd": "[local/glaive][shard 5/8] map "format_code" to "answer_formatted2"".
Scheduling task "62281e64571146ef9e79cbf756aff69d": "[local/glaive][shard 6/8] map "format_code" to "answer_formatted2"".
Scheduling task "56a7e8481fac4852afb34867cb1c1dea": "[local/glaive][shard 7/8] map "format_code" to "answer_formatted2"".


[local/glaive][shard 0/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:19<16:19, 121.64it/s] 
[local/glaive][shard 6/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16480/136109 [02:18<13:26, 148.36it/s]

Task finished "9cde93f95d014498944ca8b496bc0de9": "[local/glaive][shard 0/8] map "format_code" to "answer_formatted2"" in 2m27s.


[local/glaive][shard 3/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:20<16:22, 121.21it/s]
[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16368/136109 [02:19<12:08, 164.38it/s]

Task finished "67ea8effc3da423488040d054d4c1183": "[local/glaive][shard 3/8] map "format_code" to "answer_formatted2"" in 2m28s.


[local/glaive][shard 1/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:21<16:28, 120.42it/s]
[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16403/136109 [02:19<13:25, 148.69it/s]

Task finished "d16fd9b65a174bef9a997471143903cb": "[local/glaive][shard 1/8] map "format_code" to "answer_formatted2"" in 2m28s.
Task finished "1f962dddedb749e19822e91c7e7fad4e": "[local/glaive][shard 4/8] map "format_code" to "answer_formatted2"" in 2m28s.


[local/glaive][shard 4/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:20<16:24, 121.00it/s]
[local/glaive][shard 5/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:19<16:19, 121.55it/s]
[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16481/136109 [02:19<11:22, 175.18it/s]

Task finished "e651db937e9c442cb31e2426033f63cd": "[local/glaive][shard 5/8] map "format_code" to "answer_formatted2"" in 2m28s.


[local/glaive][shard 6/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:21<16:27, 120.61it/s]
[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16805/136109 [02:21<07:44, 256.64it/s]

Task finished "62281e64571146ef9e79cbf756aff69d": "[local/glaive][shard 6/8] map "format_code" to "answer_formatted2"" in 2m30s.


[local/glaive][shard 2/8] map "format_code" to "answer_formatted2":  13%|█▎        | 17014/136109 [02:23<16:41, 118.87it/s]
[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 16909/136109 [02:21<09:09, 216.92it/s]

Task finished "98d1d909bd304749b4f43272b913767b": "[local/glaive][shard 2/8] map "format_code" to "answer_formatted2"" in 2m30s.


[local/glaive][shard 7/8] map "format_code" to "answer_formatted2":  12%|█▏        | 17011/136109 [02:22<16:34, 119.71it/s]


Wrote map output to ./data/datasets/local/glaive/./data/datasets/local/glaive/answer_formatted2-00000-of-00001.parquet


<lilac.data.dataset_duckdb.DuckDBMapOutput at 0x2ab503fd0>

Task finished "56a7e8481fac4852afb34867cb1c1dea": "[local/glaive][shard 7/8] map "format_code" to "answer_formatted2"" in 2m31s.
