In [2]:
import lilac as ll

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Start the Lilac server from the notebook (the recorded output shows it
# serving on http://127.0.0.1:5432).
ll.start_server()

<lilac.server.Server at 0x2af17c990>

INFO:     Started server process [1662]
INFO:     Waiting for application startup.
INFO:     ASGI 'lifespan' protocol appears unsupported.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


# Load a dataset from HuggingFace


In [3]:
# Load the HuggingFace dataset into a Lilac dataset handle `ds`; every later
# cell in this notebook operates on `ds`.
# NOTE(review): overwrite=True presumably replaces a previously written local
# copy of the dataset — confirm against the Lilac docs.
ds = ll.from_huggingface('argilla/databricks-dolly-15k-curated-en', overwrite=True)

Dataset "databricks-dolly-15k-curated-en" written to ./data/datasets/local/databricks-dolly-15k-curated-en


# Database selections


In [6]:
# Simple select / limit.
# Request all columns ('*') from `ds` with limit=5, then print each returned row.
rows = ds.select_rows(
  columns=['*'],
  limit=5,
)
for row in rows:
  print(row)

{'id': '5182', 'category': 'closed_qa', 'original-instruction': 'What are the causes of population growth in the Sun Belt in the US?', 'original-context': 'The Sun Belt has seen substantial population growth since post-World War II from an influx of people seeking a warm and sunny climate, a surge in retiring baby boomers, and growing economic opportunities. The advent of air conditioning created more comfortable summer conditions and allowed more manufacturing and industry to locate in the Sun Belt. Since much of the construction in the Sun Belt is new or recent, housing styles and design are often modern and open. Recreational opportunities in the Sun Belt are often not tied strictly to one season, and many tourist and resort cities, such as Fort Lauderdale, Gulf Shores, Houston, Las Vegas, Los Angeles, Miami, Myrtle Beach, New Orleans, Orlando, Palm Springs, Phoenix, San Antonio, San Diego, Tampa, and Tucson support a tourist industry all year.\nThe traditional explanations for the 

In [7]:
# Simple filter.
# Same selection as above, restricted to rows whose 'category' column equals
# 'classification'.
rows_result = ds.select_rows(
  columns=['*'],
  filters=[('category', 'equals', 'classification')],
  limit=5,
)
for row in rows_result:
  print(row)

# NOTE(review): total_num_rows is presumably the number of rows matching the
# filter before `limit` is applied — confirm against the Lilac docs.
print('Total count:', rows_result.total_num_rows)

{'id': '8872', 'category': 'classification', 'original-instruction': 'Which of these drinks are caffeinated: water, coffee, tea, laptop, orange juice, lemonade, pre workout.', 'original-context': '', 'original-response': 'Coffee, tea, and pre workout may have varying amounts of caffeine. Water, orange juice, and lemonade are drinks which don’t have caffeine. A laptop is not a drink at all.', 'external_id': None, '__hfsplit__': 'train', 'new-instruction.user_id': [None], 'new-instruction.value': ['Which of these drinks are caffeinated: water, coffee, tea, laptop, orange juice, lemonade, pre workout.'], 'new-instruction.status': ['submitted'], 'new-context.user_id': [None], 'new-context.value': [''], 'new-context.status': ['submitted'], 'new-response.user_id': [None], 'new-response.value': ['Coffee, tea, and pre workout may have varying amounts of caffeine. Water, orange juice, and lemonade are drinks which don’t have caffeine. A laptop is not a drink at all.'], 'new-response.status': ['

# Histograms


In [8]:
from pprint import pprint

# Simple histogram
# Group the dataset by the 'category' column. In the recorded output,
# `groups.counts` is a list of (category, count) tuples.
groups = ds.select_groups(
  'category',
)
pprint(groups.counts)

[('open_qa', 3611),
 ('general_qa', 2191),
 ('classification', 2136),
 ('closed_qa', 1823),
 ('brainstorming', 1768),
 ('information_extraction', 1512),
 ('summarization', 1263),
 ('creative_writing', 711)]


# Signals


In [9]:
# Compute a signal.
# Runs Lilac's TextStatisticsSignal over the 'original-instruction' field of `ds`.
ds.compute_signal(ll.TextStatisticsSignal(), 'original-instruction', overwrite=True)

Compute signal  TextStatisticsSignal({"signal_name":"text_statistics"}) on databricks-dolly-15k-curated-en:original-instruction: 100%|██████████| 15015/15015 [00:01<00:00, 9652.02it/s] 

Wrote signal output to ./data/datasets/local/databricks-dolly-15k-curated-en/original-instruction/text_statistics





# Custom signal


In [12]:
import re

# Pattern for http:// or https:// URLs: an optional "www." prefix, a host of
# 1-256 allowed characters, a literal dot, a 1-6 character suffix (letters,
# digits or parentheses) ending on a word boundary, then any trailing run of
# path/query characters (captured in the second group).
url_re = re.compile(
  r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
)


class URLHighlighter(ll.TextSignal):
  """Text signal that marks the URLs matched by the module-level `url_re` pattern."""

  name = 'url-highlighter'

  def compute(self, docs: list[str]):
    """Yield one item per document: a `has_url` flag plus the matched `urls` spans."""
    for text in docs:
      # One span per regex match, built from the match's start/end offsets.
      found = [ll.span(match.start(), match.end()) for match in url_re.finditer(text)]
      yield {'has_url': bool(found), 'urls': found}


# Run the custom URLHighlighter signal over the 'original-instruction' field.
ds.compute_signal(URLHighlighter(), 'original-instruction', overwrite=True)

Compute signal  URLHighlighter({"signal_name":"url-highlighter"}) on databricks-dolly-15k-curated-en:original-instruction: 100%|██████████| 15015/15015 [00:00<00:00, 83845.65it/s]


Wrote signal output to ./data/datasets/local/databricks-dolly-15k-curated-en/original-instruction/url-highlighter


In [13]:
# Run the same URLHighlighter signal over the 'original-response' field.
ds.compute_signal(URLHighlighter(), 'original-response', overwrite=True)

Compute signal  URLHighlighter({"signal_name":"url-highlighter"}) on databricks-dolly-15k-curated-en:original-response: 100%|██████████| 15015/15015 [00:00<00:00, 78597.76it/s]


Wrote signal output to ./data/datasets/local/databricks-dolly-15k-curated-en/original-response/url-highlighter


In [4]:
# Compute 'gte-small' embeddings for the 'original-instruction' field; the
# concept search later in the notebook refers to this embedding by name.
# NOTE(review): with use_garden=True the recorded output reports "Computing GTE
# on Lilac Garden", so the embedding presumably runs on a remote service —
# confirm the required credentials/setup.
ds.compute_embedding('gte-small', 'original-instruction', use_garden=True, overwrite=True)

Compute embedding  GTESmall({"embed_input_type":"document","signal_name":"gte-small"}) on databricks-dolly-15k-curated-en:original-instruction:   0%|          | 0/15015 [00:00<?, ?it/s]/Users/nikhil/Code/lilac/.venv/lib/python3.11/site-packages/modal/client.py:154: DeprecationError: 🚨 Version 0.56.4964 of `modal` will be deprecated on July 06, 2024. Please upgrade to the latest version using `pip install --upgrade modal`. 🚨
Compute embedding  GTESmall({"embed_input_type":"document","signal_name":"gte-small"}) on databricks-dolly-15k-curated-en:original-instruction: 100%|██████████| 15015/15015 [00:25<00:00, 600.34it/s]


Computing GTE on Lilac Garden took 24.816s.
hnswlib index creation took 0.001s.
hnswlib add items took 1.393s.
Wrote embedding index to ./data/datasets/local/databricks-dolly-15k-curated-en/original-instruction/gte-small


# Searches


In [1]:
# Select up to 5 rows (row id + 'original-instruction') using a concept search
# over the 'gte-small' embedding of 'original-instruction'.
# Prerequisites from earlier cells: `ds` (dataset load cell), `pprint` (imported
# in the histogram cell) and the 'gte-small' embedding computed on this field.
# NOTE(review): the concept 'local/politics-auto' is not created in the visible
# cells — confirm it exists before running.
# NOTE(review): the recorded output is a NameError for `ds` at execution count
# In [1], which suggests this cell was last run on a fresh kernel before the
# dataset was loaded; re-run the notebook top to bottom.
top_rows = ds.select_rows(
  columns=[ll.ROWID, 'original-instruction'],
  limit=5,
  searches=[
    ll.ConceptSearch(
      path='original-instruction',
      concept_namespace='local',
      concept_name='politics-auto',
      embedding='gte-small',
    )
  ],
  combine_columns=True,
)

for row in top_rows:
  pprint(row)

NameError: name 'ds' is not defined