In [14]:
import lilac as ll

# Toy rows: each one carries a variable-length list of question strings.
items = [{'questions': questions} for questions in (['A', 'B'], ['C'], ['D', 'E'])]

# Describe the dataset as local/tutorial2, fed directly from the in-memory rows.
config = ll.DatasetConfig(
  source=ll.DictSource(items),
  name='tutorial2',
  namespace='local',
)

# overwrite=True presumably lets this cell be re-run against an existing dataset.
dataset = ll.create_dataset(config, overwrite=True)

# Last expression of the cell: the resulting frame is rendered as the cell output.
dataset.to_pandas()


Reading from source dict...: 100%|██████████| 3/3 [00:00<00:00, 837.97it/s]

Dataset "tutorial2" written to ./datasets/local/tutorial2





Unnamed: 0,questions.*
0,"[A, B]"
1,[C]
2,"[D, E]"


In [16]:
def add_prefix(question):
  """Return `question` with the literal marker 'Q: ' prepended.

  Args:
    question: A single question string.

  Returns:
    The string 'Q: ' followed by `question`.
  """
  prefix = 'Q: '
  return prefix + question


# Apply add_prefix along the path ('questions', '*'); in the recorded output every
# list element gets the prefix and the result lands in 'questions_with_prefix'.
res = dataset.map(
  add_prefix,
  input_path=('questions', '*'),
  output_column='questions_with_prefix',
  overwrite=True,
)

# Iterate the returned map output and print what it yields.
print([*res])

# Last expression of the cell: show the dataset including the new column.
dataset.to_pandas()


Scheduling task "0dbe18d9b6ed460c92b347258bd12344": "[local/tutorial2][1 shards] map "add_prefix" to "questions_with_prefix"".
Wrote map output to ./datasets/local/tutorial2/questions_with_prefix-00000-of-00001.parquet
[]


[Shard 0/1] map "add_prefix" to "('questions_with_prefix',)": 100%|██████████| 3/3 [00:00<00:00, 420.47it/s]


Unnamed: 0,questions.*,questions_with_prefix.*
0,"[A, B]","[Q: A, Q: B]"
1,[C],[Q: C]
2,"[D, E]","[Q: D, Q: E]"


Task finished "0dbe18d9b6ed460c92b347258bd12344": "[local/tutorial2][1 shards] map "add_prefix" to "questions_with_prefix"" in 4s.


In [10]:
import lilac as ll

# Two sample documents; each embeds a "(Company: <name>)" marker inside its text.
items = [
  {'text': text}
  for text in (
    'Apple Inc. is an American multinational technology company headquartered in Cupertino, California, (Company: Apple Inc.) that designs, develops, and sells consumer electronics, computer software, and online services.',
    'Google LLC is an American multinational technology company (Company: Google LLC) that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware.',
  )
]

# Describe the dataset as local/tutorial3, fed directly from the in-memory rows.
config = ll.DatasetConfig(
  source=ll.DictSource(items),
  name='tutorial3',
  namespace='local',
)
dataset = ll.create_dataset(config, overwrite=True)

# NOTE: this is not the last expression of the cell, so the returned frame is not displayed.
dataset.to_pandas()


def extract_company_with_coordinates(text):
  """Locate every "(Company: <name>)" marker in `text` and return it as a span.

  Args:
    text: The string to scan.

  Returns:
    A list with one `ll.span(start, end, {'Company': <name>})` per marker, where
    start/end are the character offsets of the whole parenthesised marker and
    <name> is the text captured after "Company: ". Empty when nothing matches.
  """
  import re

  marker_re = re.compile(r'\(Company: ([^\)]+)\)')
  spans = []
  for hit in marker_re.finditer(text):
    metadata = {'Company': hit.group(1)}
    spans.append(ll.span(hit.start(), hit.end(), metadata))
  return spans


# Run the span extractor over each 'text' value and store the result as 'company'.
# This is the cell's last expression, so the returned map-output object is displayed.
dataset.map(
  extract_company_with_coordinates,
  input_path='text',
  output_column='company',
  overwrite=True,
)


Reading from source dict...: 100%|██████████| 2/2 [00:00<00:00, 1251.66it/s]




Dataset "tutorial3" written to ./datasets/local/tutorial3
Scheduling task "22c95a8997ef490d8a28e56f9933cf94": "[local/tutorial3][1 shards] map "extract_company_with_coordinates" to "company"".
Wrote map output to ./datasets/local/tutorial3/company-00000-of-00001.parquet


[Shard 0/1] map "extract_company_with_coordinates" to "('company',)": 100%|██████████| 2/2 [00:00<00:00, 849.65it/s]


<lilac.data.dataset_duckdb.DuckDBMapOutput at 0x2cf6fbb10>

In [21]:
# Export the dataset (including the mapped 'company' output) to a pandas DataFrame.
df = dataset.to_pandas()
# Nested fields show up as dotted column names; the recorded output lists
# 'text', 'company.*' and 'company.*.Company'.
print(df.columns)


Index(['text', 'company.*', 'company.*.Company'], dtype='object')


In [18]:
# Company names extracted for the first row only (a one-element Series).
df.head(1)['company.*.Company']


0    [Apple Inc.]
Name: company.*.Company, dtype: object

In [None]:
# Launch the Lilac web server; the recorded output shows Uvicorn listening on http://127.0.0.1:5432.
# NOTE(review): the cell has no execution count, so the call presumably blocks until interrupted — confirm.
ll.start_server()


INFO:     Started server process [65094]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


In [None]:
# Second, identical server launch: the notebook already contains an `ll.start_server()` cell.
# NOTE(review): looks like a leftover duplicate; the recorded output shows the same address
# (http://127.0.0.1:5432) — consider keeping only one of the two cells.
ll.start_server()


INFO:     Started server process [65547]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


In [2]:
import lilac as ll

# Describe the dataset as local/companies, sourced from a Hugging Face dataset.
config = ll.DatasetConfig(
  source=ll.HuggingFaceSource(dataset_name='atmallen/companies_azaria_mitchell'),
  name='companies',
  namespace='local',
)

# overwrite=True presumably lets this cell be re-run against an existing dataset.
dataset = ll.create_dataset(config, overwrite=True)

# Last expression of the cell: the full frame is rendered (pandas truncates the middle rows).
dataset.to_pandas()


  from .autonotebook import tqdm as notebook_tqdm


Dataset "companies" written to ./data/datasets/local/companies


Unnamed: 0,statement,label,__hfsplit__,__rowid__,overview.sbert.*,overview.openai.*,overview.sbert.*.embedding,overview.openai.*.embedding
0,Shell operates in the industry of oil & gas op...,true,train,a151a69e-3efe-4fa8-a0a9-302f8c9a9fe0,,,,
1,Nestle operates in the industry of Banking.,false,train,cd9f8dd0-3ff0-4d45-a8ff-39bcd6a87e9d,,,,
2,Itau Unibanco Holding operates in the industry...,true,train,602ceff5-1973-44fd-911c-fc83a1096948,,,,
3,Raytheon Technologies operates in the industry...,true,train,7990194a-6e50-45b5-97e6-bc2793b95ede,,,,
4,Manulife operates in the industry of insurance.,true,train,8de5f7d2-9d63-42cd-bb64-19f911357166,,,,
...,...,...,...,...,...,...,...,...
1195,Alphabet has headquarters in South Korea.,false,test,aee1baa1-79d5-49cc-8a7a-45199421872a,,,,
1196,"Iberdrola is a holding company, which engages ...",false,test,0e31deea-2000-46ce-9579-50991f953257,,,,
1197,Dow has headquarters in Italy.,false,test,b72e5655-7933-4282-b743-4e405defc58f,,,,
1198,Merck & Co. engages in the provision of health...,true,test,6b39b493-1857-47df-9485-55b21cb9bd1c,,,,


In [3]:
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field

# Enables `response_model`
# client = instructor.patch(OpenAI())
import lilac as ll


# Re-open the local/companies dataset instead of rebuilding it.
dataset = ll.get_dataset('local', 'companies')

# Remove these names (normally injected by IPython) from the notebook namespace when present.
# NOTE(review): presumably required so functions defined in this notebook can be
# serialized by dataset.map — confirm before removing.
for injected_name in ('get_ipython', 'exit', 'quit'):
  if injected_name in globals():
    del globals()[injected_name]
del injected_name  # do not leave the loop variable behind in the namespace


# Structured-output schema; extract_company passes it as `response_model`.
# NOTE(review): pydantic places the class docstring and the Field descriptions in the
# model's JSON schema, which instructor presumably forwards to the LLM — treat that
# text as prompt content and change it deliberately, not cosmetically.
class CompanyDetail(BaseModel):
  """Details of a company."""

  # Required string field (no default is given).
  name: str = Field(description='Company name')
  # Required integer field (no default is given); the description asks for an estimate.
  age: int = Field(description='The estimated age of the company')


def extract_company(text):
  """Ask the chat model for a `CompanyDetail` describing `text` and return it as a dict.

  Args:
    text: The statement sent verbatim as the single user message.

  Returns:
    The `model_dump()` of the `CompanyDetail` instance produced for `text`.
  """
  # A patched client is created on every call; patching is what enables `response_model`.
  patched_client = instructor.patch(OpenAI())
  user_message = {'role': 'user', 'content': text}
  details = patched_client.chat.completions.create(
    model='gpt-3.5-turbo',
    response_model=CompanyDetail,
    messages=[user_message],
  )
  return details.model_dump()


# Run extract_company over the 'statement' field and store the result as 'company'.
# The recorded run covers 1200 items at roughly one per second (with far slower
# stretches), so expect this cell to take a long time.
dataset.map(
  extract_company,
  input_path='statement',
  output_column='company',
)


Scheduling task "3a2ea39c39584c9c96a89654b518064f": "[local/companies][1 shards] map "extract_company" to "company"".


[Shard 0/1] map "extract_company" to "('company',)":  31%|███       | 374/1200 [04:59<11:53,  1.16it/s]/var/folders/zn/brkm752x5673zb89tjjqzzs00000gn/T/ipykernel_69441/3762146513.py:33: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
[Shard 0/1] map "extract_company" to "('company',)":  29%|██▉       | 352/1200 [14:38<42:38:38, 181.04s/it]/var/folders/zn/brkm752x5673zb89tjjqzzs00000gn/T/ipykernel_69441/3762146513.py:33: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
[Shard 0/1] map "extract_company" to "('company',)":  29%|██▉       | 353/1200 [14:39<29:52:32, 126.98s/it]/var/folders/zn/brkm752x5673zb89tjjqzzs00000gn/T/ipykernel_69441/3762146513.py:33: Pyda