In [6]:
import palimpzest as pz

In [3]:
# # download tar files with testdata
# !wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/enron-tiny.tar.gz
# !wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/real-estate-eval-5.tar.gz
# !wget -nc https://palimpzest-workloads.s3.us-east-1.amazonaws.com/chroma-biodex.tar.gz

# # open tar files
# !tar -xzf enron-tiny.tar.gz
# !tar -xzf real-estate-eval-5.tar.gz
# !tar -xzf chroma-biodex.tar.gz

In [7]:
# Fields to extract from each email. Every field is a string, so only the
# name and its natural-language description vary between entries.
email_cols = [
    {"name": field_name, "type": str, "desc": field_desc}
    for field_name, field_desc in (
        ("sender", "The email address of the sender"),
        ("subject", "The subject of the email"),
        ("date", "The date the email was sent"),
    )
]

# Lazily describe the computation as a single pipeline: read the text files,
# extract the fields above, then keep only emails sent in July that are about
# holidays. Nothing is executed until `run` is called on the dataset.
dataset = (
    pz.TextFileDataset(id="enron", path="enron-tiny/")
    .sem_map(email_cols)
    .sem_filter("The email was sent in July")
    .sem_filter("The email is about holidays")
)

In [8]:
# QueryProcessorConfig is taken from the top-level `pz` namespace, so no
# separate import from the processor config module is required.
config = pz.QueryProcessorConfig(
    # Model endpoint: a single Qwen2.5-1.5B-Instruct model addressed through
    # an API base on localhost port 8000.
    api_base="http://localhost:8000/v1",
    available_models=["hosted_vllm/Qwen/Qwen2.5-1.5B-Instruct"],
    # Execution settings: max-quality policy, parallel strategy, progress shown.
    policy=pz.MaxQuality(),
    execution_strategy="parallel",
    progress=True,
)

# Run the dataset pipeline under the configuration above.
output = dataset.run(config)


Output()




Total time: 1.77s
Total cost: $0.0461


In [9]:
# Convert the run output to tabular form, keeping only the three extracted
# fields, and render it with Jupyter's rich display.
# Outside Jupyter, use print(output_df) instead of display(output_df).
output_df = output.to_df(
    cols=["date", "sender", "subject"],
)
display(output_df)

Unnamed: 0,sender,subject,date
0,larry.berger@enron.com,Vacation Days in August,"Thu, 26 Jul 2001 06:59:38 -0700 (PDT)"
1,larry.berger@enron.com,Vacation days in July,"Thu, 28 Jun 2001 09:55:24 -0700 (PDT)"


In [5]:
# All figures below come from the same stats object; alias it once.
exec_stats = output.execution_stats

# Optimization phase: time in seconds, cost in dollars.
print(f"Optimization Time: {exec_stats.optimization_time:.2f}s")
print(f"Optimization Cost: ${exec_stats.optimization_cost:.3f}")
print("---")
# Plan execution phase: time in seconds, cost in dollars.
print(f"Plan Execution Time: {exec_stats.plan_execution_time:.2f}s")
print(f"Plan Execution Cost: ${exec_stats.plan_execution_cost:.3f}")

print("Final plan executed:")
print("---")
# The last key of plan_strs is taken to identify the final plan
# (relies on the key order of plan_strs).
plan_strs = exec_stats.plan_strs
final_plan_id = list(plan_strs.keys())[-1]
print(plan_strs[final_plan_id])

Optimization Time: 0.00s
Optimization Cost: $0.000
---
Plan Execution Time: 3.69s
Plan Execution Cost: $0.046
Final plan executed:
---
0. Schema['contents', 'date', 'filename', 'sender', 'subject'] -> LLMFilter -> Schema['contents', 'date', 'filename', 'sender', 'subject']
    (contents, date, filename, send) -> (contents, date, filename, send)
    Model: Model.VLLM_QWEN_2_5_1_5B_INSTRUCT
    Filter: The email is about holidays

  1. Schema['contents', 'date', 'filename', 'sender', 'subject'] -> LLMFilter -> Schema['contents', 'date', 'filename', 'sender', 'subject']
    (contents, date, filename, send) -> (contents, date, filename, send)
    Model: Model.VLLM_QWEN_2_5_1_5B_INSTRUCT
    Filter: The email was sent in July

    2. TextFile -> LLMConvertBonded -> Schema['contents', 'date', 'filename', 'sender', 'subject']
    (contents, filename) -> (contents, date, filename, send)
    Model: Model.VLLM_QWEN_2_5_1_5B_INSTRUCT
    Prompt Strategy: PromptStrategy.COT_QA
    Reasoning Effort