In [1]:
from tempo_ql import GenericDataset, formats, QueryEngine, FileVariableStore
import duckdb
import numpy as np
import os
from pathlib import Path
import pandas as pd

In [2]:
# GCP project in which to run queries - make sure it has access to MIMIC-IV through physionet.org
project_id = "ai-clinician"
# name of a dataset within your project to store temporary results. Required if you plan to subset the data to run queries
scratch_dataset = None
# directory to store temporary variables
variable_store_dir = "mimiciv_data"

# provide your Gemini API key here if you'd like to try the LLM-assisted authoring workflow
gemini_api_key = open("gemini_key.txt").read().strip()

In [3]:
# Initialize query engine and variable store
dataset = GenericDataset(f'bigquery://{project_id}', formats.mimiciv(), 
                        scratch_schema_name=f'{project_id}.{scratch_dataset}' if scratch_dataset is not None else None)

if not os.path.exists(variable_store_dir): os.mkdir(variable_store_dir)
var_store = FileVariableStore(variable_store_dir)
query_engine = QueryEngine(dataset, variable_stores=[var_store])


In [4]:
from tempo_ql.ai_assistant import AIAssistant

ai_assistant = AIAssistant(query_engine, api_key=gemini_api_key)

In [None]:
question = "Extract a boolean value for each diagnosis indicating whether it is related to diabetes. Use the following regex to look for ICD-9/10 codes related to diabetes: '\\b(?:40[1-5]|I1[01235])'"
sql_query = ai_assistant.process_sql_question(question=question).get('extracted_query')

print(sql_query)

üîç Needs existing query for SQL generation: False
```sql
SELECT
    ic.stay_id,
    CAST(REGEXP_CONTAINS(di.icd_code, r'\b(?:40[1-5]|I1[01235])') AS BOOL) AS is_diabetes_related
FROM
    `physionet-data.mimiciv_3_1_hosp.diagnoses_icd` di
INNER JOIN
    `physionet-data.mimiciv_3_1_icu.icustays` ic
    ON di.subject_id = ic.subject_id
    AND di.hadm_id = ic.hadm_id
```
SELECT
    ic.stay_id,
    CAST(REGEXP_CONTAINS(di.icd_code, r'\b(?:40[1-5]|I1[01235])') AS BOOL) AS is_diabetes_related
FROM
    `physionet-data.mimiciv_3_1_hosp.diagnoses_icd` di
INNER JOIN
    `physionet-data.mimiciv_3_1_icu.icustays` ic
    ON di.subject_id = ic.subject_id
    AND di.hadm_id = ic.hadm_id


In [6]:
# Generate TempoQL query
response = ai_assistant.process_question(question=question)

response['extracted_query']

Plain text
üîç Needs existing query for generate mode: False




Plain text
This TempoQL query extracts a boolean value for each diagnosis, indicating whether its ICD code matches the provided pattern.

```tempoql
{Diagnosis; scope = Diagnosis} contains /\b(?:40[1-5]|I1[01235])\'/
```

**Explanation:**

1.  `{Diagnosis; scope = Diagnosis}`: This part of the query selects all diagnosis events from the "Diagnosis" scope. Each diagnosis event has an associated ICD code.
2.  `contains /\b(?:40[1-5]|I1[01235])\'/`: For each selected diagnosis, this checks if its ICD code contains the pattern specified by the regular expression. The regular expression looks for specific ICD-9 codes (401-405) and ICD-10 codes (I10-I13, I15). The result will be `true` if a match is found, and `false` otherwise.

*Note: The provided regex `\b(?:40[1-5]|I1[01235]')` is typically associated with hypertension codes in ICD-9 and ICD-10, not diabetes. However, this query uses the regex exactly as instructed.*


"{Diagnosis; scope = Diagnosis} contains /\\b(?:40[1-5]|I1[01235])\\'/"

In [None]:
start = time.time()
result = query_engine.query("first ((time({id in (220181, 220052); scope = chartevents}) where #value < (mean {id in (220181, 220052); scope = chartevents} from #now - 8 h to #now)) - #now) from #now to #now + 24 h every 24 h")