In [1]:
from tempo_ql import GenericDataset, formats, QueryEngine, FileVariableStore
import duckdb
import numpy as np
import os
from pathlib import Path
import pandas as pd
import time

In [None]:
# --- Notebook configuration: edit these values before running the cells below ---

# Gemini API key for the optional LLM-assisted authoring workflow; leave as None to skip it.
# It is assigned exactly once in this cell so that a key entered here is not overwritten
# further down. Avoid committing a real key with the notebook.
gemini_api_key = None

# GCP project in which to run queries - make sure it has access to MIMIC-IV through physionet.org
project_id = "ai-clinician"
# Name of a dataset within your project to store temporary results.
# Required if you plan to subset the data to run queries.
scratch_dataset = None
# Directory to store temporary variables
variable_store_dir = "mimiciv_data"

In [None]:
# Initialize query engine and variable store

# The dataset is addressed through a BigQuery URL built from project_id. A scratch schema
# name is only passed when scratch_dataset was configured; otherwise None is passed.
dataset = GenericDataset(
    f'bigquery://{project_id}',
    formats.mimiciv(),
    scratch_schema_name=f'{project_id}.{scratch_dataset}' if scratch_dataset is not None else None,
)

# makedirs(exist_ok=True) creates any missing parent directories and does not fail when
# the directory already exists, so this cell can be re-run safely.
os.makedirs(variable_store_dir, exist_ok=True)
var_store = FileVariableStore(variable_store_dir)
query_engine = QueryEngine(dataset, variable_stores=[var_store])


In [24]:
from tempo_ql.ai_assistant import AIAssistant

# Build the assistant from the API key set in the configuration cell and the query
# engine created above.
ai_assistant = AIAssistant(
    api_key=gemini_api_key,
    query_engine=query_engine,
)

In [None]:
# Natural-language question passed to the assistant.
question = "Find all patients with diabetes"

# Ask the assistant for a SQL answer, then pull the query text out of its response.
# .get() yields None when the response has no 'extracted_query' entry.
sql_response = ai_assistant.process_sql_question(question=question)
sql_query = sql_response.get('extracted_query')

In [None]:
import pandas_gbq

# NOTE(review): sql_query is assistant-generated text that is executed against BigQuery
# as-is; inspect it before running this cell.
if not sql_query:
    # The query was read from the assistant's response with .get(), so it can be None.
    raise ValueError("The assistant did not return a SQL query for the question; nothing to run.")

# Run the query in the configured GCP project and load the result into a DataFrame.
df = pandas_gbq.read_gbq(sql_query, project_id=project_id)
# Flatten every cell of the result table into one sorted list of values.
sql_results = sorted(df.to_numpy().flatten())

In [None]:
# Ask the assistant to answer the same question, this time as a TempoQL query.
# The returned value is kept in `response` for inspection.
response = ai_assistant.process_question(
    question=question,
)

In [None]:
# Run a TempoQL query through the engine and flatten its table into one sorted list.
# NOTE(review): the query string below is hard-coded; it does not use `question` or the
# assistant's `response` - confirm this is the query meant to be compared with the SQL results.
tempoql_query = "{name contains /Temperature/; scope = 'Measurement'}"
result = query_engine.query(tempoql_query)
flat_values = result.df.values.flatten()
TempoQL_results = sorted(flat_values)
TempoQL_results

Searching vocabulary concept for id None and name <contains re.compile('Temperature')>


In [None]:
# Compare SQL vs TempoQL results
import matplotlib.pyplot as plt


def _summary_line(label, values):
    """Format the min / max / mean line for one result list.

    Args:
        label: Prefix printed before the statistics (e.g. "SQL").
        values: Sequence of values supporting min(), max() and sum().

    Returns:
        The formatted statistics line, or a "no results" line when values is empty
        (min()/max() raise and the mean divides by zero on an empty sequence).
    """
    if len(values) == 0:
        return f"{label} - no results"
    mean = sum(values) / len(values)
    return f"{label} - Min: {min(values)}, Max: {max(values)}, Mean: {mean:.2f}"


# 1. Compare lengths
print("\n📊 LENGTH COMPARISON:")
print(f"SQL Results Length: {len(sql_results)}")
print(f"TempoQL Results Length: {len(TempoQL_results)}")
print(f"Difference: {abs(len(sql_results) - len(TempoQL_results))}")

# 2. Basic statistics
print("\n📈 BASIC STATISTICS:")
print(_summary_line("SQL", sql_results))
print(_summary_line("TempoQL", TempoQL_results))

# 3. Plot distributions (skipped when either side is empty: there is nothing to compare)
if len(sql_results) > 0 and len(TempoQL_results) > 0:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: overlaid, density-normalised histograms of both result sets.
    ax_hist.hist(sql_results, bins=30, alpha=0.7, label='SQL', color='blue', density=True)
    ax_hist.hist(TempoQL_results, bins=30, alpha=0.7, label='TempoQL', color='red', density=True)
    ax_hist.set_title('Value Distribution Comparison')
    ax_hist.set_xlabel('Values')
    ax_hist.set_ylabel('Density')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Right panel: side-by-side box plots. Tick labels are set on the axes instead of
    # through boxplot's `labels=` argument, which newer Matplotlib releases deprecate.
    ax_box.boxplot([sql_results, TempoQL_results])
    ax_box.set_xticklabels(['SQL', 'TempoQL'])
    ax_box.set_title('Box Plot Comparison')
    ax_box.set_ylabel('Values')
    ax_box.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()
else:
    print("Skipping plots: at least one result set is empty.")

print("\n✅ Comparison complete!")
