In [None]:
import os
import pathlib
from dotenv import load_dotenv
load_dotenv()

# Install dependencies
# Installs the databricks-notebook package with the %pip line magic.
# LOCAL_DATABRICKS_NOTEBOOK_PATH is read from the process environment
# (load_dotenv() is called earlier in this cell, presumably so a local .env
# file can supply it -- confirm). When the variable is set and the path exists,
# that local checkout is installed in editable mode; otherwise the package is
# installed from its GitHub repository.
LOCAL_DATABRICKS_NOTEBOOK_PATH = os.getenv('LOCAL_DATABRICKS_NOTEBOOK_PATH')
if LOCAL_DATABRICKS_NOTEBOOK_PATH and pathlib.Path(LOCAL_DATABRICKS_NOTEBOOK_PATH).exists():
    print(f"Installing databricks-notebook from {LOCAL_DATABRICKS_NOTEBOOK_PATH}")
    # The {LOCAL_DATABRICKS_NOTEBOOK_PATH} placeholder is filled in from the
    # Python variable of the same name before the magic runs.
    %pip install --editable "{LOCAL_DATABRICKS_NOTEBOOK_PATH}"
else:
    print("Installing databricks-notebook from git")
    # NOTE(review): installs whatever is on the repository's default branch;
    # no version, tag, or commit is pinned.
    %pip install git+https://github.com/datafold/databricks-notebook.git

# Restart to make dependencies available
# %restart_python on databricks notebook
# Outside Databricks the restart is requested through the running IPython
# application's kernel; the True argument asks for a restart rather than a
# plain shutdown (the recorded cell output reports 'restart': True).
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)


Installing databricks-notebook from /Users/sergeyklinov/databricks-notebook
Obtaining file:///Users/sergeyklinov/databricks-notebook
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: databricks-notebook
  Building editable for databricks-notebook (pyproject.toml) ... [?25ldone
[?25h  Created wheel for databricks-notebook: filename=databricks_notebook-0.1.0-0.editable-py3-none-any.whl size=2939 sha256=c97e1f72ab6a40e112e24b1a4740613802105aa28812cd18245f7ac8743c476d
  Stored in directory: /private/var/folders/3y/p4yqdnw167xfr84r44_t60lh0000gn/T/pip-ephem-wheel-cache-ji9la71b/wheels/d6/fe/61/e1ee441d5c3d6bacd1e078d8335cf494301163e0f54e0a9d49
Successfully built databricks-notebook
Installing collected packages: databricks-notebook
  Attempting uninstal

{'status': 'ok', 'restart': True}

: 

In [1]:
# Connection settings handed to translate_queries_and_get_results further down.
host = "https://sergey.st.datafold.io"
org_token = "my_secret_token"  # do not change

# No identity details are attached by default; the optional get_context_info
# block below is the only place that would replace this None.
identity = None

# We collect basic identity information to help track and resolve any issues
# with SQL translation and provide you with the best experience. This data is
# used internally by Datafold only and helps us:
# - Diagnose translation errors specific to your workspace configuration
# - Improve translation quality based on real usage patterns
# - Provide better support when you need assistance
#
# If you prefer not to share certain information, you can comment out specific
# fields below or remove this entire code block. The tool will still work, but
# we may have limited ability to help troubleshoot issues.

# def get_context_info():
#     context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
#     return {
#         'workspace_id': context.workspaceId().get(),
#         'workspace_url': context.browserHostName().get(),
#         'cluster_id': context.clusterId().get(),
#         'notebook_path': context.notebookPath().get(),
#         'user': context.userName().get()
#     }

# identity = get_context_info()

In [2]:
import csv
import os
from pathlib import Path
from databricks_notebook import translate_queries_and_get_results

# Run configuration
CHUNK_SIZE = 5        # number of queries submitted per translation request
TEST_MODE = True      # True: stop after MAX_CHUNKS_TEST chunks; False: process every query
MAX_CHUNKS_TEST = 2   # chunk cap that only applies while TEST_MODE is on

# The input queries and the generated report both live under
# ~/dma/dma-pearson-assessment.
input_csv_path = Path.home().joinpath("dma", "dma-pearson-assessment", "included_queries.csv")
output_csv_path = Path.home().joinpath("dma", "dma-pearson-assessment", "report.csv")

print(f"Reading queries from: {input_csv_path}")

# Load every query from the input CSV as {'query_hash', 'query_text'} records.
# The file must have 'QueryHash' and 'QueryText' columns; a missing column
# raises KeyError on the first row.
#
# newline='' is what the csv module asks for when a file object is handed to a
# reader: without it, line breaks embedded inside quoted fields (multi-line SQL
# text) are not interpreted correctly.
with open(input_csv_path, 'r', newline='', encoding='utf-8') as f:
    queries_data = [
        {'query_hash': row['QueryHash'], 'query_text': row['QueryText']}
        for row in csv.DictReader(f)
    ]

total_queries = len(queries_data)
print(f"Found {total_queries} queries to translate")
print(f"Processing in chunks of {CHUNK_SIZE}")

if TEST_MODE:
    # Upper bound only: the input may contain fewer queries than the cap allows.
    print(f"\n⚠️  TEST MODE ENABLED - Processing only first {MAX_CHUNKS_TEST} chunks ({MAX_CHUNKS_TEST * CHUNK_SIZE} queries max)")
    print("   Set TEST_MODE = False to process all queries\n")

# Start the report from scratch: opening in 'w' mode discards any previous
# report at this path, after which only the header row is written.
with output_csv_path.open(mode='w', newline='', encoding='utf-8') as f:
    fieldnames = [
        'query_hash',
        'asset_name',
        'original_query',
        'translation_status',
        'translation',
    ]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

print(f"Output file initialized: {output_csv_path}\n")

# Process queries in chunks.
# Each chunk is translated with one call to translate_queries_and_get_results,
# turned into report rows, and appended to the report CSV right away, so rows
# from completed chunks are already on disk if a later chunk fails.
success_count = 0
failed_count = 0
other_count = 0
chunks_processed = 0

# Column order of the report rows; built once rather than on every iteration.
fieldnames = ['query_hash', 'asset_name', 'original_query', 'translation_status', 'translation']

for chunk_start in range(0, total_queries, CHUNK_SIZE):
    # Stop after MAX_CHUNKS_TEST chunks if in test mode
    if TEST_MODE and chunks_processed >= MAX_CHUNKS_TEST:
        print(f"\n⚠️  TEST MODE: Stopping after {chunks_processed} chunks")
        break

    chunk_end = min(chunk_start + CHUNK_SIZE, total_queries)
    chunk_queries_data = queries_data[chunk_start:chunk_end]

    print(f"=== Processing queries {chunk_start + 1} to {chunk_end} of {total_queries} ===")

    # Extract query texts for this chunk
    queries_to_translate = [q['query_text'] for q in chunk_queries_data]

    # Translate this chunk
    print(f"Translating {len(queries_to_translate)} queries...")
    translation_results = translate_queries_and_get_results(
        queries_to_translate,
        org_token,
        identity,
        host
    )

    print("Translation completed for this chunk!")

    # `or []` also covers the key being present with a None value, which would
    # otherwise break the len() calls below.
    translated_models = translation_results.get('translated_models') or []

    # Results are paired with their source queries purely by position, so a
    # count mismatch means some rows may be paired with the wrong translation
    # or reported as not translated. Make that visible instead of absorbing it.
    if len(translated_models) != len(chunk_queries_data):
        print(
            f"⚠️  Expected {len(chunk_queries_data)} translated models but received "
            f"{len(translated_models)}; report rows are matched by position"
        )

    # Prepare report data for this chunk
    chunk_report_rows = []
    for i, query_data in enumerate(chunk_queries_data):
        if i < len(translated_models):
            model = translated_models[i]
            status = model.get('translation_status', '')

            report_row = {
                'query_hash': query_data['query_hash'],
                'asset_name': model.get('asset_name', ''),
                'original_query': query_data['query_text'],
                'translation_status': status,
                'translation': model.get('target_sql', '')
            }

            # Exactly one counter is incremented per query.
            if status == 'success':
                success_count += 1
            elif status == 'failed':
                failed_count += 1
            else:
                other_count += 1
        else:
            # No result at this position: keep the query in the report with a
            # placeholder status so nothing from the input is dropped.
            report_row = {
                'query_hash': query_data['query_hash'],
                'asset_name': '',
                'original_query': query_data['query_text'],
                'translation_status': 'not_translated',
                'translation': ''
            }
            other_count += 1

        chunk_report_rows.append(report_row)

    # Append this chunk's results to the CSV file
    with open(output_csv_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerows(chunk_report_rows)

    chunks_processed += 1
    print(f"✓ Chunk results written to report ({chunk_end}/{total_queries} queries processed)\n")

# Final summary.
# Every processed query increments exactly one of the three status counters in
# the processing loop, so their sum is the real number of queries handled.
# chunks_processed * CHUNK_SIZE over-counts whenever the last chunk is partial.
queries_processed = success_count + failed_count + other_count

# The run only counts as a truncated test run when test mode actually left
# queries untouched; a cap that happened to cover every query is a full run.
stopped_early = TEST_MODE and queries_processed < total_queries

print("=" * 50)
if stopped_early:
    print(f"✓ Test run completed! Processed {chunks_processed} chunks ({queries_processed} queries)")
    print(f"   To process all queries, set TEST_MODE = False")
else:
    print("✓ All translations completed!")
print(f"✓ Report file: {output_csv_path}")
print("\n=== Translation Summary ===")
print(f"Queries processed: {queries_processed}")
print(f"Successfully translated: {success_count}")
print(f"Failed: {failed_count}")
print(f"Other: {other_count}")

Reading queries from: /Users/sergeyklinov/dma/dma-pearson-assessment/included_queries.csv
Found 4285 queries to translate
Processing in chunks of 5

⚠️  TEST MODE ENABLED - Processing only first 2 chunks (10 queries max)
   Set TEST_MODE = False to process all queries

Output file initialized: /Users/sergeyklinov/dma/dma-pearson-assessment/report.csv

=== Processing queries 1 to 5 of 4285 ===
Translating 5 queries...
✓ Organization created with id 13
✓ Translation Project created with id 10014.
✓ Uploaded queries to translate.
✓ Started translation with id 1cd216cc-a405-404b-8d7f-cdf972d14474
✓ Translation completed with status: done
Translation completed for this chunk!
✓ Chunk results written to report (5/4285 queries processed)

=== Processing queries 6 to 10 of 4285 ===
Translating 5 queries...
✓ Translation Project created with id 10015.
✓ Uploaded queries to translate.
✓ Started translation with id 474f044b-ef4c-47f0-94a4-eb2469dcbc60
✓ Translation completed with status: done
Tra