In [None]:
from chdb import session
import time
import os
import shutil

# Open a chdb session backed by a directory under the current working directory.
print("Connecting to chdb session...")
work_dir = os.getcwd()
session_dir = os.path.join(work_dir, "chdb_test_data_insertion_ipynb")
os.makedirs(session_dir, exist_ok=True)
chs = session.Session(session_dir)

# Write the CSV fixture: one header line followed by 10,000 data rows.
# Each row is `movieId,"[v0,...,v9]"` where vj = float(movieId + j * 0.1);
# the list is quoted because it contains commas.
csv_path = os.path.join(work_dir, "chdb_test_data_insertion_embedding.csv")
with open(csv_path, 'w') as csv_file:
    csv_file.write("movieId,embedding\n")
    for movie_id in range(1, 10001):
        components = ','.join(str(float(movie_id + step * 0.1)) for step in range(10))
        csv_file.write(f'{movie_id},"[{components}]"\n')

# Setup database and table: the statements run in order, so the table is
# always dropped and recreated before the insertion tests start.
print("\n=== Setup Phase ===")
setup_statements = (
    "CREATE DATABASE IF NOT EXISTS test ENGINE = Atomic",
    "USE test",
    'DROP TABLE IF EXISTS embeddings',
    """CREATE TABLE embeddings (
      movieId UInt32 NOT NULL,
      embedding Array(Float32) NOT NULL
  ) ENGINE = MergeTree()
  ORDER BY movieId""",
)
for statement in setup_statements:
    chs.query(statement)

# Test 1: INFILE insertion (10k rows)
# Times a single bulk INSERT ... FROM INFILE of the generated CSV file, then
# verifies the row count and prints a small sample.
print("\n=== Test 1: INFILE Insertion (10k rows) ===")
start_time = time.time()
try:
    # NOTE(review): the CSV file begins with a header line while the format is
    # plain CSV; this presumably relies on the engine's CSV header handling --
    # confirm, or switch to an explicit header-aware format.
    result = chs.query(f"INSERT INTO embeddings FROM INFILE '{csv_path}' FORMAT CSV")
except Exception as e:
    print(f"✗ INFILE insertion failed: {e}")
    infile_time = 0  # 0 marks "no timing available" for the performance summary
else:
    infile_time = time.time() - start_time
    print(f"✓ INFILE insertion successful! Time: {infile_time:.3f}s")

    # Verification runs in its own try-block so that a failing SELECT is not
    # reported as an insertion failure and does not discard the measured time.
    try:
        count = chs.query('SELECT COUNT(*) as count FROM embeddings')
        print(f"Records inserted via INFILE: {count}")

        # Compare the stripped text form of the result: the raw query result
        # never equals the bare string '0', so comparing it directly would
        # always take this branch, even for an empty table.
        if str(count).strip() != '0':
            print("Sample data from INFILE:")
            sample = chs.query('SELECT movieId, embedding FROM embeddings ORDER BY movieId LIMIT 3')
            print(sample)
    except Exception as e:
        print(f"✗ INFILE verification failed: {e}")

# Test 2: Regular insertion (10 additional rows)
# Times ten single-row INSERT ... VALUES statements.
print("\n=== Test 2: Regular VALUES Insertion (10 rows) ===")
start_time = time.time()
try:
    # movieIds 20001..20010 keep these rows outside the 1..10000 range used
    # by the CSV data, so both sources can be counted separately later.
    first_id, row_total = 20001, 10
    for movie_id in range(first_id, first_id + row_total):
        components = ','.join(str(float(movie_id + step * 0.1)) for step in range(10))
        array_literal = f'[{components}]'
        chs.query(f"INSERT INTO embeddings VALUES ({movie_id}, {array_literal})")

    values_time = time.time() - start_time
    print(f"✓ VALUES insertion successful! Time: {values_time:.3f}s")

except Exception as e:
    print(f"✗ VALUES insertion failed: {e}")
    values_time = 0  # 0 marks "no timing available" for the performance summary

# Test 3: Verify total count
# Reports the overall row count, the count per insertion path (told apart by
# movieId range), and two sample rows from each path.
print("\n=== Test 3: Count Verification ===")
try:
    total_count = chs.query('SELECT COUNT(*) as total FROM embeddings')
    print(f"Total records in embeddings table: {total_count}")

    # Both range counts are queried (in this order) before either is printed.
    infile_count, values_count = (
        chs.query(count_sql)
        for count_sql in (
            'SELECT COUNT(*) as infile_count FROM embeddings WHERE movieId <= 10000',
            'SELECT COUNT(*) as values_count FROM embeddings WHERE movieId >= 20001',
        )
    )

    print(f"Records from INFILE (movieId <= 10000): {infile_count}")
    print(f"Records from VALUES (movieId >= 20001): {values_count}")

    # Sample from both ranges: print a heading, then the first two rows.
    sample_sections = (
        ("\nSample from INFILE data:", "<= 10000"),
        ("Sample from VALUES data:", ">= 20001"),
    )
    for heading, id_filter in sample_sections:
        print(heading)
        print(chs.query(f'SELECT movieId, embedding FROM embeddings WHERE movieId {id_filter} ORDER BY movieId LIMIT 2'))

except Exception as e:
    print(f"Count verification error: {e}")

# Test 4: Direct CSV engine reading
# Queries the generated CSV file without going through the embeddings table.
print("\n=== Test 4: CSV Engine Direct Reading ===")
try:
    print("Reading generated CSV file directly using CSV engine:")

    # Every query below reads the file through the same file() table function
    # with an explicit schema; the embedding column is read as a plain String.
    csv_source = f"file('{csv_path}', 'CSV', 'movieId UInt32, embedding String')"
    ordered_rows = f"SELECT movieId, embedding FROM {csv_source} ORDER BY movieId"

    # Row count of the file.
    csv_count1 = chs.query(f"SELECT COUNT(*) as csv_count FROM {csv_source}")
    print(f"CSV file rows (via file() function): {csv_count1}")

    # First three rows by movieId.
    print("Sample rows from CSV file:")
    csv_sample = chs.query(f"{ordered_rows} LIMIT 3")
    print(csv_sample)

    # Last three rows by movieId (descending order).
    print("Last few rows from CSV file:")
    csv_tail = chs.query(f"{ordered_rows} DESC LIMIT 3")
    print(csv_tail)

except Exception as e:
    print(f"CSV engine reading error: {e}")

# Cleanup
# Close the session, then remove the CSV file and the session directory.
# Each step is attempted independently so that one failing step (for example
# closing the session) does not leave the remaining temporary files on disk.
print("\n=== Cleanup ===")
cleanup_steps = (
    lambda: chs.close(),
    lambda: os.unlink(csv_path),
    lambda: shutil.rmtree(session_dir, ignore_errors=True),
)
cleanup_failed = False
for cleanup_step in cleanup_steps:
    try:
        cleanup_step()
    except Exception as e:
        # Best effort: report the problem and continue with the next step.
        cleanup_failed = True
        print(f"Warning: Could not clean up temporary files: {e}")
if not cleanup_failed:
    print("Temporary files cleaned up")

# Performance summary. A time of 0 means the corresponding test failed, so its
# line (and the comparison, which needs both timings) is omitted.
print("\n=== Performance Summary ===")
infile_timed = infile_time > 0
values_timed = values_time > 0
if infile_timed:
    print(f"INFILE insertion (10k rows): {infile_time:.3f}s")
if values_timed:
    print(f"VALUES insertion (10 rows): {values_time:.3f}s")
if infile_timed and values_timed:
    # Per-row time ratio: (values_time / 10 rows) / (infile_time / 10000 rows).
    speedup = values_time / infile_time * 1000
    print(f"INFILE is {speedup:.1f}x faster per 1000 rows")