In [None]:
from pathlib import Path
import duckdb

In [None]:
# Resolve the DuckDB file relative to the kernel's working directory.
# NOTE: this assumes the kernel was started in the notebook's own folder, so
# that "<cwd>/../database" is the project's database directory.
notebook_dir = Path.cwd()

# To inspect a different database in the same folder (e.g. "geocoder.duckdb"),
# change the file name below.
db_path = (notebook_dir.parent / "database" / "degauss.duckdb").resolve()

# No connection is opened in this cell: each later cell opens its own
# short-lived connection with `with duckdb.connect(...)`.
# duckdb.connect() creates a new, empty database when the file is missing,
# so flag a wrong path here instead of silently querying an empty database.
if not db_path.exists():
    print(f"WARNING: database file not found: {db_path}")

print(f"Using database at {db_path}")

In [None]:
# List the consolidated tables in the "main" schema, skipping the per-county
# TIGER/Line tables whose names start with "tl_" (e.g. tl_2021_13001_*).
with duckdb.connect(str(db_path)) as conn:
    # starts_with() is used instead of LIKE 'tl_%': in LIKE patterns "_" is a
    # single-character wildcard, so 'tl_%' would also hide any table whose
    # name merely begins with "tl" followed by one more character.
    tables = conn.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'main'
          AND NOT starts_with(table_name, 'tl_') -- excludes per-county tables
        ORDER BY table_name
    """).fetchall()

# Flatten the single-column result rows into a plain list of table names;
# later cells iterate over `tbl_ls`.
tbl_ls = [row[0] for row in tables]

print(f"Available consolidated tables ({len(tbl_ls)}):")
for name in tbl_ls:
    print(f"  - {name}")

In [None]:
# Load a small preview of every consolidated table into pandas DataFrames,
# keyed by table name. Relies on `tbl_ls` from the table-listing cell.
PREVIEW_ROWS = 100  # maximum number of rows fetched per table

dataframes = {}
with duckdb.connect(str(db_path)) as conn:
    for tbl in tbl_ls:
        # Table names cannot be bound as query parameters, so quote the
        # identifier (doubling any embedded double quote) to keep the query
        # valid for names that are keywords or contain special characters.
        quoted_tbl = '"' + tbl.replace('"', '""') + '"'
        df = conn.execute(f"SELECT * FROM {quoted_tbl} LIMIT {PREVIEW_ROWS}").df()
        dataframes[tbl] = df
        print(f"Loaded {len(df)} rows from {tbl}")

In [None]:
# Show every previewed table in its own tab of a single widget
from ipywidgets import Output, Tab
from IPython.display import display

# Build one Output panel per table; anything printed or displayed inside the
# `with panel:` block is captured by that panel instead of the cell output.
outputs = []
for name, frame in dataframes.items():
    panel = Output()
    with panel:
        n_rows, n_cols = frame.shape
        print(f"Table: {name} ({n_rows} rows, {n_cols} columns)")
        display(frame)
    outputs.append(panel)

# Assemble the panels into tabs; titles follow the dict's insertion order,
# which is the same order the panels were created in.
tab = Tab(children=outputs)
for idx, name in enumerate(dataframes):
    tab.set_title(idx, name)

display(tab)

In [None]:
## Convert to DeGAUSS Format

Convert the TIGER/Line tables to the DeGAUSS geocoding format, with an optimized schema and pre-computed metaphone codes.

## Verify DeGAUSS Database

In [None]:
# Open the DeGAUSS database and sanity-check its contents: list every table
# in the "main" schema with its row count, then show a few sample rows from
# the edge, feature, feature_edge and range tables.
degauss_db = str(db_path.parent / "degauss.duckdb")

with duckdb.connect(degauss_db) as conn:
    # The ST_* functions used below come from DuckDB's spatial extension.
    conn.execute("INSTALL spatial;")
    conn.execute("LOAD spatial;")

    # Row count for every table in the main schema
    tables = conn.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
    ).fetchall()
    print("DeGAUSS Database Tables:")
    for (table_name,) in tables:
        # Table names cannot be bound as parameters; quote the identifier
        # (doubling embedded quotes) so any valid table name works here.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        count = conn.execute(f"SELECT COUNT(*) FROM {quoted_name}").fetchone()[0]
        print(f"  - {table_name}: {count:,} rows")

    # Convert the geometry column to text so it is human-readable.
    # The length is aliased "length_degrees" — presumably the coordinates are
    # unprojected lon/lat; confirm against the data's coordinate system.
    print("\nSample edge record (with readable geometry):")
    edge_sample = conn.execute("""
        SELECT 
            tlid, 
            ST_AsText(geometry) as geom_wkt,
            ST_GeometryType(geometry) as geom_type,
            ST_Length(geometry) as length_degrees
        FROM edge 
        LIMIT 3
    """).df()
    display(edge_sample)

    print("\nSample feature record:")
    feature_sample = conn.execute("SELECT fid, street, street_phone, paflag, zip FROM feature LIMIT 5").df()
    display(feature_sample)

    print("\nSample feature_edge linkage:")
    feature_edge_sample = conn.execute("SELECT fid, tlid FROM feature_edge LIMIT 5").df()
    display(feature_edge_sample)

    print("\nSample range record:")
    range_sample = conn.execute("SELECT tlid, fromhn, tohn, zip, side FROM range LIMIT 5").df()
    display(range_sample)


### Explanation of Geometry Format

The geometry column is correctly stored as **WKB (Well-Known Binary)** format. When Python displays the raw bytes, it shows them as a list of integers like `[1, 4, 0, 0, 0, ...]`, but the actual storage is proper binary format.

To view geometries in a readable format, always use DuckDB spatial functions:
- `ST_AsText(geometry)` - Converts to WKT (Well-Known Text) format
- `ST_AsGeoJSON(geometry)` - Converts to GeoJSON format  
- `ST_GeometryType(geometry)` - Returns the geometry type (e.g., "LINESTRING")
- `ST_Length(geometry)` - Returns the length of line geometries, in the units of the geometry's coordinates (degrees for unprojected longitude/latitude data)

The WKB format is the standard for storing spatial data efficiently in databases.