# Exploring geocoder.db

This notebook demonstrates how to:
- Connect to the geocoder.db SQLite database
- List available tables
- Preview table data
- Plot geometries if available

---

In [None]:
import os
import sqlite3
import struct
import warnings
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from ipywidgets import Tab
from shapely import wkb, wkt
from shapely.geometry import LineString, shape

warnings.filterwarnings('ignore')

In [None]:
# Year subdirectory (under tiger_data/databases) of the database to open.
db_year = '2021'

# The project root is taken to be the parent of the current working directory,
# i.e. this assumes the notebook is launched from a folder directly below it.
nb_dir = Path.cwd()
project_root = nb_dir.parent

# Alternative location, kept for reference:
#   project_root / 'database' / 'geocoder.db'
db_path = project_root.joinpath('tiger_data', 'databases', db_year, 'geocoder.db')

# Bare expression so the resolved path is shown as the cell output.
db_path

## 2. Connect to geocoder.db SQLite Database

We will connect to the geocoder.db file using sqlite3.

In [None]:
# sqlite3.connect() creates a new, empty database file when the path does not
# exist. That would make every later cell report "no tables" instead of
# pointing at the real problem, so fail fast with a clear error.
if not db_path.is_file():
    raise FileNotFoundError(f'Database file not found: {db_path}')

conn = sqlite3.connect(db_path)
print(f'Connected to {db_path}')

## 3. List Available Tables

Query the SQLite master table to list all tables present in the database.

In [None]:
# sqlite_master has one row per schema object; keep only the tables,
# sorted by name.
tables_query = (
    "SELECT name FROM sqlite_master "
    "WHERE type='table' ORDER BY name;"
)
tables = pd.read_sql_query(sql=tables_query, con=conn)

print('Tables in geocoder.db:')
display(tables)

## 4. Preview Table Data

Display the first five rows of every table using pandas for inspection.

In [None]:
# Create tabbed interface for all tables: one tab per table, each showing the
# first five rows.
if tables.empty:
    print('No tables found.')
else:
    outputs = []
    table_names = list(tables['name'])

    for table_name in table_names:
        # Quote the identifier so table names containing spaces, keywords or
        # quotes do not break (or alter) the generated SQL.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        preview_df = pd.read_sql_query(f'SELECT * FROM {quoted_name} LIMIT 5;', conn)

        # Capture this table's preview in its own Output widget so that it
        # can be placed inside a tab.
        out = widgets.Output()
        with out:
            print(f'Table: {table_name}')
            display(preview_df)
        outputs.append(out)

    tab = Tab(children=outputs)
    for idx, table_name in enumerate(table_names):
        tab.set_title(idx, table_name)
    display(tab)

In [None]:
# List every index recorded in sqlite_master, grouped by the table it belongs
# to. The `sql` column holds the CREATE INDEX statement.
indices_query = (
    "SELECT name, tbl_name, sql FROM sqlite_master "
    "WHERE type='index' ORDER BY tbl_name, name;"
)
indices = pd.read_sql_query(sql=indices_query, con=conn)

index_count = len(indices)
print(f'Indices in geocoder.db ({index_count} total):')
display(indices)

## 5. Plot Geometries from a Selected Table

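The selected table stores its geometries as binary blobs. The next cell decodes each blob as packed little-endian int32 coordinate pairs (scaled by 1e6) rather than as standard WKB, builds shapely `LineString` objects from them, and collects them in a GeoDataFrame. The cell after that draws the result on an interactive Plotly map.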

In [None]:
# Decode the geometry blobs of one table into a GeoDataFrame (`gdf`) that the
# plotting cell below consumes.
geom_table = "edge"
geom_col = "geometry"
limit = 1000


def decode_int32_linestring(blob, scale=1e6):
    """Decode a packed coordinate blob into a shapely LineString.

    The blob is read as consecutive little-endian int32 (x, y) pairs, and each
    value is divided by ``scale``.

    Parameters
    ----------
    blob : bytes-like or None
        Raw value read from the geometry column.
    scale : float, default 1e6
        Divisor applied to every stored integer.

    Returns
    -------
    shapely.geometry.LineString or None
        ``None`` when the value is missing or empty, is not bytes-like, does
        not hold a whole number of (x, y) pairs, or holds fewer than the two
        points a LineString requires.
    """
    if not isinstance(blob, (bytes, bytearray, memoryview)):
        return None
    # One (x, y) pair is two int32 values, i.e. 8 bytes.
    if len(blob) == 0 or len(blob) % 8 != 0:
        return None
    xy_scaled = [(x / scale, y / scale) for x, y in struct.iter_unpack('<2i', blob)]
    if len(xy_scaled) < 2:
        return None
    return LineString(xy_scaled)


tbl_info = pd.read_sql(f"PRAGMA table_info({geom_table});", conn)
display(tbl_info)

geom_df = pd.read_sql_query(f"SELECT * FROM {geom_table} LIMIT {limit};", conn)
# Show only a preview; the full frame can hold up to `limit` rows.
display(geom_df.head())

decoded = (decode_int32_linestring(blob) for blob in geom_df[geom_col])
geoms = [geom for geom in decoded if geom is not None]
skipped = len(geom_df) - len(geoms)

# Always (re)define `gdf`, even when nothing decoded, so that a stale frame
# from an earlier run can never leak into the plotting cell.
# NOTE(review): EPSG:4326 assumes the scaled values are lon/lat degrees - confirm.
gdf = gpd.GeoDataFrame(geometry=geoms, crs="EPSG:4326")

print(f"Decoded {len(geoms)} geometries from '{geom_table}.{geom_col}' "
      f"({skipped} rows skipped as empty or undecodable).")
if not geoms:
    print('No valid geometries found to plot.')

In [None]:
# --- Plotly visualization of geometries (lines and points) with basemap ---

# Mapbox token lookup: the process environment first, then an optional .env file.
mapbox_token = os.getenv('MAPBOX_TOKEN', '')
if not mapbox_token:
    try:
        # python-dotenv is optional, so it is imported lazily here.
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()
        mapbox_token = os.getenv('MAPBOX_TOKEN', '')

# Or set your token directly here (don't commit this!)
# mapbox_token = 'pk.your_token_here'

use_mapbox = bool(mapbox_token)  # Use Mapbox if token is available, otherwise OpenStreetMap

# Only LineString geometries are drawn; None means there is no usable `gdf`.
if 'gdf' in locals() and not gdf.empty:
    gdf_lines = gdf[gdf.geometry.geom_type == 'LineString']
else:
    gdf_lines = None

if gdf_lines is None:
    print("GeoDataFrame 'gdf' not found or empty. Run the previous cell first.")
elif gdf_lines.empty:
    # Without this guard np.vstack([]) below would raise on an empty list.
    print("No LineString geometries found in 'gdf'; nothing to plot.")
else:
    # One (n_points, 2) array per line; columns are treated as (lon, lat).
    vertex_arrays = [np.asarray(geom.coords) for geom in gdf_lines.geometry]

    # Draw all lines as a single trace, using None as a separator between
    # lines. One trace per line would create one legend entry per line and a
    # much heavier figure.
    line_lons, line_lats = [], []
    for arr in vertex_arrays:
        line_lons.extend(arr[:, 0].tolist())
        line_lats.extend(arr[:, 1].tolist())
        line_lons.append(None)
        line_lats.append(None)

    line_trace = go.Scattermapbox(
        lon=line_lons,
        lat=line_lats,
        mode='lines',
        line=dict(width=3, color='rgba(0, 0, 255, 0.8)'),
        hoverinfo='lon+lat',
        name='Lines',
        showlegend=True
    )

    # Unique vertices across all lines (endpoints shared by several lines
    # would otherwise be drawn and counted more than once).
    points_df = (
        pd.DataFrame(np.vstack(vertex_arrays), columns=['lon', 'lat'])
        .drop_duplicates(ignore_index=True)
    )
    point_trace = go.Scattermapbox(
        lon=points_df['lon'],
        lat=points_df['lat'],
        mode='markers',
        marker=dict(size=8, color='red', opacity=0.8),
        hoverinfo='lon+lat',
        name='Vertices',
        showlegend=True
    )

    # Center map on mean of points
    center = dict(lat=points_df['lat'].mean(), lon=points_df['lon'].mean())

    fig = go.Figure([line_trace, point_trace])

    # The two basemaps differ only in style, token and title.
    mapbox_layout = dict(center=center, zoom=12)
    if use_mapbox:
        print("Using Mapbox (token found)")
        mapbox_layout.update(accesstoken=mapbox_token, style='streets')
        basemap_name = 'Mapbox'
    else:
        print("Using OpenStreetMap (no Mapbox token found)")
        print("To use Mapbox, set MAPBOX_TOKEN in .env file or uncomment line above")
        mapbox_layout.update(style='open-street-map')
        basemap_name = 'OpenStreetMap'

    fig.update_layout(
        mapbox=mapbox_layout,
        margin=dict(l=0, r=0, t=40, b=0),
        title=f"Sample Geometries (Plotly + {basemap_name})",
        height=600
    )

    fig.show()
    print(f"Map centered at: lat={center['lat']:.6f}, lon={center['lon']:.6f}")
    print(f"Total lines: {len(gdf_lines)}, Total points: {len(points_df)}")

## 6. Disconnect from the Database

Finally, we will close the database connection.

In [None]:
# Close the SQLite connection opened earlier in the notebook. Cells that query
# `conn` cannot be re-run after this until the connect cell is executed again.
conn.close()