# 🌿 Graves Greenery — Colab SQL Template (DuckDB + `%%sql` + GitHub CSVs)

A serverless SQL sandbox for the Graves Greenery project.

- Loads CSVs from your GitHub repo (public)
- Creates tables in a persistent DuckDB database
- Enables `%%sql` magic for pretty outputs
- Optional `%%mysql` magic to use MySQL syntax via SQLGlot

## 1) Install packages

In [ ]:
!pip -q install duckdb ipython-sql pandas duckdb-engine sqlalchemy sqlglot

## 2) Imports & load SQL magic

In [ ]:
import os, re, glob, subprocess, textwrap, pandas as pd
from pathlib import Path

# Load the ipython-sql extension, which provides the %sql / %%sql magics used in later cells
%load_ext sql

## 3) Configuration — repo & DB paths (pre-filled)

In [ ]:
# === EDIT ONLY IF NEEDED ===
# GitHub account, repository, and branch that the CSV files are cloned from.
GITHUB_USER   = "danielgraves"
GITHUB_REPO   = "Graves_Greenery_Analysis"
GITHUB_BRANCH = "main"
# Glob pattern, joined onto REPO_DIR, that selects which CSV files get loaded.
CSV_GLOB      = "data/**/*.csv"   # recursive search under /data/
# DuckDB database file; both the %sql connection and the SQLAlchemy engine point at it.
DB_PATH       = "/content/graves_greenery.duckdb"
# Local directory the repository is cloned into.
REPO_DIR      = f"/content/{GITHUB_REPO}"
# When True, each table name is prefixed with the name of the CSV's parent folder.
INCLUDE_PARENT_IN_TABLE = False

## 4) Start DuckDB (file-backed) and connect `%%sql`

In [ ]:
# Make sure the folder that will hold the database file exists
Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
# Connect the %sql magic to the DuckDB file named by the DB_PATH variable
%sql duckdb:///{DB_PATH}
print('Connected to DuckDB at', DB_PATH)

## 5) Clone (or Pull) your GitHub repo (public)

In [ ]:
def clone_or_pull_repo(user, repo, branch, dest):
    """Ensure a local checkout of a GitHub repository exists and is up to date.

    If ``dest`` already exists, runs ``git pull --ff-only`` inside it.
    Otherwise makes a shallow (``--depth 1``) clone of ``branch`` from
    ``https://github.com/{user}/{repo}.git`` into ``dest``.

    Args:
        user: GitHub account that owns the repository.
        repo: Repository name.
        branch: Branch to clone (only used when cloning).
        dest: Local directory of the checkout.

    Raises:
        subprocess.CalledProcessError: If the git command exits non-zero.
    """
    dest = os.fspath(dest)
    # git is invoked with argument lists (no shell), so a path or branch name
    # containing spaces or shell metacharacters is passed through verbatim.
    if os.path.exists(dest):
        print(f'Repo exists at {dest}. Pulling latest...')
        subprocess.run(['git', '-C', dest, 'pull', '--ff-only'], check=True)
        return
    cmd = [
        'git', 'clone', '--depth', '1', '--branch', str(branch),
        f'https://github.com/{user}/{repo}.git', dest,
    ]
    print(' '.join(cmd))
    subprocess.run(cmd, check=True)

# Clone the configured repository into REPO_DIR, or pull if REPO_DIR already exists
clone_or_pull_repo(GITHUB_USER, GITHUB_REPO, GITHUB_BRANCH, REPO_DIR)
print('Repo ready at:', REPO_DIR)

## 6) Load CSVs into DuckDB tables (Option A — SQLAlchemy engine)

In [ ]:
from sqlalchemy import create_engine
# SQLAlchemy engine on the same DuckDB file the %sql magic is connected to;
# load_csvs_as_tables() uses it to create the tables.
engine = create_engine(f'duckdb:///{DB_PATH}')

def slugify_table_name(path, include_parent=False):
    """Derive a SQL-friendly table name from a file path.

    The file stem is lowercased, every run of characters outside
    ``[a-z0-9_]`` becomes a single ``_``, and leading/trailing underscores
    are trimmed. With ``include_parent=True`` the same treatment is applied
    to the parent folder name, which is prepended as ``{parent}_{stem}``.

    Args:
        path: Path of the source file (``str`` or ``Path``).
        include_parent: Prefix the name with the parent folder name when the
            path has one.

    Returns:
        The table name. A name that would start with a digit gets a ``t_``
        prefix, and a name that would be empty (the stem has no ASCII letters
        or digits) becomes ``"unnamed"`` so it is always a usable identifier.
    """
    def _slug(text):
        return re.sub(r'[^a-z0-9_]+', '_', text.lower()).strip('_')

    path = Path(path)
    name = _slug(path.stem)
    # parent == parent.parent only for "." or a filesystem root, i.e. when
    # there is no real folder name to use as a prefix.
    if include_parent and path.parent != path.parent.parent:
        name = f"{_slug(path.parent.name)}_{name}"
    if not name:
        # e.g. "!!!.csv" or a fully non-ASCII stem: avoid an empty identifier
        name = 'unnamed'
    if re.match(r'^\d', name):
        name = 't_' + name
    return name

def load_csvs_as_tables(repo_dir, csv_glob, include_parent=False):
    """Create (or replace) one DuckDB table per CSV file matched under a directory.

    Each file matching ``csv_glob`` under ``repo_dir`` is read with DuckDB's
    ``read_csv_auto`` (called with ``header=True, sample_size=-1,
    ignore_errors=True``) and stored in a table named by
    ``slugify_table_name``. All tables are created inside a single
    transaction on the module-level ``engine``.

    Args:
        repo_dir: Directory the glob pattern is resolved against.
        csv_glob: Glob pattern (``**`` is recursive) selecting the CSV files.
        include_parent: Forwarded to ``slugify_table_name``.

    Returns:
        A list of ``(table_name, csv_path)`` tuples, one per file loaded,
        in sorted path order.
    """
    # glob order depends on the filesystem; sorting makes reloads reproducible
    # and fixes which file wins when two CSVs map to the same table name.
    csvs = sorted(glob.glob(os.path.join(repo_dir, csv_glob), recursive=True))
    loaded = []
    source_by_table = {}
    with engine.begin() as conn:
        for f in csvs:
            tbl = slugify_table_name(f, include_parent=include_parent)
            if tbl in source_by_table:
                # CREATE OR REPLACE would otherwise overwrite the earlier file silently
                print(f'Warning: table "{tbl}" loaded from {source_by_table[tbl]} is being replaced by {f}')
            source_by_table[tbl] = f
            conn.exec_driver_sql(
                f"""
                CREATE OR REPLACE TABLE "{tbl}" AS
                SELECT * FROM read_csv_auto(?, header=True, sample_size=-1, ignore_errors=True);
                """,
                (f,)
            )
            loaded.append((tbl, f))
    return loaded

# Load every CSV matched by CSV_GLOB under REPO_DIR into DuckDB tables
loaded = load_csvs_as_tables(REPO_DIR, CSV_GLOB, INCLUDE_PARENT_IN_TABLE)
print(f'Loaded {len(loaded)} CSVs into DuckDB tables.')
# Show the first ten (table_name, csv_path) pairs as the cell output
loaded[:10]

## 7) Verify: list available tables

In [ ]:
%%sql
SELECT table_name
FROM duckdb_tables()
-- skip rows flagged as internal, then list the remaining tables alphabetically
WHERE NOT internal
ORDER BY table_name;

## 8) Test query

In [ ]:
%%sql
SELECT *
FROM dim_customers
-- preview only the first five rows of the table
LIMIT 5;

## 9) (Optional) `%%mysql` magic — write MySQL syntax via SQLGlot

In [ ]:
from sqlglot import transpile
from IPython.core.magic import register_cell_magic

@register_cell_magic
def mysql(line, cell):
    """Cell magic: translate MySQL-dialect SQL to DuckDB SQL and run it via ``%%sql``.

    The cell body is transpiled with SQLGlot (``read='mysql'``,
    ``write='duckdb'``), the translated SQL is printed, and the result of
    running it through the ``sql`` cell magic is returned. Cells containing
    several statements are supported: the translated statements are joined
    with ``;`` and handed to ``%%sql`` as one script.

    Args:
        line: Text on the ``%%mysql`` line itself (unused).
        cell: Cell body containing MySQL-syntax SQL.

    Returns:
        Whatever the ``sql`` cell magic returns for the translated SQL.
    """
    # transpile() returns a list with one string per statement; unpacking it
    # into a single name would fail for multi-statement cells.
    statements = [stmt for stmt in transpile(cell, read='mysql', write='duckdb') if stmt]
    duckdb_sql = ';\n'.join(statements)
    # NOTE(review): these plain-text replacements also hit matching text inside
    # string literals; confirm whether SQLGlot already rewrites both functions.
    duckdb_sql = duckdb_sql.replace('IFNULL', 'COALESCE').replace('NOW()', 'CURRENT_TIMESTAMP')
    print('Translated to DuckDB SQL:\n', duckdb_sql, '\n', flush=True)
    return get_ipython().run_cell_magic('sql', '', duckdb_sql)

print('Custom %%mysql magic is ready.')

## 10) Refresh data (git pull + reload CSVs)

In [ ]:
# Fast-forward the local clone, then rebuild every table from the refreshed CSVs.
# git gets an argument list (no shell), so REPO_DIR is passed verbatim even if
# it contains spaces or shell metacharacters.
subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=True)
loaded = load_csvs_as_tables(REPO_DIR, CSV_GLOB, INCLUDE_PARENT_IN_TABLE)
print(f'Reloaded {len(loaded)} CSVs into DuckDB tables.')

## 11) Snapshot / export

In [ ]:
# The database file itself is the snapshot; report where it lives.
print('DuckDB snapshot at:', DB_PATH)

# Write a ten-row sample of dim_customers to a CSV file through the sql magic.
export_path = '/content/sample_export.csv'
copy_statement = textwrap.dedent(f"""
COPY (SELECT * FROM dim_customers LIMIT 10)
TO '{export_path}' WITH (HEADER, DELIMITER ',');
""")
_ = get_ipython().run_cell_magic('sql', '', copy_statement)
print('Exported CSV:', export_path)