# Graves' Greenery — Setup
---
## SQL via DuckDB (ipython-sql + PrettyTable)
A single file-backed DuckDB connection is used (no database server). Every CSV in the repo's `data/` folder is exposed as a view (or loaded as a table when `LOAD_AS_VIEWS = False`) through the active `%sql` connection.

In [None]:
# --- CONFIG ---
# GitHub coordinates of the repository that holds the notebook's data.
REPO_USER = "danielsgraves"
REPO_NAME = "Graves_Greenery_Analysis"

# Local checkout location and the paths derived from it.
# NOTE(review): the /content prefix looks like a Google Colab workspace — confirm.
REPO_DIR = "/content/" + REPO_NAME
DATA_DIR = REPO_DIR + "/data"
DB_FILE = "/".join((REPO_DIR, "outputs", "graves_greenery.duckdb"))

# True  -> each CSV is exposed as a VIEW over the file;
# False -> each CSV is materialised as a TABLE.
LOAD_AS_VIEWS = True

# Number of rows previewed per table in the row-count cell (a falsy value skips the preview).
PRINT_SAMPLE_ROWS = 5

In [None]:
# --- SYNC REPO ---
import os, subprocess

def run(cmd):
    """Execute ``cmd`` through the shell, echo its output, and fail loudly on error.

    stderr is merged into stdout, so the printed text is the command's
    combined output.

    Args:
        cmd: Shell command line to execute.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    print(completed.stdout)
    if completed.returncode == 0:
        return
    raise RuntimeError(f"Command failed: {cmd}")

# Refresh an existing checkout in place; otherwise clone the repository from GitHub.
if os.path.exists(REPO_DIR):
    os.chdir(REPO_DIR)
    # Fetch first (pruning stale remote refs), then rebase onto upstream.
    for _git_step in ("git fetch --all --prune", "git pull --rebase"):
        run(_git_step)
else:
    run(f"git clone https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}")

# Make sure the outputs folder exists, then work from the repo root.
os.makedirs(f"{REPO_DIR}/outputs", exist_ok=True)
os.chdir(REPO_DIR)
print("Working dir:", os.getcwd())
run("ls -la")

In [None]:
# --- INSTALL: ipython-sql + PrettyTable + DuckDB ---
# Install the SQL magic, its table renderer, DuckDB and the SQLAlchemy-related
# packages into the running kernel's environment (-q keeps pip output quiet).
# NOTE(review): versions are unpinned, so a fresh install may pull releases that
# do not work together — consider pinning once a known-good set is confirmed.
%pip -q install ipython-sql prettytable duckdb duckdb-engine sqlalchemy pandas

# Load (or reload) the `sql` extension so the %sql / %%sql magics are registered.
%reload_ext sql
# Keep the magic's own result rendering instead of converting results to DataFrames.
%config SqlMagic.autopandas = False
# NOTE(review): presumably suppresses the magic's status messages — confirm.
%config SqlMagic.feedback = False

import prettytable as pt

# Pick the first PrettyTable style that ipython-sql can actually resolve.
#
# ipython-sql resolves SqlMagic.style by looking the name up in prettytable's
# module namespace (a `prettytable.__dict__[...]` lookup — see references), so
# the probe inspects that namespace directly. `hasattr` is not equivalent:
# newer prettytable releases answer the legacy constant names through a
# module-level deprecation shim, which makes `hasattr` succeed while the
# dictionary lookup still raises KeyError. Those releases keep the legacy
# values under `_DEPRECATED_<NAME>`, so each candidate is tried under both
# spellings, preferring the plain one.
candidates = ['MARKDOWN','PLAIN_COLUMNS','MSWORD_FRIENDLY','DOUBLE_BORDER','SINGLE_BORDER','DEFAULT']
pt_namespace = vars(pt)
avail = [
    name
    for base in candidates
    for name in (base, f"_DEPRECATED_{base}")
    if name in pt_namespace
]
if avail:
    chosen = avail[0]
    get_ipython().run_line_magic('config', f"SqlMagic.style = '{chosen}'")
    print(f"PrettyTable style set to: {chosen}")
else:
    # No usable style constant: fall back to returning pandas DataFrames.
    get_ipython().run_line_magic('config', "SqlMagic.autopandas = True")
    print("No PrettyTable styles detected; switched to DataFrame rendering.")

In [None]:
# --- RESET CONNECTIONS + CONNECT TO DUCKDB ---
# Reload the sql extension so this cell also works when run on its own.
%reload_ext sql
# Best-effort cleanup: try to close every connection ipython-sql still tracks so
# re-running the cell starts from a clean slate. Failures are reported, never raised.
try:
    from sql.connection import Connection
    # Iterate over a snapshot of the keys; closing may mutate the registry.
    for key in list(Connection.connections.keys()):
        try:
            # NOTE(review): verify that Connection.close accepts a registry key in the
            # installed ipython-sql version; if it expects a Connection instance this
            # call raises and the entry is merely reported as skipped below.
            Connection.close(key)
            print('Closed:', key)
        except Exception as e:
            print('Skip:', key, '->', e)
    print('All old connections closed.')
except Exception as e:
    # Reached when the import fails or the connection registry cannot be read.
    print('No prior connections or cleanup error:', e)

# SQLAlchemy-style URL pointing at the DuckDB file configured in DB_FILE.
db_url  = f"duckdb:///{DB_FILE}"
# Open the connection through the %sql magic ($db_url is expanded by IPython).
%sql $db_url
print('✅ Connected to:', db_url)

In [None]:
# --- LOAD ALL CSVs INTO DUCKDB ---
import os, glob
from IPython import get_ipython


def _quote_ident(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _quote_literal(value):
    """Return ``value`` as a single-quoted SQL string literal (embedded quotes doubled)."""
    return "'" + str(value).replace("'", "''") + "'"


# Every *.csv directly under DATA_DIR becomes one relation named after the file stem.
files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
print(f"Found {len(files)} CSV(s) in {DATA_DIR}")
if not files:
    print("⚠️ No CSVs found. Put your data files under /data.")
else:
    # LOAD_AS_VIEWS selects between a VIEW over the file and a materialised TABLE.
    relation_kind = "VIEW" if LOAD_AS_VIEWS else "TABLE"
    for path in files:
        table = os.path.splitext(os.path.basename(path))[0]
        # Quote the identifier and the path so file names such as "2024-sales.csv"
        # or paths containing an apostrophe still yield valid SQL.
        sql = f"""
        CREATE OR REPLACE {relation_kind} {_quote_ident(table)} AS
        SELECT * FROM read_csv_auto({_quote_literal(path)}, HEADER=TRUE);
        """
        # Run the statement on the active %sql connection.
        get_ipython().run_cell_magic('sql', '', sql)
        print(('VIEW ready: ' if LOAD_AS_VIEWS else 'TABLE loaded: ') + table)
print('Done.')

In [None]:
%%sql
SHOW TABLES;

In [None]:
# --- ROW COUNT DASHBOARD ---
import pandas as pd


def _quote_ident(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _sql_df(query):
    """Run ``query`` on the active %sql connection and return a pandas DataFrame.

    Depending on ``SqlMagic.autopandas`` the magic returns either a DataFrame or
    a result object exposing ``.DataFrame()``; both are normalised here, so this
    cell no longer needs to toggle the setting (and no longer overrides a
    DataFrame-rendering fallback chosen earlier in the notebook).

    Raises:
        RuntimeError: If the magic returns nothing (e.g. the query failed and
            the error was only printed).
    """
    result = get_ipython().run_cell_magic('sql', '', query)
    if result is None:
        raise RuntimeError(f"Query returned no result: {query}")
    if isinstance(result, pd.DataFrame):
        return result
    return result.DataFrame()


# 1) List user relations. Views are included explicitly because the loader
#    creates VIEWS by default and duckdb_tables() reports base tables only.
try:
    tables = _sql_df(
        "SELECT table_name FROM duckdb_tables() WHERE NOT internal "
        "UNION "
        "SELECT view_name AS table_name FROM duckdb_views() WHERE NOT internal "
        "ORDER BY table_name"
    )['table_name'].tolist()
except Exception as e:
    print('Fallback to SHOW TABLES due to error:', e)
    shown = _sql_df("SHOW TABLES")
    col = 'name' if 'name' in shown.columns else shown.columns[0]
    tables = shown[col].tolist()

# 2) Count rows per relation.
rows = []
for t in tables:
    n = int(_sql_df(f"SELECT COUNT(*) AS n FROM {_quote_ident(t)}")['n'].iloc[0])
    rows.append({'table': t, 'rows': n})

# Explicit columns keep the sort valid even when no relations were found.
counts = pd.DataFrame(rows, columns=['table', 'rows']).sort_values('table').reset_index(drop=True)
display(counts)

# 3) Optional preview samples (displayed, not just fetched).
if PRINT_SAMPLE_ROWS and tables:
    for t in tables:
        print(f"\n### {t} (first {PRINT_SAMPLE_ROWS} rows)")
        display(_sql_df(f"SELECT * FROM {_quote_ident(t)} LIMIT {int(PRINT_SAMPLE_ROWS)}"))

## SQL Sandbox
Use `%%sql` below to run queries directly against DuckDB.

In [None]:
%%sql
-- Example:
-- SELECT COUNT(*) AS n FROM dim_customers;
-- SELECT category, COUNT(*) FROM dim_plants GROUP BY category ORDER BY 2 DESC LIMIT 10;