# 🚀 Quick Start — Load Data & Enable SQL (No Server)

###_This section sets up an **in-memory DuckDB** and the `%%sql` magic so you can run SQL directly in Colab. CSVs from the GitHub repo are loaded into tables named after each file (snake_case)._

###**What you get:**
- One in-memory DuckDB session (no MySQL/SQLite servers)
- `%%sql` / `%sql` via `ipython-sql` with pretty table output
- Auto-load all CSVs from `data/**` → tables (e.g., `dim_customers`)
- A quick verification query you can edit

### 1) Install libraries & pull your repo

In [1]:
!pip -q install --upgrade duckdb duckdb-engine "sqlalchemy>=2.0" ipython-sql jupysql pandas

import os, subprocess
REPO_USER = "danielsgraves"                 # <-- repo owner
REPO_NAME = "Graves_Greenery_Analysis"     # <-- repo name
REPO_DIR  = f"/content/{REPO_NAME}"

if not os.path.exists(REPO_DIR):
    subprocess.run(
        f"git clone --depth 1 https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}",
        shell=True, check=True
    )
else:
    subprocess.run(f"git -C {REPO_DIR} pull --ff-only", shell=True, check=True)

print("Repo ready at:", REPO_DIR)
print("CSV root:", f"{REPO_DIR}/data")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/20.5 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver do

### 2) Connect a single in-memory DuckDB session for `%%sql`

In [2]:
%reload_ext sql
%config SqlMagic.autopandas = False    # PrettyTable output (set True for DataFrame)
%config SqlMagic.feedback = False
%sql duckdb:///:memory:
print("✅ Connected %sql to in-memory DuckDB.")

✅ Connected %sql to in-memory DuckDB.


### 3) Load all CSVs → tables (names match file stems)

In [3]:
import os, re, glob
from pathlib import Path

CSV_GLOB = "data/**/*.[cC][sS][vV]"   # case-insensitive .csv
INCLUDE_PARENT_PREFIX = False          # True → prefix parent folder: e.g., sales_dim_customers

def to_snake(name: str) -> str:
    s = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    s = re.sub(r"_+", "_", s)
    if s and s[0].isdigit():
        s = "t_" + s
    return s.lower()

def table_name_for(csv_path: Path) -> str:
    stem = csv_path.stem
    if INCLUDE_PARENT_PREFIX and csv_path.parent != csv_path.parent.parent:
        return to_snake(csv_path.parent.name + "_" + stem)
    return to_snake(stem)

files = [Path(p) for p in glob.glob(os.path.join(REPO_DIR, CSV_GLOB), recursive=True)]
files = [p for p in files if p.is_file()]
print(f"Found {len(files)} CSV(s). Showing first 15 mappings…")
for rel, tbl in [(str(p.relative_to(REPO_DIR)), table_name_for(p)) for p in files[:15]]:
    print(f"  {rel}  →  {tbl}")

# Create tables via the same %sql connection (no secondary connections)
for p in files:
    tbl = table_name_for(p)
    q = f"""
    CREATE OR REPLACE TABLE "{tbl}" AS
    SELECT * FROM read_csv_auto('{str(p)}', header=True, sample_size=-1, ignore_errors=True);
    """
    get_ipython().run_cell_magic('sql', '', q)

print("Loaded tables (first few):", [table_name_for(p) for p in files[:8]])

Found 10 CSV(s). Showing first 15 mappings…
  data/dim_dates.csv  →  dim_dates
  data/data_dictionary.csv  →  data_dictionary
  data/dim_customers.csv  →  dim_customers
  data/graves_greenery_full_denormalized.csv  →  graves_greenery_full_denormalized
  data/fact_order_items.csv  →  fact_order_items
  data/dim_plants.csv  →  dim_plants
  data/dim_categories.csv  →  dim_categories
  data/dim_inventory.csv  →  dim_inventory
  data/fact_orders.csv  →  fact_orders
  data/dim_locations.csv  →  dim_locations
Loaded tables (first few): ['dim_dates', 'data_dictionary', 'dim_customers', 'graves_greenery_full_denormalized', 'fact_order_items', 'dim_plants', 'dim_categories', 'dim_inventory']


### 4) Verify & sample
_If your repo has `data/**/dim_customers.csv`, the table will be **`dim_customers`**._

In [8]:
# Remove displaylimit
%config SqlMagic.displaylimit = None

In [9]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema='main'
ORDER BY table_name;

table_name
data_dictionary
dim_categories
dim_customers
dim_dates
dim_inventory
dim_locations
dim_plants
fact_order_items
fact_orders
graves_greenery_full_denormalized


In [11]:
%%sql
-- Test query (adjust name if your file is different)
SELECT * FROM graves_greenery_full_denormalized LIMIT 2;

order_item_id,order_id_fk,order_ts,order_date_key,order_channel,order_status,customer_id_fk,first_name,last_name,email,city,state_province,postal_code,country,plant_id_fk,scientific_name,common_name,genus,indoor_outdoor,pot_size,sku,msrp,unit_price,qty,fulfilled_qty,line_discount,subtotal,discount,tax,shipping_fee,grand_total,location_name_ship,location_name_store,coupon_code,payment_method,unit_cost_snapshot,ship_from_location_id_fk,order_id,store_location_id_fk,plant_id,category_id_fk,is_pet_toxic,light_req,water_req,avg_maturity_height_in,cost_basis,price_band,rarity_score,customer_id,phone,address1,address2,signup_ts,marketing_opt_in,loyalty_tier,source_channel,location_id_ship,address1_ship,city_ship,state_province_ship,postal_code_ship,country_ship,location_id_store,address1_store,city_store,state_province_store,postal_code_store,country_store,category_id,category_level1,category_level2,genus_cat,date_key,full_date,year,quarter,month,month_name,day,day_of_week,day_name,is_weekend
1,1,2024-04-07 19:51:31,20240407,In-Store,Completed,10306,Daniel,Davis,ddavis@outlook.com,Seattle,WA,97810,USA,56,Calathea orbifolia,Orbifolia,Calathea,Indoor,10in,CAL10-00056,72.23,86.02,3,3,0.0,462.03,3.97,35.38,0.0,493.44,Main Warehouse,Vancouver Store,VIP20,PayPal,40.02,3.0,1,2,56,10,0,Bright Indirect,High,24,39.54,Standard,3,10306,226-296-7213,9741 River Way,,2023-06-12,True,,Walk-in,3.0,50 Greenhouse Way,Vancouver,WA,98661,USA,2,1200 Main Ave,Vancouver,WA,98660,USA,10,Indoor,Ficus,Calathea,20240407,2024-04-07,2024,2,4,April,7,7,Sunday,True
2,1,2024-04-07 19:51:31,20240407,In-Store,Completed,10306,Daniel,Davis,ddavis@outlook.com,Seattle,WA,97810,USA,236,Acer rubrum,Red Maple,Acer,Outdoor,3gal,ACE3-00236,142.97,168.2,1,1,0.0,462.03,3.97,35.38,0.0,493.44,Main Warehouse,Vancouver Store,VIP20,PayPal,91.39,3.0,1,2,236,31,0,Medium,High,74,90.21,Standard,2,10306,226-296-7213,9741 River Way,,2023-06-12,True,,Walk-in,3.0,50 Greenhouse Way,Vancouver,WA,98661,USA,2,1200 Main Ave,Vancouver,WA,98660,USA,31,Outdoor,Grasses,Acer,20240407,2024-04-07,2024,2,4,April,7,7,Sunday,True


# Project Overview

# Problem Statement

# Data Cleaning and Preparation

# Exploratory Data Analysis (EDA)

# Solution and Implementation

# Recommendations and Next Steps