# 🚀 Quick Start — Load Data & Enable SQL (No Server)

###_This section sets up an **in-memory DuckDB** and the `%%sql` magic so you can run SQL directly in Colab. CSVs from the GitHub repo are loaded into tables named after each file (snake_case)._

###**What you get:**
- One in-memory DuckDB session (no MySQL/SQLite servers)
- `%%sql` / `%sql` via `ipython-sql` with pretty table output
- Auto-load all CSVs from `data/**` → tables (e.g., `dim_customers`)
- A quick verification query you can edit

### 1) Install libraries & pull your repo

In [1]:
!pip -q install --upgrade duckdb duckdb-engine "sqlalchemy>=2.0" ipython-sql jupysql pandas

import os, subprocess
REPO_USER = "danielsgraves"                 # <-- repo owner
REPO_NAME = "Graves_Greenery_Analysis"     # <-- repo name
REPO_DIR  = f"/content/{REPO_NAME}"

if not os.path.exists(REPO_DIR):
    subprocess.run(
        f"git clone --depth 1 https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}",
        shell=True, check=True
    )
else:
    subprocess.run(f"git -C {REPO_DIR} pull --ff-only", shell=True, check=True)

print("Repo ready at:", REPO_DIR)
print("CSV root:", f"{REPO_DIR}/data")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/20.5 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m111.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver doe

### 2) Connect a single in-memory DuckDB session for `%%sql`

In [2]:
%reload_ext sql
%config SqlMagic.autopandas = False    # PrettyTable output (set True for DataFrame)
%config SqlMagic.feedback = False
%sql duckdb:///:memory:
print("✅ Connected %sql to in-memory DuckDB.")

✅ Connected %sql to in-memory DuckDB.


### 3) Load all CSVs → tables (names match file stems)

In [3]:
import os, re, glob
from pathlib import Path

CSV_GLOB = "data/**/*.[cC][sS][vV]"   # case-insensitive .csv
INCLUDE_PARENT_PREFIX = False          # True → prefix parent folder: e.g., sales_dim_customers

def to_snake(name: str) -> str:
    s = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    s = re.sub(r"_+", "_", s)
    if s and s[0].isdigit():
        s = "t_" + s
    return s.lower()

def table_name_for(csv_path: Path) -> str:
    stem = csv_path.stem
    if INCLUDE_PARENT_PREFIX and csv_path.parent != csv_path.parent.parent:
        return to_snake(csv_path.parent.name + "_" + stem)
    return to_snake(stem)

files = [Path(p) for p in glob.glob(os.path.join(REPO_DIR, CSV_GLOB), recursive=True)]
files = [p for p in files if p.is_file()]
print(f"Found {len(files)} CSV(s). Showing first 15 mappings…")
for rel, tbl in [(str(p.relative_to(REPO_DIR)), table_name_for(p)) for p in files[:15]]:
    print(f"  {rel}  →  {tbl}")

# Create tables via the same %sql connection (no secondary connections)
for p in files:
    tbl = table_name_for(p)
    q = f"""
    CREATE OR REPLACE TABLE "{tbl}" AS
    SELECT * FROM read_csv_auto('{str(p)}', header=True, sample_size=-1, ignore_errors=True);
    """
    get_ipython().run_cell_magic('sql', '', q)

print("Loaded tables (first few):", [table_name_for(p) for p in files[:8]])

Found 10 CSV(s). Showing first 15 mappings…
  data/fact_orders.csv  →  fact_orders
  data/dim_customers.csv  →  dim_customers
  data/dim_inventory.csv  →  dim_inventory
  data/dim_dates.csv  →  dim_dates
  data/dim_plants.csv  →  dim_plants
  data/data_dictionary.csv  →  data_dictionary
  data/dim_categories.csv  →  dim_categories
  data/graves_greenery_full_denormalized.csv  →  graves_greenery_full_denormalized
  data/dim_locations.csv  →  dim_locations
  data/fact_order_items.csv  →  fact_order_items
Loaded tables (first few): ['fact_orders', 'dim_customers', 'dim_inventory', 'dim_dates', 'dim_plants', 'data_dictionary', 'dim_categories', 'graves_greenery_full_denormalized']


### 4) Verify & sample
_If your repo has `data/**/dim_customers.csv`, the table will be **`dim_customers`**._

In [4]:
# Remove displaylimit
%config SqlMagic.displaylimit = None

In [5]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema='main'
ORDER BY table_name;

table_name
data_dictionary
dim_categories
dim_customers
dim_dates
dim_inventory
dim_locations
dim_plants
fact_order_items
fact_orders
graves_greenery_full_denormalized


In [6]:
%%sql
SELECT *
FROM dim_customers
LIMIT 2;

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
1,Aiden,Lopez,aiden.lopez@gravesgreenery.com,,798 birch way,,Olympia,WA,97589,USA,2023-03-11,True,,Friend
2,Ethan,MARTINEZ,emartinez@outlook.com,,6129 Willow St,,Seattle,WA,98766,USA,2023-11-25,True,,Friend


# Project Overview

# Problem Statement

# Data Cleaning and Preparation

---
Clean & trim columns from dim_customers table
---

In [93]:
%%sql
WITH dim_customers_cleaned AS(
      SELECT
                customer_id
              , CONCAT(UPPER(LEFT(TRIM(first_name),1)), LOWER(SUBSTRING(TRIM(first_name), 2))) AS first_name_cleaned -- Normalized first name
              , CONCAT(UPPER(LEFT(TRIM(last_name),1)), LOWER(SUBSTRING(TRIM(last_name), 2))) AS last_name_cleaned -- Normalized first name
              , TRIM(LOWER(email)) AS email_cleaned
              , CASE        -- Removes all non-numeric characters, country code & formats phone column
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 11 AND LEFT(regexp_replace(phone, '[^0-9]', '', 'g'), 1) = '1'
                        THEN regexp_replace(SUBSTRING(regexp_replace(phone, '[^0-9]', '', 'g'), 2), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 10
                        THEN regexp_replace(regexp_replace(phone, '[^0-9]', '', 'g'), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        ELSE regexp_replace(phone, '[^0-9]', '', 'g')
                END AS phone_cleaned
              , TRIM(UPPER(address1)) AS address_cleaned
              , address2
              , CONCAT(UPPER(LEFT(TRIM(city),1)), LOWER(SUBSTRING(TRIM(city), 2))) AS city_cleaned -- Normalized city
              , state_province
              , postal_code
              , country
              , signup_ts
              , marketing_opt_in
              , loyalty_tier
              , source_channel

FROM            dim_customers)

SELECT
              *
FROM          dim_customers_cleaned
WHERE         first_name_cleaned = 'Aiden' AND last_name_cleaned = 'Lopez'
LIMIT 2

customer_id,first_name_cleaned,last_name_cleaned,email_cleaned,phone_cleaned,address_cleaned,address2,city_cleaned,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
1,Aiden,Lopez,aiden.lopez@gravesgreenery.com,,798 BIRCH WAY,,Olympia,WA,97589,USA,2023-03-11,True,,Friend
537,Aiden,Lopez,aiden.lopez@gravesgreenery.com,(106) 592-6010,1494 BIRCH WAY,,Eugene,OR,98143,USA,2023-08-18,True,Silver,Google


---
Exploratory cleanup
---

In [92]:
%%sql
SELECT DISTINCT
                    first_name
FROM                dim_customers
LIMIT 1

first_name
Ethan


# Exploratory Data Analysis (EDA)

# Solution and Implementation

# Recommendations and Next Steps