# 🚀 Quick Start — Load Data & Enable SQL (No Server)

###_This section sets up an **in-memory DuckDB** and the `%%sql` magic so you can run SQL directly in Colab. CSVs from the GitHub repo are loaded into tables named after each file (snake_case)._

###**What you get:**
- One in-memory DuckDB session (no MySQL/SQLite servers)
- `%%sql` / `%sql` via `ipython-sql` with pretty table output
- Auto-load all CSVs from `data/**` → tables (e.g., `dim_customers`)
- A quick verification query you can edit

### 1) Install libraries & pull your repo

In [132]:
!pip -q install --upgrade duckdb duckdb-engine "sqlalchemy>=2.0" ipython-sql jupysql pandas

import os, subprocess
REPO_USER = "danielsgraves"                 # <-- repo owner
REPO_NAME = "Graves_Greenery_Analysis"     # <-- repo name
REPO_DIR  = f"/content/{REPO_NAME}"

if not os.path.exists(REPO_DIR):
    subprocess.run(
        f"git clone --depth 1 https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}",
        shell=True, check=True
    )
else:
    subprocess.run(f"git -C {REPO_DIR} pull --ff-only", shell=True, check=True)

print("Repo ready at:", REPO_DIR)
print("CSV root:", f"{REPO_DIR}/data")

Repo ready at: /content/Graves_Greenery_Analysis
CSV root: /content/Graves_Greenery_Analysis/data


### 2) Connect a single in-memory DuckDB session for `%%sql`

In [133]:
%reload_ext sql
%config SqlMagic.autopandas = False    # PrettyTable output (set True for DataFrame)
%config SqlMagic.feedback = False
%sql duckdb:///:memory:
print("✅ Connected %sql to in-memory DuckDB.")

✅ Connected %sql to in-memory DuckDB.


### 3) Load all CSVs → tables (names match file stems)

In [134]:
import os, re, glob
from pathlib import Path

CSV_GLOB = "data/**/*.[cC][sS][vV]"   # case-insensitive .csv
INCLUDE_PARENT_PREFIX = False          # True → prefix parent folder: e.g., sales_dim_customers

def to_snake(name: str) -> str:
    s = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    s = re.sub(r"_+", "_", s)
    if s and s[0].isdigit():
        s = "t_" + s
    return s.lower()

def table_name_for(csv_path: Path) -> str:
    stem = csv_path.stem
    if INCLUDE_PARENT_PREFIX and csv_path.parent != csv_path.parent.parent:
        return to_snake(csv_path.parent.name + "_" + stem)
    return to_snake(stem)

files = [Path(p) for p in glob.glob(os.path.join(REPO_DIR, CSV_GLOB), recursive=True)]
files = [p for p in files if p.is_file()]
print(f"Found {len(files)} CSV(s). Showing first 15 mappings…")
for rel, tbl in [(str(p.relative_to(REPO_DIR)), table_name_for(p)) for p in files[:15]]:
    print(f"  {rel}  →  {tbl}")

# Create tables via the same %sql connection (no secondary connections)
for p in files:
    tbl = table_name_for(p)
    q = f"""
    CREATE OR REPLACE TABLE "{tbl}" AS
    SELECT * FROM read_csv_auto('{str(p)}', header=True, sample_size=-1, ignore_errors=True);
    """
    get_ipython().run_cell_magic('sql', '', q)

print("Loaded tables (first few):", [table_name_for(p) for p in files[:8]])

Found 14 CSV(s). Showing first 15 mappings…
  data/fact_orders.csv  →  fact_orders
  data/fact_returns.csv  →  fact_returns
  data/dim_customers.csv  →  dim_customers
  data/dim_inventory.csv  →  dim_inventory
  data/dim_promotions.csv  →  dim_promotions
  data/dim_dates.csv  →  dim_dates
  data/dim_plants.csv  →  dim_plants
  data/dim_return_reason.csv  →  dim_return_reason
  data/data_dictionary.csv  →  data_dictionary
  data/fact_order_promotions.csv  →  fact_order_promotions
  data/graves_greenery_full_denormalized.csv  →  graves_greenery_full_denormalized
  data/dim_locations.csv  →  dim_locations
  data/fact_order_items.csv  →  fact_order_items
  data/dim_plant_category.csv  →  dim_plant_category
Loaded tables (first few): ['fact_orders', 'fact_returns', 'dim_customers', 'dim_inventory', 'dim_promotions', 'dim_dates', 'dim_plants', 'dim_return_reason']


### 4) Verify & sample
_If your repo has `data/**/dim_customers.csv`, the table will be **`dim_customers`**._

In [135]:
# Remove displaylimit
%config SqlMagic.displaylimit = None

In [136]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema='main'
ORDER BY table_name;

table_name
data_dictionary
dim_categories
dim_customers
dim_customers_cleaned
dim_dates
dim_inventory
dim_locations
dim_plant_category
dim_plants
dim_promotions


In [137]:
%%sql
SELECT *
FROM dim_customers
LIMIT 2;

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
1,Lucas,Johansson,johansson.lucas@gmail.com,(424) 782-5074,3886 Fir Rd,,Grand Rapids,MI,49056,USA,2023-09-10 15:49:16,False,Green,In-Store
2,Karim,Kobayashi,kobayashi.karim@hotmail.com,(245) 627-4759,4868 Juniper Ter,,Charlottetown,PE,C8H 1K6,Canada,2022-06-20 13:44:19,False,Emerald,Online


# Project Overview

# Problem Statement

# Data Cleaning and Preparation

---
Clean & trim columns from dim_customers table and create cleaned view
---

In [138]:
%%sql
CREATE OR REPLACE VIEW dim_customers_cleaned AS
SELECT
                customer_id
              , CONCAT(UPPER(LEFT(TRIM(first_name),1)), LOWER(SUBSTRING(TRIM(first_name), 2))) AS first_name -- Normalized first name
              , CONCAT(UPPER(LEFT(TRIM(last_name),1)), LOWER(SUBSTRING(TRIM(last_name), 2))) AS last_name -- Normalized first name
              , TRIM(LOWER(email)) AS email
              , CASE        -- Removes all non-numeric characters, country code & formats phone column
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 11 AND LEFT(regexp_replace(phone, '[^0-9]', '', 'g'), 1) = '1'
                        THEN regexp_replace(SUBSTRING(regexp_replace(phone, '[^0-9]', '', 'g'), 2), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 10
                        THEN regexp_replace(regexp_replace(phone, '[^0-9]', '', 'g'), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        ELSE regexp_replace(phone, '[^0-9]', '', 'g')
                END AS phone
              , TRIM(UPPER(address1)) AS address1
              , address2
              , CONCAT(UPPER(LEFT(TRIM(city),1)), LOWER(SUBSTRING(TRIM(city), 2))) AS city -- Normalized city
              , state_province
              , postal_code
              , country
              , signup_ts
              , marketing_opt_in
              , loyalty_tier
              , source_channel

FROM
                dim_customers

Count


---
Dedupe customers table
---

In [139]:
%%sql
WITH duplicates AS (SELECT DISTINCT
                    first_name
                  , last_name
                  , city
                  , COUNT(*) AS duplicate_count
FROM
                    dim_customers
GROUP BY
                    first_name
                  , last_name
                  , city
HAVING
                    count(*) > 1
)

SELECT
          dc.*
FROM
          dim_customers dc
  JOIN
          duplicates d ON
              dc.first_name = d.first_name AND
              dc.last_name = d.last_name AND
              dc.city = d.city
ORDER BY
          dc.first_name
        , dc.last_name
        , dc.city

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
81,Ara,Nowak,anowak@gmail.com,(333) 515-4878,585 Alder St,,Pittsburgh,PA,18568,USA,2021-02-09 22:49:12,True,Silver,Online
9296,Ara,Nowak,nowak.ara@icloud.com,(974) 979-3629,6826 Cherry Ave,,Pittsburgh,PA,19639,USA,2024-02-16 20:22:37,False,Gold,Online
85,Evelyn,Thomas,ethomas@outlook.com,(684) 285-7437,3583 Cedar Rd,,Wilmington,DE,19770,USA,2024-09-24 23:54:30,True,Gold,In-Store
9300,Evelyn,Thomas,evelyn.thomas@icloud.com,(801) 350-8929,7467 Pine Pl,,Wilmington,DE,19866,USA,2022-11-06 06:22:43,True,Silver,In-Store
211,Jack,Pereira,jackpereira@yahoo.com,(348) 553-2402,491 Alder Blvd,,Montgomery,AL,36119,USA,2020-08-14 06:30:49,True,Green,In-Store
9426,Jack,Pereira,jack_pereira@proton.me,(969) 240-9379,8678 Vine St,,Montgomery,AL,36096,USA,2024-02-20 09:02:27,False,Green,In-Store
150,James,Greco,jgreco9644@aol.com,(356) 765-4944,3978 Elm Ct,,Tulsa,OK,73686,USA,2023-09-17 06:58:58,False,Green,In-Store
9365,James,Greco,james_greco7236@outlook.com,(870) 859-6371,2747 Laurel Ct,,Tulsa,OK,74252,USA,2020-05-25 09:09:49,True,Green,Online
285,Madison,Ferreira,mferreira4164@aol.com,(637) 932-3901,1780 Elm Ter,Apt 51,Salt Lake City,UT,84159,USA,2021-10-28 19:08:05,False,Silver,In-Store
9500,Madison,Ferreira,madison_ferreira2147@aol.com,(622) 837-8247,9587 Poplar Ter,,Salt Lake City,UT,84614,USA,2024-03-30 13:16:35,True,Green,Online


---
Orders exploration
---

---
Highest margin plants
---

In [141]:
%%sql
SELECT
              foi.order_id
            , p.plant_name
            , foi.plant_id
            , foi.unit_price
            , foi.estimated_margin AS margin

FROM
              fact_order_items foi
LEFT JOIN
              dim_plants p
        ON
              foi.plant_id = p.plant_id
ORDER BY
              margin DESC
LIMIT 25

order_id,plant_name,plant_id,unit_price,margin
311170,Malus domestica 2gal,670,266.71,302.46
300146,Malus domestica 2gal,670,263.84,293.85
313923,Malus domestica 2gal,670,256.39,271.5
314048,Malus domestica 2gal,670,286.04,240.3
306372,Malus domestica 2gal,670,285.09,238.4
315459,Malus domestica 2gal,670,284.53,237.28
309002,Malus domestica 2gal,670,281.69,231.6
324774,Brassia verrucosa 'Tricolor' 2gal,104,148.97,224.28
319364,Brassia verrucosa 'Tricolor' 2gal,104,147.3,219.27
306001,Brassia verrucosa 'Tricolor' 2gal,104,145.56,214.05


# Exploratory Data Analysis (EDA)

# Solution and Implementation

# Recommendations and Next Steps