# 🚀 Quick Start — Load Data & Enable SQL (No Server)

###_This section sets up an **in-memory DuckDB** and the `%%sql` magic so you can run SQL directly in Colab. CSVs from the GitHub repo are loaded into tables named after each file (snake_case)._

###**What you get:**
- One in-memory DuckDB session (no MySQL/SQLite servers)
- `%%sql` / `%sql` via `ipython-sql` with pretty table output
- Auto-load all CSVs from `data/**` → tables (e.g., `dim_customers`)
- A quick verification query you can edit

### 1) Install libraries & pull your repo

In [100]:
!pip -q install --upgrade duckdb duckdb-engine "sqlalchemy>=2.0" ipython-sql jupysql pandas

import os, subprocess
REPO_USER = "danielsgraves"                 # <-- repo owner
REPO_NAME = "Graves_Greenery_Analysis"     # <-- repo name
REPO_DIR  = f"/content/{REPO_NAME}"

if not os.path.exists(REPO_DIR):
    subprocess.run(
        f"git clone --depth 1 https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}",
        shell=True, check=True
    )
else:
    subprocess.run(f"git -C {REPO_DIR} pull --ff-only", shell=True, check=True)

print("Repo ready at:", REPO_DIR)
print("CSV root:", f"{REPO_DIR}/data")

Repo ready at: /content/Graves_Greenery_Analysis
CSV root: /content/Graves_Greenery_Analysis/data


### 2) Connect a single in-memory DuckDB session for `%%sql`

In [101]:
%reload_ext sql
%config SqlMagic.autopandas = False    # PrettyTable output (set True for DataFrame)
%config SqlMagic.feedback = False
%sql duckdb:///:memory:
print("✅ Connected %sql to in-memory DuckDB.")

✅ Connected %sql to in-memory DuckDB.


### 3) Load all CSVs → tables (names match file stems)

In [102]:
import os, re, glob
from pathlib import Path

CSV_GLOB = "data/**/*.[cC][sS][vV]"   # case-insensitive .csv
INCLUDE_PARENT_PREFIX = False          # True → prefix parent folder: e.g., sales_dim_customers

def to_snake(name: str) -> str:
    s = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    s = re.sub(r"_+", "_", s)
    if s and s[0].isdigit():
        s = "t_" + s
    return s.lower()

def table_name_for(csv_path: Path) -> str:
    stem = csv_path.stem
    if INCLUDE_PARENT_PREFIX and csv_path.parent != csv_path.parent.parent:
        return to_snake(csv_path.parent.name + "_" + stem)
    return to_snake(stem)

files = [Path(p) for p in glob.glob(os.path.join(REPO_DIR, CSV_GLOB), recursive=True)]
files = [p for p in files if p.is_file()]
print(f"Found {len(files)} CSV(s). Showing first 15 mappings…")
for rel, tbl in [(str(p.relative_to(REPO_DIR)), table_name_for(p)) for p in files[:15]]:
    print(f"  {rel}  →  {tbl}")

# Create tables via the same %sql connection (no secondary connections)
for p in files:
    tbl = table_name_for(p)
    q = f"""
    CREATE OR REPLACE TABLE "{tbl}" AS
    SELECT * FROM read_csv_auto('{str(p)}', header=True, sample_size=-1, ignore_errors=True);
    """
    get_ipython().run_cell_magic('sql', '', q)

print("Loaded tables (first few):", [table_name_for(p) for p in files[:8]])

Found 13 CSV(s). Showing first 15 mappings…
  data/fact_orders.csv  →  fact_orders
  data/fact_returns.csv  →  fact_returns
  data/dim_customers.csv  →  dim_customers
  data/dim_inventory.csv  →  dim_inventory
  data/dim_promotions.csv  →  dim_promotions
  data/dim_dates.csv  →  dim_dates
  data/dim_plants.csv  →  dim_plants
  data/dim_return_reason.csv  →  dim_return_reason
  data/fact_order_promotions.csv  →  fact_order_promotions
  data/graves_greenery_full_denormalized.csv  →  graves_greenery_full_denormalized
  data/dim_locations.csv  →  dim_locations
  data/fact_order_items.csv  →  fact_order_items
  data/dim_plant_category.csv  →  dim_plant_category
Loaded tables (first few): ['fact_orders', 'fact_returns', 'dim_customers', 'dim_inventory', 'dim_promotions', 'dim_dates', 'dim_plants', 'dim_return_reason']


### 4) Verify & sample
_If your repo has `data/**/dim_customers.csv`, the table will be **`dim_customers`**._

In [103]:
# Remove displaylimit
%config SqlMagic.displaylimit = None

In [104]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema='main'
ORDER BY table_name;

table_name
data_dictionary
dim_categories
dim_customers
dim_dates
dim_inventory
dim_locations
dim_plant_category
dim_plants
dim_promotions
dim_return_reason


In [105]:
%%sql
SELECT *
FROM dim_customers
LIMIT 2;

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
1,Sebastian,Sanders,sebastiansanders@gmail.com,(471) 251-7178,9739 Cedar Blvd,,Burnaby,NB,N9K 4C0,Canada,2025-01-24 05:08:32,True,Silver,Online
2,Kayden,Davis,kayden_davis@aol.com,(255) 691-6483,8799 Cedar Ct,,Birmingham,FL,95459,USA,2022-10-29 18:30:57,True,Silver,In-Store


# Project Overview

# Problem Statement

# Data Cleaning and Preparation

---
Clean & trim columns from dim_customers table and create cleaned view
---

In [110]:
%%sql
CREATE OR REPLACE VIEW dim_customers_cleaned AS
SELECT
                customer_id
              , CONCAT(UPPER(LEFT(TRIM(first_name),1)), LOWER(SUBSTRING(TRIM(first_name), 2))) AS first_name -- Normalized first name
              , CONCAT(UPPER(LEFT(TRIM(last_name),1)), LOWER(SUBSTRING(TRIM(last_name), 2))) AS last_name -- Normalized first name
              , TRIM(LOWER(email)) AS email
              , CASE        -- Removes all non-numeric characters, country code & formats phone column
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 11 AND LEFT(regexp_replace(phone, '[^0-9]', '', 'g'), 1) = '1'
                        THEN regexp_replace(SUBSTRING(regexp_replace(phone, '[^0-9]', '', 'g'), 2), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 10
                        THEN regexp_replace(regexp_replace(phone, '[^0-9]', '', 'g'), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        ELSE regexp_replace(phone, '[^0-9]', '', 'g')
                END AS phone
              , TRIM(UPPER(address1)) AS address1
              , address2
              , CONCAT(UPPER(LEFT(TRIM(city),1)), LOWER(SUBSTRING(TRIM(city), 2))) AS city -- Normalized city
              , state_province
              , postal_code
              , country
              , signup_ts
              , marketing_opt_in
              , loyalty_tier
              , source_channel

FROM
                dim_customers

Count


---
Dedupe customers table
---

In [123]:
%%sql
WITH duplicates AS (SELECT DISTINCT
                    first_name
                  , last_name
                  , city
                  , COUNT(*) AS duplicate_count
FROM
                    dim_customers
GROUP BY
                    first_name
                  , last_name
                  , city
HAVING
                    count(*) > 1
)

SELECT
          dc.*
FROM
          dim_customers dc
  JOIN
          duplicates d ON
              dc.first_name = d.first_name AND
              dc.last_name = d.last_name AND
              dc.city = d.city
ORDER BY
          dc.first_name
        , dc.last_name
        , dc.city

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
732,Amelie,Bennett,abennett5945@aol.com,(297) 880-7988,9983 Juniper Dr,,Fort Collins,MD,14512,USA,2022-09-15 02:42:51,False,Silver,In-Store
14715,Amelie,Bennett,bennett.amelie@yahoo.com,(393) 229-8759,8556 Birch Ct,,Fort Collins,WA,37490,USA,2023-05-22 13:34:42,True,Silver,Online
13941,Easton,Moreau,easton.moreau@yahoo.com,(286) 353-3814,8665 Maple Rd,,Atlanta,VA,72054,USA,2023-12-30 23:02:03,True,Green,Online
14547,Easton,Moreau,moreau.easton2842@outlook.com,(282) 619-2079,1714 Ivy Pl,,Atlanta,DE,91418,USA,2020-11-18 11:11:14,True,Silver,Online
12236,Gael,Gonzales,gonzales.gael@yahoo.com,(298) 274-6441,9305 Palm Blvd,Apt 122,Boston,WV,66108,USA,2021-07-09 06:44:48,True,Green,In-Store
14998,Gael,Gonzales,gael.gonzales8350@hotmail.com,(336) 293-7088,4420 Magnolia St,Apt 235,Boston,WV,13764,USA,2020-05-27 07:16:11,False,Silver,In-Store
6149,Hudson,Wallace,hwallace@gmail.com,(819) 962-9644,323 Laurel Dr,,Boise,IN,45448,USA,2023-08-16 02:35:47,False,Green,Online
14732,Hudson,Wallace,hudson_wallace@outlook.com,(869) 927-9689,6311 Alder Ct,,Boise,GA,40545,USA,2023-10-02 10:40:41,True,Silver,Online
6947,James,Russell,james_russell127@aol.com,(666) 821-6377,4784 Ivy Dr,Apt 27,Raleigh,VT,61535,USA,2021-06-15 08:12:37,True,Green,Online
14711,James,Russell,jamesrussell@outlook.com,(689) 572-4123,568 Fir Ave,,Raleigh,RI,40239,USA,2024-06-12 07:16:53,False,Green,Online


---
Orders exploration
---

---
Highest margin plants
---

In [131]:
%%sql
SELECT
              foi.order_id
            , p.scientific_name
            , p.genus
            , p.common_name
            , foi.plant_id
            , foi.unit_price
            , foi.estimated_margin AS margin

FROM
              fact_order_items foi
LEFT JOIN
              dim_plants p
        ON
              foi.plant_id = p.plant_id
ORDER BY
              margin DESC
LIMIT 25

order_id,scientific_name,genus,common_name,plant_id,unit_price,margin
312245,,,,451,277.51,448.83
318856,,,,451,273.41,436.53
307915,,,,432,272.61,428.1
305916,,,,451,263.39,406.47
313001,,,,432,261.44,394.59
307039,,,,451,256.23,384.99
316121,,,,432,257.01,381.3
310884,,,,451,278.27,325.89
310526,,,,451,281.46,307.12
304063,,,,451,278.49,301.18


# Exploratory Data Analysis (EDA)

# Solution and Implementation

# Recommendations and Next Steps