# 🚀 Quick Start — Load Data & Enable SQL (No Server)

###_This section sets up an **in-memory DuckDB** and the `%%sql` magic so you can run SQL directly in Colab. CSVs from the GitHub repo are loaded into tables named after each file (snake_case)._

###**What you get:**
- One in-memory DuckDB session (no MySQL/SQLite servers)
- `%%sql` / `%sql` via `ipython-sql` with pretty table output
- Auto-load all CSVs from `data/**` → tables (e.g., `dim_customers`)
- A quick verification query you can edit

### 1) Install libraries & pull your repo

In [1]:
!pip -q install --upgrade duckdb duckdb-engine "sqlalchemy>=2.0" ipython-sql jupysql pandas

import os, subprocess
REPO_USER = "danielsgraves"                 # <-- repo owner
REPO_NAME = "Graves_Greenery_Analysis"     # <-- repo name
REPO_DIR  = f"/content/{REPO_NAME}"

if not os.path.exists(REPO_DIR):
    subprocess.run(
        f"git clone --depth 1 https://github.com/{REPO_USER}/{REPO_NAME}.git {REPO_DIR}",
        shell=True, check=True
    )
else:
    subprocess.run(f"git -C {REPO_DIR} pull --ff-only", shell=True, check=True)

print("Repo ready at:", REPO_DIR)
print("CSV root:", f"{REPO_DIR}/data")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/20.5 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m100.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### 2) Connect a single in-memory DuckDB session for `%%sql`

In [2]:
%reload_ext sql
%config SqlMagic.autopandas = False    # PrettyTable output (set True for DataFrame)
%config SqlMagic.feedback = False
%sql duckdb:///:memory:
print("✅ Connected %sql to in-memory DuckDB.")

✅ Connected %sql to in-memory DuckDB.


### 3) Load all CSVs → tables (names match file stems)

In [3]:
import os, re, glob
from pathlib import Path

CSV_GLOB = "data/**/*.[cC][sS][vV]"   # case-insensitive .csv
INCLUDE_PARENT_PREFIX = False          # True → prefix parent folder: e.g., sales_dim_customers

def to_snake(name: str) -> str:
    s = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    s = re.sub(r"_+", "_", s)
    if s and s[0].isdigit():
        s = "t_" + s
    return s.lower()

def table_name_for(csv_path: Path) -> str:
    stem = csv_path.stem
    if INCLUDE_PARENT_PREFIX and csv_path.parent != csv_path.parent.parent:
        return to_snake(csv_path.parent.name + "_" + stem)
    return to_snake(stem)

files = [Path(p) for p in glob.glob(os.path.join(REPO_DIR, CSV_GLOB), recursive=True)]
files = [p for p in files if p.is_file()]
print(f"Found {len(files)} CSV(s). Showing first 15 mappings…")
for rel, tbl in [(str(p.relative_to(REPO_DIR)), table_name_for(p)) for p in files[:15]]:
    print(f"  {rel}  →  {tbl}")

# Create tables via the same %sql connection (no secondary connections)
for p in files:
    tbl = table_name_for(p)
    q = f"""
    CREATE OR REPLACE TABLE "{tbl}" AS
    SELECT * FROM read_csv_auto('{str(p)}', header=True, sample_size=-1, ignore_errors=True);
    """
    get_ipython().run_cell_magic('sql', '', q)

print("Loaded tables (first few):", [table_name_for(p) for p in files[:8]])

Found 14 CSV(s). Showing first 15 mappings…
  data/dim_dates.csv  →  dim_dates
  data/dim_plant_category.csv  →  dim_plant_category
  data/dim_inventory.csv  →  dim_inventory
  data/dim_customers_clean.csv  →  dim_customers_clean
  data/fact_returns.csv  →  fact_returns
  data/fact_order_promotions.csv  →  fact_order_promotions
  data/dim_return_reason.csv  →  dim_return_reason
  data/fact_order_items.csv  →  fact_order_items
  data/dim_locations.csv  →  dim_locations
  data/dim_promotions.csv  →  dim_promotions
  data/dim_customers.csv  →  dim_customers
  data/graves_greenery_full_denormalized.csv  →  graves_greenery_full_denormalized
  data/dim_plants.csv  →  dim_plants
  data/fact_orders.csv  →  fact_orders
Loaded tables (first few): ['dim_dates', 'dim_plant_category', 'dim_inventory', 'dim_customers_clean', 'fact_returns', 'fact_order_promotions', 'dim_return_reason', 'fact_order_items']


### 4) Verify & sample
_If your repo has `data/**/dim_customers.csv`, the table will be **`dim_customers`**._

In [4]:
# Remove displaylimit
%config SqlMagic.displaylimit = None

In [5]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema='main'
ORDER BY table_name;

table_name
dim_customers
dim_customers_clean
dim_dates
dim_inventory
dim_locations
dim_plant_category
dim_plants
dim_promotions
dim_return_reason
fact_order_items


In [6]:
%%sql
SELECT *
FROM dim_customers
LIMIT 2;

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
1,Lucas,Johansson,LUCAS.JOHANSSON@HOTMAIL.COM,(426) 633-9078,8537 Poplar Rd,,Edmonton,AB,T5G 3K7,Canada,2022-01-28 19:12:45,True,Green,In-Store
2,karim,kobayashi,kkobayashi@gmail.com,(550) 746-9092,1768 Myrtle Ln,,Minneapolis,MN,55739,USA,2025-03-29 09:41:07,True,Emerald,Online


# Project Overview

# Problem Statement

# Data Cleaning and Preparation

---
Clean & trim columns from dim_customers table and create cleaned view
---

In [7]:
%%sql
CREATE OR REPLACE VIEW dim_customers_cleaned AS
SELECT
                customer_id
              , CONCAT(UPPER(LEFT(TRIM(first_name),1)), LOWER(SUBSTRING(TRIM(first_name), 2))) AS first_name -- Normalized first name
              , CONCAT(UPPER(LEFT(TRIM(last_name),1)), LOWER(SUBSTRING(TRIM(last_name), 2))) AS last_name -- Normalized first name
              , TRIM(LOWER(email)) AS email
              , CASE        -- Removes all non-numeric characters, country code & formats phone column
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 11 AND LEFT(regexp_replace(phone, '[^0-9]', '', 'g'), 1) = '1'
                        THEN regexp_replace(SUBSTRING(regexp_replace(phone, '[^0-9]', '', 'g'), 2), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        WHEN LENGTH(regexp_replace(phone, '[^0-9]', '', 'g')) = 10
                        THEN regexp_replace(regexp_replace(phone, '[^0-9]', '', 'g'), '(\d{3})(\d{3})(\d{4})', '(\1) \2-\3')
                        ELSE regexp_replace(phone, '[^0-9]', '', 'g')
                END AS phone
              , TRIM(UPPER(address1)) AS address1
              , address2
              , CONCAT(UPPER(LEFT(TRIM(city),1)), LOWER(SUBSTRING(TRIM(city), 2))) AS city -- Normalized city
              , state_province
              , postal_code
              , country
              , signup_ts
              , marketing_opt_in
              , loyalty_tier
              , source_channel

FROM
                dim_customers

Count


In [23]:
%%sql
SELECT      email
            , CASE
                WHEN
                      LOWER(email)
                LIKE
                      '%gmail.com'
                AND
                      LOWER(email)
                NOT LIKE
                      '%@gmail.com'
                THEN
                      REPLACE(LOWER(email), 'gmail.com', '@gmail.com')
                ELSE
                      LOWER(email)
              END AS email_cleaned

FROM
      dim_customers
WHERE email NOT LIKE '%@%'

email,email_cleaned
MICHAELMORI6378AOL.COM,michaelmori6378aol.com
imran_costayahoo.com,imran_costayahoo.com
sharma.neel5972aol.com,sharma.neel5972aol.com
khalil.singh8115yahoo.com,khalil.singh8115yahoo.com
nakamura.rohanhotmail.com,nakamura.rohanhotmail.com
lucia_kauraol.com,lucia_kauraol.com
hassanitogmail.com,hassanito@gmail.com
davis.danielyahoo.com,davis.danielyahoo.com
amariiyer5124gmail.com,amariiyer5124@gmail.com
yurilambertyahoo.com,yurilambertyahoo.com


---
Dedupe customers table
---

In [10]:
%%sql
WITH duplicates AS (SELECT
                    first_name
                  , last_name
                  , city
                  , COUNT(*) AS duplicate_count
FROM
                    dim_customers
GROUP BY
                    first_name
                  , last_name
                  , city
HAVING
                    count(*) > 1
)

SELECT
          dc.*
FROM
          dim_customers dc
  JOIN
          duplicates d ON
              dc.first_name = d.first_name AND
              dc.last_name = d.last_name AND
              dc.city = d.city
ORDER BY
          dc.first_name
        , dc.last_name
        , dc.city

customer_id,first_name,last_name,email,phone,address1,address2,city,state_province,postal_code,country,signup_ts,marketing_opt_in,loyalty_tier,source_channel
7155,Sebastian,Sanchez,sanchez.sebastian5071@gmail.com,(745) 741-2684,2961 Cedar Ln,,Albany,NY,122881,USA,2024-01-29 00:36:07,False,Gold,In-Store
9191,Sebastian,Sanchez,ssanchez2929@gmail.com,(688) 860-7293,7488 Elm Rd,,Albany,NY,15367,USA,2022-12-06 01:19:40,True,Silver,In-Store


---
Orders exploration
---

# Exploratory Data Analysis (EDA)

# Solution and Implementation

# Recommendations and Next Steps