In [None]:
### Dimension: Geography (Dim_Geo)
'''
This block extracts and validates geographic attributes used for analytical joins.

Purpose:
- Build a clean Geography dimension
- Preserve special characters (e.g., Eastern European states)
- Assign stable surrogate keys

Notes:
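- Encoding is critical for this dataset: state names contain non-ASCII characters that are corrupted if the file is decoded with the wrong codec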
- Validation is performed inline before saving
'''

In [None]:
import polars as pl
import os

# ------------------------------------------------------------------------------
# FILE SETUP
# ------------------------------------------------------------------------------
file_path = r"D:\Data Lake\Bronze\DataCo_Final_2M.csv"
folder_path = r"D:\Data Lake\testing"
os.makedirs(folder_path, exist_ok=True)

# ------------------------------------------------------------------------------
# 1. LOAD DATA WITH SAFE ENCODING
# ------------------------------------------------------------------------------
# Strict UTF-8 is attempted first and cp1252 is only the fallback. The order
# matters: cp1252 assigns a character to almost every byte value, so decoding a
# UTF-8 file as cp1252 "succeeds" while silently turning multi-byte characters
# into mojibake. Strict UTF-8 rejects input that is not valid UTF-8, which makes
# it a reliable first probe.
try:
    print("Attempting to read with utf-8 encoding...")
    df_main = pl.read_csv(file_path, encoding="utf8")
except Exception as e:
    # Any failure (including a missing file) triggers one retry; a problem that
    # is not encoding-related will simply be raised again by the second read.
    print(f"utf-8 failed ({e}). Falling back to cp1252.")
    df_main = pl.read_csv(file_path, encoding="cp1252")

# ------------------------------------------------------------------------------
# 2. EXTRACT GEOGRAPHIC ATTRIBUTES
# ------------------------------------------------------------------------------
geo_cols = ["order_state", "order_country", "order_region", "market"]

# Sort on *every* column of the dimension. `unique()` does not guarantee row
# order, so sorting on only a subset would leave rows that tie on that subset
# in arbitrary order and the surrogate keys assigned below would differ from
# run to run.
dim_geo = (
    df_main
    .select(geo_cols)
    .unique()
    .sort(geo_cols)
)

# ------------------------------------------------------------------------------
# 3. VALIDATION CHECK (ENCODING SANITY)
# ------------------------------------------------------------------------------
# Locate the probe rows via an ASCII fragment, then confirm the matched values
# are not garbled. Matching the ASCII fragment alone proves nothing, because a
# mis-decoded value still contains it.
check = dim_geo.filter(pl.col("order_state").str.contains("ilina"))

# Markers of a wrong decode: U+FFFD (replacement character), U+00C3 (lead
# character produced when UTF-8 Latin-1-range letters are read as cp1252), and
# U+00C5 U+00BD (what the UTF-8 bytes of U+017D become under cp1252).
MOJIBAKE_PATTERN = "\ufffd|\u00c3|\u00c5\u00bd"
garbled = check.filter(pl.col("order_state").str.contains(MOJIBAKE_PATTERN))

print("\nGeography encoding validation:")
if check.height == 0:
    print("Warning: expected values not found. Verify source file.")
elif garbled.height > 0:
    print("Warning: special characters appear corrupted. Verify file encoding:")
    print(garbled.head())
else:
    print("Special characters preserved correctly:")
    print(check.head())

# ------------------------------------------------------------------------------
# 4. ASSIGN SURROGATE KEYS
# ------------------------------------------------------------------------------
# Keys start at 1000 and follow the fully-sorted row order established above.
dim_geo = dim_geo.with_row_index(name="geo_id", offset=1000)
dim_geo = dim_geo.select(["geo_id"] + geo_cols)

# ------------------------------------------------------------------------------
# 5. EXPORT DIMENSION TABLE
# ------------------------------------------------------------------------------
output_filename = "dim_geo.parquet"
output_path = os.path.join(folder_path, output_filename)

print(f"Saving Geography dimension to: {output_path}")
dim_geo.write_parquet(output_path)

print("Geography dimension creation complete.")

In [None]:
### Dimension: Customer Geography (Dim_Customer_Geo)
'''
This block creates the Customer Geography dimension table used for analytical joins.

Purpose:
- Extract unique customer location combinations
- Standardize column naming
- Assign stable surrogate keys

Notes:
- This is a one-time setup step
- Dimension keys should not be regenerated after initial load
- Output is consumed by the Silver → SQL pipeline
'''

In [None]:
import polars as pl
import os

# ------------------------------------------------------------------------------
# FILE SETUP
# ------------------------------------------------------------------------------
# Source CSV (edit to match your environment) and the output file name
output_parquet = "Dim_Customer_Geo.parquet"
input_csv = r"D:\Data Lake\Bronze\DataCo_Final_2M.csv"

# ------------------------------------------------------------------------------
# 1. LOAD SOURCE DATA
# ------------------------------------------------------------------------------
# Read with the reader's default encoding
df_raw = pl.read_csv(input_csv)

# ------------------------------------------------------------------------------
# 2. STANDARDIZE COLUMN NAMES
# ------------------------------------------------------------------------------
# Trim, lowercase, then map spaces and parentheses to underscores
_SEPARATOR_TABLE = str.maketrans(" ()", "___")
df_raw.columns = [
    name.strip().lower().translate(_SEPARATOR_TABLE)
    for name in df_raw.columns
]

# ------------------------------------------------------------------------------
# 3. BUILD CUSTOMER GEOGRAPHY DIMENSION AND ASSIGN SURROGATE KEYS
# ------------------------------------------------------------------------------
target_cols = ["customer_state", "customer_country"]

# Distinct location pairs, sorted, numbered from 100, with the key placed first
dim_cust_geo = (
    df_raw
    .select(target_cols)
    .unique()
    .sort(target_cols)
    .with_row_index(name="customer_geo_id", offset=100)
    .select(["customer_geo_id", *target_cols])
)

# ------------------------------------------------------------------------------
# 4. EXPORT DIMENSION TABLE
# ------------------------------------------------------------------------------
# Report the row count and a preview before writing the parquet file
print(f"Generated {dim_cust_geo.height} unique customer locations.")
print(dim_cust_geo.head())

dim_cust_geo.write_parquet(output_parquet)
print(f"Saved Customer Geography dimension to: {output_parquet}")

In [None]:
### Dimension: Product (Dim_Product)
'''
This block creates the Product dimension table used for analytical joins.

Purpose:
- Extract unique product definitions
- Preserve product → category → department hierarchy
- Assign stable surrogate keys for fact table joins

Notes:
- This is a one-time setup step
- Product keys should remain stable once created
- Output is consumed by the Silver → SQL pipeline
'''

In [None]:
import polars as pl
import os

# ------------------------------------------------------------------------------
# FILE SETUP
# ------------------------------------------------------------------------------
# Source CSV and the output file name
output_parquet = "Dim_Product.parquet"
input_csv = r"D:\Data Lake\Bronze\DataCo_Final_2M.csv"

# ------------------------------------------------------------------------------
# 1. LOAD SOURCE DATA
# ------------------------------------------------------------------------------
# Read with the reader's default encoding
df_raw = pl.read_csv(input_csv)

# ------------------------------------------------------------------------------
# 2. STANDARDIZE COLUMN NAMES
# ------------------------------------------------------------------------------
# Trim, lowercase, then map spaces and parentheses to underscores
_SEPARATOR_TABLE = str.maketrans(" ()", "___")
df_raw.columns = [
    name.strip().lower().translate(_SEPARATOR_TABLE)
    for name in df_raw.columns
]

# ------------------------------------------------------------------------------
# 3. BUILD PRODUCT DIMENSION AND ASSIGN SURROGATE KEYS
# ------------------------------------------------------------------------------
# A product is identified by the combination of name, category and department
target_cols = ["product_name", "category_name", "department_name"]

# Distinct combinations, sorted, numbered from 100, with the key placed first
dim_product = (
    df_raw
    .select(target_cols)
    .unique()
    .sort(target_cols)
    .with_row_index(name="product_key", offset=100)
    .select(["product_key", *target_cols])
)

# ------------------------------------------------------------------------------
# 4. EXPORT DIMENSION TABLE
# ------------------------------------------------------------------------------
# Report the row count and a preview before writing the parquet file
print(f"Generated {dim_product.height} unique products.")
print(dim_product.head())

dim_product.write_parquet(output_parquet)
print(f"Saved Product dimension to: {output_parquet}")