## ESTUDIO DE COMO PRETENDE LLEGAR LA DATA

---
### DEPENDENCIAS
---

In [1]:
import sys
import subprocess
import os

def instalar_si_no(package, pip_name=None):
    try:
        __import__(package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package])

paquetes = {
    "pandas": "pandas",
    "pyodbc": "pyodbc",
    "pymssql": "pymssql",
    "sqlalchemy": "sqlalchemy",
    "dotenv": "python-dotenv"
}

for modulo, pip_name in paquetes.items():
    instalar_si_no(modulo, pip_name)
    
import pandas as pd
import pyodbc
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import urllib.parse
import re
from datetime import datetime

---
### Cargue de variables de entorno
---

In [3]:
load_dotenv(override=True)
server   = os.getenv("SPACEPARTS_SERVER")
database = os.getenv("SPACEPARTS_DATABASE")
username = os.getenv("SPACEPARTS_USERNAME")
password = os.getenv("SPACEPARTS_PASSWORD")

---
### Definición de funciones
---

In [4]:
def get_connection(database="master"):
    conn_str = (
        "DRIVER={ODBC Driver 18 for SQL Server};"
        f"SERVER={server};DATABASE={database};"
        f"UID={username};PWD={password};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )
    return pyodbc.connect(conn_str)

def list_databases():
    """Mapeamos las bases a las cuales la llamada nos va a pertimir acceder"""
    with get_connection("master") as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sys.databases;")
        return [row[0] for row in cursor.fetchall()]

def list_tables(database):
    """Lista todas las tablas en una base de datos"""
    with get_connection(database) as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT TABLE_SCHEMA, TABLE_NAME 
            FROM INFORMATION_SCHEMA.TABLES 
            WHERE TABLE_TYPE='BASE TABLE'
            ORDER BY TABLE_SCHEMA, TABLE_NAME;
        """)
        return [f"{row[0]}.{row[1]}" for row in cursor.fetchall()]

def sample_table(database, table_name, n=5):
    """Obtiene una muestra de n filas de una tabla"""
    # Nota: Separa esquema y tabla
    if "." in table_name:
        schema, table = table_name.split(".", 1)
    else:
        schema, table = "dbo", table_name
    
    full_name = f"[{schema}].[{table}]"
    query = f"SELECT TOP {n} * FROM {full_name};"
    odbc_str = (
        f"DRIVER={{ODBC Driver 18 for SQL Server}};"
        f"SERVER={server};DATABASE={database};"
        f"UID={username};PWD={password};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )
    conn_str = f"mssql+pyodbc:///?odbc_connect={urllib.parse.quote_plus(odbc_str)}"
    engine = create_engine(conn_str)
    
    try:
        df = pd.read_sql(query, engine)
        return df
    except Exception as e:
        print(f"No se pudo leer {full_name}: {e}")
        return None

def _engine(database: str):
    odbc = (
        f"DRIVER={{ODBC Driver 18 for SQL Server}};"
        f"SERVER={server};DATABASE={database};"
        f"UID={username};PWD={password};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )
    return create_engine(f"mssql+pyodbc:///?odbc_connect={urllib.parse.quote_plus(odbc)}")

def _q(database: str, sql: str):
    eng = _engine(database)
    with eng.connect() as conn:
        return pd.read_sql(text(sql), conn)

def profile_table(database: str, table_name: str, sample_n: int = 5):
    if "." in table_name:
        schema, table = table_name.split(".", 1)
    else:
        schema, table = "dbo", table_name
    obj = f"{schema}.{table}"
    full = f"[{schema}].[{table}]"

    metrics = _q(database, f"""
        SELECT SUM(p.rows) as row_count
        FROM sys.objects o
        JOIN sys.indexes i ON i.object_id = o.object_id
        JOIN sys.partitions p ON p.object_id = i.object_id AND p.index_id = i.index_id
        WHERE o.object_id = OBJECT_ID('{obj}') AND i.index_id IN (0,1);
    """)

    columns = _q(database, f"""
        SELECT c.column_id, c.name AS column_name, t.name AS sql_type,
               c.max_length, c.precision, c.scale, c.is_nullable
        FROM sys.columns c
        JOIN sys.types t ON t.user_type_id = c.user_type_id
        WHERE c.object_id = OBJECT_ID('{obj}')
        ORDER BY c.column_id;
    """)

    try:
        sample = _q(database, f"SELECT TOP {sample_n} * FROM {full};")
    except Exception as e:
        print(f"Muestra no disponible para {full}: {e}")
        sample = pd.DataFrame()

    display(pd.DataFrame([{
        "table_name": obj,
        "row_count": int(metrics.iloc[0]["row_count"]) if not metrics.empty else 0,
        "column_count": len(columns)
    }]))
    display(columns)
    if not sample.empty:
        display(sample)

def profile_many(database: str, tables: list[str], sample_n: int = 5):
    for t in tables:
        print(f"\n--- {t} ---")
        profile_table(database, t, sample_n=sample_n)
        

---
### Resultados de estudio origen de data
---

In [5]:
dbs = list_databases()
print("Database disponibles:", dbs)

Database disponibles: ['master', 'SpacePartsCoDW']


In [6]:
tables = list_tables("master")
tables

['dbo.sysdac_history_internal', 'dbo.sysdac_instances_internal']

### Nota: 
tablas internas del sistema que crea SQL Server (o Azure SQL) cuando se despliegan DAC packages (Data-tier Application Components).
- **sysdac_history_internal:** guarda metadatos de las operaciones de despliegue del DAC.
- **sysdac_instances_internal:** registra las instancias del DAC aplicadas en la base.

In [7]:
tables = list_tables("SpacePartsCoDW")
tables

['dim.Brands',
 'dim.Budget-Rate',
 'dim.Customers',
 'dim.Employees',
 'dim.Exchange-Rate',
 'dim.Invoice-DocType',
 'dim.Order-DocType',
 'dim.Order-Status',
 'dim.Products',
 'dim.Regions',
 'fact.Budget',
 'fact.Forecast',
 'fact.Invoices',
 'fact.Orders']

---
### Segun la pagina que documenta la información:
---

- Invoices: facturación + penalizaciones + devoluciones.
- Orders: pedidos de clientes + fechas de entrega.
- Budget: metas anuales por cliente/producto/mes.
- Forecast: revisiones mensuales del budget.

In [8]:
df_sample = sample_table("SpacePartsCoDW", "dim.Invoice-DocType", n=5)
df_sample

Unnamed: 0,Billing Document Type Code,Text,Doc. Type Ordinal,Group,Group Ordinal,DWCreatedDate
0,F2,Normal Invoice,1,Invoice,1,2023-02-10 14:52:08.297
1,L2,Express Order,2,Invoice,1,2023-02-10 14:52:08.297
2,YW,Warranty,3,Adjustment,2,2023-02-10 14:52:08.297
3,YR,Return,4,Adjustment,2,2023-02-10 14:52:08.297
4,Z2,Other,5,Other,3,2023-02-10 14:52:08.297


---
### Exploración de la data
---

In [9]:
dim_tables = [
 'dim.Brands',
 'dim.Budget-Rate',
 'dim.Customers',
 'dim.Employees',
 'dim.Exchange-Rate',
 'dim.Invoice-DocType',
 'dim.Order-DocType',
 'dim.Order-Status',
 'dim.Products',
 'dim.Regions'
]

profile_many("SpacePartsCoDW", dim_tables, sample_n=5)


--- dim.Brands ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Brands,20,7


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Flagship,nvarchar,100,0,0,True
1,2,Class,nvarchar,100,0,0,True
2,3,Type,nvarchar,100,0,0,True
3,4,Brand,nvarchar,100,0,0,True
4,5,Sub Brand,nvarchar,200,0,0,True
5,6,Product Brand VP,nvarchar,200,0,0,True
6,7,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Flagship,Class,Type,Brand,Sub Brand,Product Brand VP,DWCreatedDate
0,Other Brand,Private Brand,Value,ASAN,ASAN Terran Systems,Nicole Hande,2023-02-10 14:52:07.983
1,Other Brand,Private Brand,Standard,Bruis,Bruis,Siras Invictus,2023-02-10 14:52:07.983
2,Other Brand,Private Brand,Value,FixIt Co.,FixIt Co.,Carlos Mangold,2023-02-10 14:52:07.983
3,Growth Brand,SpaceParts Brand,Value,Galileo,Galileo Aeronautics,Karol Andersen,2023-02-10 14:52:07.983
4,Growth Brand,SpaceParts Brand,Value,GateRite,GateRite,Maurizio Prei,2023-02-10 14:52:07.983



--- dim.Budget-Rate ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Budget-Rate,15,5


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Rate,numeric,9,10,5,True
1,2,From Currency,nvarchar,20,0,0,True
2,3,To Currency,nvarchar,20,0,0,True
3,4,Currency System,nvarchar,100,0,0,True
4,5,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Rate,From Currency,To Currency,Currency System,DWCreatedDate
0,0.4211,ARC,EUR,Arcadia System,2023-02-10 14:52:08.250
1,0.6626,BELT,EUR,Sol System,2023-02-10 14:52:08.250
2,0.8208,BLO,EUR,Lakonía System,2023-02-10 14:52:08.250
3,0.81,BLT,EUR,Lakonía System,2023-02-10 14:52:08.250
4,1.5151,CAL,EUR,Îlos System,2023-02-10 14:52:08.250



--- dim.Customers ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Customers,3911,10


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Customer Key,nvarchar,200,0,0,True
1,2,Customer Sold-To Name,nvarchar,200,0,0,True
2,3,Account Name,nvarchar,200,0,0,True
3,4,Key Account Name,nvarchar,200,0,0,True
4,5,Transaction Type,nvarchar,200,0,0,True
5,6,Account Type,nvarchar,200,0,0,True
6,7,Key Account Manager,nvarchar,200,0,0,True
7,8,Account Manager,nvarchar,200,0,0,True
8,9,Station,nvarchar,200,0,0,True
9,10,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Customer Key,Customer Sold-To Name,Account Name,Key Account Name,Transaction Type,Account Type,Key Account Manager,Account Manager,Station,DWCreatedDate
0,10000101B,Perilous Acqusitions,Perilous Acqusitions,No Key Account,B2B,No Key Account,No Key Account Manager,Kristin Grifeo,Juliet Station,2023-02-10 14:52:09.250
1,10000102B,Andromeda Shipyards (Darnadus Station Branch),Andromeda Shipyards,Andromeda Shipyards,B2B,Key Account,Rosalind Franklin,Hector Lightbringer,Darnadus Station,2023-02-10 14:52:09.250
2,10000103B,Pioneering Systems (Phobos Station Branch),Pioneering Systems,Pioneering Systems,B2B,Key Account,Carmen San Diago,Susan Horák,Phobos Station,2023-02-10 14:52:09.250
3,10000104B,Miller Space (Mimas Branch),Miller Space,Miller Space,B2B,Key Account,Matthew Rocket,Ben Guy,Mimas,2023-02-10 14:52:09.250
4,10000105B,Pioneering Systems (Îlos IV Branch),Pioneering Systems,Pioneering Systems,B2B,Key Account,Carmen San Diago,Balthazaar Viscari,Îlos IV,2023-02-10 14:52:09.250



--- dim.Employees ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Employees,893,5


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Role,nvarchar,300,0,0,True
1,2,Employee Name,nvarchar,300,0,0,True
2,3,Employee Email,nvarchar,300,0,0,True
3,4,Data Security Rule,nvarchar,300,0,0,True
4,5,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Role,Employee Name,Employee Email,Data Security Rule,DWCreatedDate
0,Business Line Leader,Aaneta Gibson,aaneta.gibson@spaceparts.co,"Sales for all accounts, cost & margin only for...",2023-02-10 14:52:07.963
1,Business Line Leader,Aaron Rogacz,aaron.rogacz@spaceparts.co,"Sales for all accounts, cost & margin only for...",2023-02-10 14:52:07.963
2,Station Sales Managers,Abilio Decker,abilio.decker@spaceparts.co,All accounts in Stations for which they are re...,2023-02-10 14:52:07.963
3,Account Manager,Adam Alexander,adam.alexander@spaceparts.co,Sales for all stations in which they have a re...,2023-02-10 14:52:07.963
4,Account Manager,Adam Medina,adam.medina@spaceparts.co,Sales for all stations in which they have a re...,2023-02-10 14:52:07.963



--- dim.Exchange-Rate ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Exchange-Rate,57900,9


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Rate Type,nvarchar,60,0,0,True
1,2,From Currency,nvarchar,20,0,0,True
2,3,To Currency,nvarchar,20,0,0,True
3,4,Currency System,nvarchar,100,0,0,True
4,5,Rate,numeric,9,10,5,True
5,6,Date,datetime,8,23,3,True
6,7,Month,nvarchar,40,0,0,True
7,8,Exchange Rate Composite Key,nvarchar,200,0,0,True
8,9,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Rate Type,From Currency,To Currency,Currency System,Rate,Date,Month,Exchange Rate Composite Key,DWCreatedDate
0,Monthly Rate,ILOS,EUR,Îlos System,0.8918,2018-01-12,Jan 18,Monthly Rate-AMN-1/12/2018,2023-02-10 14:52:09.007
1,Budget Rate,LAK,EUR,Lakonía System,0.9211,2018-01-12,Jan 18,Budget Rate-EAS-1/12/2018,2023-02-10 14:52:09.007
2,Monthly Rate,LAK,EUR,Lakonía System,0.9218,2018-01-12,Jan 18,Monthly Rate-EAS-1/12/2018,2023-02-10 14:52:09.007
3,Budget Rate,MCR,EUR,Sol System,1.7991,2018-01-12,Jan 18,Budget Rate-DWA-1/12/2018,2023-02-10 14:52:09.007
4,Monthly Rate,MCR,EUR,Sol System,1.836,2018-01-12,Jan 18,Monthly Rate-DWA-1/12/2018,2023-02-10 14:52:09.007



--- dim.Invoice-DocType ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Invoice-DocType,5,6


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Billing Document Type Code,nvarchar,20,0,0,True
1,2,Text,nvarchar,200,0,0,True
2,3,Doc. Type Ordinal,int,4,10,0,True
3,4,Group,nvarchar,200,0,0,True
4,5,Group Ordinal,int,4,10,0,True
5,6,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Billing Document Type Code,Text,Doc. Type Ordinal,Group,Group Ordinal,DWCreatedDate
0,F2,Normal Invoice,1,Invoice,1,2023-02-10 14:52:08.297
1,L2,Express Order,2,Invoice,1,2023-02-10 14:52:08.297
2,YW,Warranty,3,Adjustment,2,2023-02-10 14:52:08.297
3,YR,Return,4,Adjustment,2,2023-02-10 14:52:08.297
4,Z2,Other,5,Other,3,2023-02-10 14:52:08.297



--- dim.Order-DocType ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Order-DocType,4,6


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Sales Order Document Type Code,nvarchar,20,0,0,True
1,2,Text,nvarchar,200,0,0,True
2,3,Doc. Type Ordinal,int,4,10,0,True
3,4,Group,nvarchar,200,0,0,True
4,5,Group Ordinal,int,4,10,0,True
5,6,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Sales Order Document Type Code,Text,Doc. Type Ordinal,Group,Group Ordinal,DWCreatedDate
0,GLIT,Goods Lost in Transit,1,Cancellation,2,2023-02-10 14:52:07.390
1,YLS,Cancelled Order,2,Cancellation,2,2023-02-10 14:52:07.390
2,YLF,Express Order,3,Order,1,2023-02-10 14:52:07.390
3,YOR,Standard Order,4,Order,1,2023-02-10 14:52:07.390



--- dim.Order-Status ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Order-Status,6,6


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Order Status Code,nvarchar,100,0,0,True
1,2,Order Status Text,nvarchar,200,0,0,True
2,3,Order Status Ordinal,int,4,10,0,True
3,4,Order Status Group,nvarchar,100,0,0,True
4,5,Order Status Grouping Ordinal,int,4,10,0,True
5,6,DWCreatedDate,datetime,8,23,3,False


Unnamed: 0,Order Status Code,Order Status Text,Order Status Ordinal,Order Status Group,Order Status Grouping Ordinal,DWCreatedDate
0,Cancelled,Cancelled Order,1,Cancelled,1,2023-02-10 14:52:08.540
1,Late Open Order,Late Order,2,Open,2,2023-02-10 14:52:08.540
2,Open Delivery,WIP Order,3,Open,2,2023-02-10 14:52:08.540
3,Future Open Order,Future Order,4,Open,2,2023-02-10 14:52:08.540
4,Overdue,Overdue Order,5,Overdue,5,2023-02-10 14:52:08.540



--- dim.Products ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Products,256293,18


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,Sub Brand Name,nvarchar,300,0,0,True
1,2,Ship Class for Part,nvarchar,300,0,0,True
2,3,Product Name,nvarchar,300,0,0,True
3,4,Product Business Line Leader,nvarchar,300,0,0,True
4,5,Part Fit Grading,nvarchar,300,0,0,True
5,6,Product Key,int,4,10,0,True
6,7,Subtype,nvarchar,300,0,0,True
7,8,Type,nvarchar,300,0,0,True
8,9,Weight (Tonnes),numeric,9,10,5,True
9,10,Maximum Temperature (K),int,4,10,0,True


Unnamed: 0,Sub Brand Name,Ship Class for Part,Product Name,Product Business Line Leader,Part Fit Grading,Product Key,Subtype,Type,Weight (Tonnes),Maximum Temperature (K),Velocity Tolerance (Meters / Second),Tolerance (g),MK,Color,Production Series,Nameplate,Material,DWCreatedDate
0,Krakatoa,Civilian Transports,Liquid Fuel Fuselage,Shannon Trujillo,Original Equipment,1000101,Liquid Fuel Tanks & Adapters,Fuel & Piping,0.378,2058,7,51,MK1,Blue,SERIES NW-86,Frey and Sons Inc,SERIES NW-86 Liquid Fuel Fuselage MK1 (Blue),2023-02-10 14:52:07.873
1,Plasma Solutions,Civilian Transports,Light Nose Plating,Britt Martin,Original Equipment,1000102,Light Armor Plating,Hull & Shields,0.0751,2257,6,58,MK1,Teal,SERIES YU-51,Ellis-Johnson PLC,SERIES YU-51 Light Nose Plating MK1 (Teal),2023-02-10 14:52:07.873
2,GateRite,Civilian Transports,Cubic Octagonal Strut,Luigi Rudolf,Original Equipment,1000103,"Beams, Panels & Radial",Structural,0.482,2077,16,54,MK2,Gray,SERIES XC-16,"Cummings, Valdez and Blair Inc",SERIES XC-16 Cubic Octagonal Strut MK2 (Gray),2023-02-10 14:52:07.873
3,GateRite,Civilian Transports,Probe-Bodyne Hecs,Heather Kosak,Original Equipment,1000104,Cockpit,Modules,0.311,1237,18,47,MK2,Navy,SERIES IV-06,"Pope, Bass and Long PLC",SERIES IV-06 Probe-Bodyne Hecs MK2 (Navy),2023-02-10 14:52:07.873
4,Plasma Solutions,Civilian Transports,Thick Fuselage Plating,Britt Martin,Original Equipment,1000105,Thick Armor Plating,Hull & Shields,0.7226,1855,16,51,MK2,Green,SERIES KH-05,"Pope, Bass and Long PLC",SERIES KH-05 Thick Fuselage Plating MK2 (Green),2023-02-10 14:52:07.873



--- dim.Regions ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Regions,181,12


Unnamed: 0,column_id,column_name,sql_type,max_length,precision,scale,is_nullable
0,1,All,nvarchar,200,0,0,True
1,2,System,nvarchar,200,0,0,True
2,3,Interplanetary Region,nvarchar,200,0,0,True
3,4,Territory,nvarchar,200,0,0,True
4,5,Station,nvarchar,200,0,0,True
5,6,Station Type,nvarchar,200,0,0,True
6,7,Tax Rate,numeric,9,10,5,True
7,8,System Sales Directors,nvarchar,200,0,0,True
8,9,Station Sales Managers,nvarchar,200,0,0,True
9,10,System Regional Managers,nvarchar,200,0,0,True


Unnamed: 0,All,System,Interplanetary Region,Territory,Station,Station Type,Tax Rate,System Sales Directors,Station Sales Managers,System Regional Managers,Territory Directors,DWCreatedDate
0,All,Îlos System,Îlos VI-VII Region,Îlos VII,Affligem Station,Station,0.02,Victor Canales,Nicoletta Burcardo,Nicholas Basa,Patrick Larson,2023-02-10 14:52:09.130
1,All,Transit Gate,Transit Gate Region,Arcadia Gate,AG Refueling & Supply Station,Station,0.0,Garrett Novotná,Fermín Serraglio,Amy Blankenship,Michael Huerta,2023-02-10 14:52:09.130
2,All,Îlos System,Îlos VI-VII Region,Îlos VII,Andromedus Station,Station,0.02,Victor Canales,Madeleine Valdez,Nicholas Basa,Patrick Larson,2023-02-10 14:52:09.130
3,All,Sol System,Sol V Region,Io,Ao Substation,Station,0.04,Martine Roś,Shade Tabaxii,Alyssa Ariño,Agnès Poirier,2023-02-10 14:52:09.130
4,All,Îlos System,Îlos IV Region,Îlos IV,Apollo Station,Station,0.2,Victor Canales,Felisa Long,Christopher Jędraszczyk,Evita Calgari,2023-02-10 14:52:09.130


---
### Limpieza y estandarización.
---

In [18]:
DB = "SpacePartsCoDW"

def _engine(database: str):
    odbc = (
        f"DRIVER={{ODBC Driver 18 for SQL Server}};"
        f"SERVER={server};DATABASE={database};"
        f"UID={username};PWD={password};"
        "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
    )
    return create_engine(f"mssql+pyodbc:///?odbc_connect={urllib.parse.quote_plus(odbc)}")

def read_dim(table: str, top: int|None=None) -> pd.DataFrame:
    """Lee dim.<table> de SpacePartsCoDW; usa TOP si lo pasas."""
    eng = _engine(DB)
    full = f"[dim].[{table}]"
    sql = f"SELECT {'TOP '+str(top) if top else ''} * FROM {full};"
    with eng.connect() as conn:
        return pd.read_sql(text(sql), conn)


dim_brands         = read_dim("Brands")
dim_budget_rate    = read_dim("Budget-Rate")
dim_customers      = read_dim("Customers")
dim_employees      = read_dim("Employees")
dim_exchange_rate  = read_dim("Exchange-Rate")
dim_invoice_doctype= read_dim("Invoice-DocType")
dim_order_doctype  = read_dim("Order-DocType")
dim_order_status   = read_dim("Order-Status")
dim_products       = read_dim("Products")     
dim_regions        = read_dim("Regions")


In [19]:
# Normaliza nombres de columnas → snake_case sin espacios/acentos/guiones
def std_colname(s: str) -> str:
    s = s.strip()
    s = re.sub(r"[^\w]+", "_", s, flags=re.UNICODE)   # espacios, guiones, etc → _
    s = re.sub(r"__+", "_", s)
    return s.strip("_").lower()

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = [std_colname(c) for c in df.columns]
    return out

# Limpia strings (trim, colapsa espacios), opcional upper/lower
def clean_string_series(s: pd.Series, to="asis") -> pd.Series:
    s = s.astype("string")
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    if to == "upper": s = s.str.upper()
    if to == "lower": s = s.str.lower()
    return s

# Casteo robusto por tipo
def to_datetime_safe(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce", utc=False)

def to_int_safe(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce").astype("Int64")

def to_float_safe(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce").astype("Float64")

# Inferencia heurística por nombre de columna
def infer_cast(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        cl = c.lower()
        # fechas
        if any(k in cl for k in ["date", "dt", "created"]):
            out[c] = to_datetime_safe(out[c])
            continue
        # claves numéricas típicas
        if cl.endswith("_key") or cl in {"product_key","customer_key","brand_key","region_key","order_status_ordinal","doc_type_ordinal"}:
            out[c] = to_int_safe(out[c])
            continue
        # cantidades/valores/rates
        if any(k in cl for k in ["rate","value","amount","quantity","volume","tax","penalt","weight","temperature","tolerance"]):
            out[c] = to_float_safe(out[c])
            continue
        # emails/monedas/textos → string limpiado
        if any(k in cl for k in ["email","currency","name","text","group","class","type","station","system","color","material","series","mk","role","manager"]):
            out[c] = clean_string_series(out[c], to="asis")
            continue
        # por defecto: si es object, limpiar
        if out[c].dtype == "object":
            out[c] = clean_string_series(out[c], to="asis")
    return out

# Reglas específicas por tabla/columna (sobrescriben la inferencia)
# usa nombres de tabla en snake_case (p. ej., 'dim_brands')
TYPE_HINTS = {
    "dim_brands": {
        "dwcreateddate": "datetime",
    },
    "dim_budget_rate": {
        "rate": "float",
        "from_currency": "upper",
        "to_currency": "upper",
        "dwcreateddate": "datetime",
    },
    "dim_customers": {
        "customer_key": "string",
        "station": "string",
        "dwcreateddate": "datetime",
    },
    "dim_employees": {
        "employee_email": "lower",
        "dwcreateddate": "datetime",
    },
    "dim_exchange_rate": {
        "rate": "float",
        "date": "datetime",
        "from_currency": "upper",
        "to_currency": "upper",
        "dwcreateddate": "datetime",
    },
    "dim_invoice_doctype": {
        "doc_type_ordinal": "int",
        "group_ordinal": "int",
        "dwcreateddate": "datetime",
    },
    "dim_order_doctype": {
        "doc_type_ordinal": "int",
        "group_ordinal": "int",
        "dwcreateddate": "datetime",
    },
    "dim_order_status": {
        "order_status_ordinal": "int",
        "order_status_grouping_ordinal": "int",
        "dwcreateddate": "datetime",
    },
    "dim_products": {
        "product_key": "int",
        "weight_tonnes": "float",
        "maximum_temperature_k": "int",
        "velocity_tolerance_meters_second": "int",
        "tolerance_g": "int",
        "dwcreateddate": "datetime",
    },
    "dim_regions": {
        "tax_rate": "float",
        "dwcreateddate": "datetime",
    },
}

def apply_type_hints(df: pd.DataFrame, table_name_snake: str) -> pd.DataFrame:
    out = df.copy()
    hints = TYPE_HINTS.get(table_name_snake, {})
    for col, kind in hints.items():
        if col not in out.columns: 
            continue
        if kind == "datetime":
            out[col] = to_datetime_safe(out[col])
        elif kind == "int":
            out[col] = to_int_safe(out[col])
        elif kind == "float":
            out[col] = to_float_safe(out[col])
        elif kind == "upper":
            out[col] = clean_string_series(out[col], to="upper")
        elif kind == "lower":
            out[col] = clean_string_series(out[col], to="lower")
        elif kind == "string":
            out[col] = clean_string_series(out[col], to="asis")
    return out


In [20]:
def profile_df(df: pd.DataFrame, table_label: str, sample_n: int = 5):
    # resumen general
    summary = pd.DataFrame([{
        "table_name": table_label,
        "row_count": len(df),
        "column_count": df.shape[1]
    }])

    # métricas por columna
    cols = []
    for c in df.columns:
        s = df[c]
        dtype = str(s.dtype)
        n_null = int(s.isna().sum())
        n_dist = int(s.nunique(dropna=True))
        ex = s.dropna().head(3).tolist()
        item = {
            "column": c,
            "dtype": dtype,
            "nulls": n_null,
            "null_pct": round(100 * n_null / max(len(s),1), 2),
            "distinct": n_dist,
            "example_values": ex
        }
        if pd.api.types.is_numeric_dtype(s):
            item.update({
                "min": s.min(),
                "max": s.max(),
                "mean": s.mean()
            })
        elif pd.api.types.is_datetime64_any_dtype(s):
            item.update({
                "min": s.min(),
                "max": s.max()
            })
        else:
            # longitud media para strings
            try:
                item.update({"avg_len": float(s.astype("string").str.len().mean())})
            except Exception:
                item.update({"avg_len": None})
        cols.append(item)

    columns_profile = pd.DataFrame(cols)
    display(summary)
    display(columns_profile.sort_values(["dtype","column"]))
    display(df.head(sample_n))

    return summary, columns_profile


In [21]:
# (1) Estandariza nombres, (2) inferencia, (3) hints específicos, (4) perfil
def clean_and_profile(df: pd.DataFrame, label: str):
    t_snake = std_colname(label)
    d0 = standardize_columns(df)
    d1 = infer_cast(d0)
    d2 = apply_type_hints(d1, t_snake)
    print(f"\n--- {label} ---")
    return profile_df(d2, label, sample_n=5)

dim_brands_sum, dim_brands_cols = clean_and_profile(dim_brands, "dim.Brands")
dim_budget_rate_sum, dim_budget_rate_cols = clean_and_profile(dim_budget_rate, "dim.Budget-Rate")
dim_customers_sum, dim_customers_cols = clean_and_profile(dim_customers, "dim.Customers")
dim_employees_sum, dim_employees_cols = clean_and_profile(dim_employees, "dim.Employees")
dim_exchange_rate_sum, dim_exchange_rate_cols = clean_and_profile(dim_exchange_rate, "dim.Exchange-Rate")
dim_invoice_dtype_sum, dim_invoice_dtype_cols = clean_and_profile(dim_invoice_doctype, "dim.Invoice-DocType")
dim_order_dtype_sum, dim_order_dtype_cols = clean_and_profile(dim_order_doctype, "dim.Order-DocType")
dim_order_status_sum, dim_order_status_cols = clean_and_profile(dim_order_status, "dim.Order-Status")
dim_products_sum, dim_products_cols = clean_and_profile(dim_products, "dim.Products")
dim_regions_sum, dim_regions_cols = clean_and_profile(dim_regions, "dim.Regions")



--- dim.Brands ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Brands,20,7


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max
6,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:07.983000, 2023-02-10 14:52:...",,2023-02-10 14:52:07.983,2023-02-10 14:52:07.983
3,brand,string,0,0.0,14,"[ASAN, Bruis, FixIt Co.]",6.55,NaT,NaT
1,class,string,0,0.0,2,"[Private Brand, Private Brand, Private Brand]",14.8,NaT,NaT
0,flagship,string,0,0.0,3,"[Other Brand, Other Brand, Other Brand]",12.5,NaT,NaT
5,product_brand_vp,string,0,0.0,14,"[Nicole Hande, Siras Invictus, Carlos Mangold]",12.9,NaT,NaT
4,sub_brand,string,0,0.0,20,"[ASAN Terran Systems, Bruis, FixIt Co.]",10.95,NaT,NaT
2,type,string,0,0.0,4,"[Value, Standard, Value]",6.65,NaT,NaT


Unnamed: 0,flagship,class,type,brand,sub_brand,product_brand_vp,dwcreateddate
0,Other Brand,Private Brand,Value,ASAN,ASAN Terran Systems,Nicole Hande,2023-02-10 14:52:07.983
1,Other Brand,Private Brand,Standard,Bruis,Bruis,Siras Invictus,2023-02-10 14:52:07.983
2,Other Brand,Private Brand,Value,FixIt Co.,FixIt Co.,Carlos Mangold,2023-02-10 14:52:07.983
3,Growth Brand,SpaceParts Brand,Value,Galileo,Galileo Aeronautics,Karol Andersen,2023-02-10 14:52:07.983
4,Growth Brand,SpaceParts Brand,Value,GateRite,GateRite,Maurizio Prei,2023-02-10 14:52:07.983



--- dim.Budget-Rate ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Budget-Rate,15,5


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,min,max,mean,avg_len
0,rate,Float64,0,0.0,15,"[0.4211, 0.6626, 0.8208]",0.4169,3.12,1.117807,
4,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:08.250000, 2023-02-10 14:52:...",2023-02-10 14:52:08.250000,2023-02-10 14:52:08.250000,,
3,currency_system,string,0,0.0,6,"[Arcadia System, Sol System, Lakonía System]",,,,11.8
1,from_currency,string,0,0.0,15,"[ARC, BELT, BLO]",,,,3.4
2,to_currency,string,0,0.0,1,"[EUR, EUR, EUR]",,,,3.0


Unnamed: 0,rate,from_currency,to_currency,currency_system,dwcreateddate
0,0.4211,ARC,EUR,Arcadia System,2023-02-10 14:52:08.250
1,0.6626,BELT,EUR,Sol System,2023-02-10 14:52:08.250
2,0.8208,BLO,EUR,Lakonía System,2023-02-10 14:52:08.250
3,0.81,BLT,EUR,Lakonía System,2023-02-10 14:52:08.250
4,1.5151,CAL,EUR,Îlos System,2023-02-10 14:52:08.250



--- dim.Customers ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Customers,3911,10


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max
9,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:09.250000, 2023-02-10 14:52:...",,2023-02-10 14:52:09.250,2023-02-10 14:52:09.250
7,account_manager,string,0,0.0,604,"[Kristin Grifeo, Hector Lightbringer, Susan Ho...",14.115827,NaT,NaT
2,account_name,string,0,0.0,1978,"[Perilous Acqusitions, Andromeda Shipyards, Pi...",18.543595,NaT,NaT
5,account_type,string,0,0.0,3,"[No Key Account, Key Account, Key Account]",12.35055,NaT,NaT
0,customer_key,string,3911,100.0,0,[],,NaT,NaT
1,customer_sold_to_name,string,0,0.0,3136,"[Perilous Acqusitions, Andromeda Shipyards (Da...",30.007926,NaT,NaT
6,key_account_manager,string,0,0.0,14,"[No Key Account Manager, Rosalind Franklin, Ca...",16.975454,NaT,NaT
3,key_account_name,string,0,0.0,14,"[No Key Account, Andromeda Shipyards, Pioneeri...",16.392994,NaT,NaT
8,station,string,0,0.0,179,"[Juliet Station, Darnadus Station, Phobos Stat...",14.117106,NaT,NaT
4,transaction_type,string,0,0.0,3,"[B2B, B2B, B2B]",3.0,NaT,NaT


Unnamed: 0,customer_key,customer_sold_to_name,account_name,key_account_name,transaction_type,account_type,key_account_manager,account_manager,station,dwcreateddate
0,,Perilous Acqusitions,Perilous Acqusitions,No Key Account,B2B,No Key Account,No Key Account Manager,Kristin Grifeo,Juliet Station,2023-02-10 14:52:09.250
1,,Andromeda Shipyards (Darnadus Station Branch),Andromeda Shipyards,Andromeda Shipyards,B2B,Key Account,Rosalind Franklin,Hector Lightbringer,Darnadus Station,2023-02-10 14:52:09.250
2,,Pioneering Systems (Phobos Station Branch),Pioneering Systems,Pioneering Systems,B2B,Key Account,Carmen San Diago,Susan Horák,Phobos Station,2023-02-10 14:52:09.250
3,,Miller Space (Mimas Branch),Miller Space,Miller Space,B2B,Key Account,Matthew Rocket,Ben Guy,Mimas,2023-02-10 14:52:09.250
4,,Pioneering Systems (Îlos IV Branch),Pioneering Systems,Pioneering Systems,B2B,Key Account,Carmen San Diago,Balthazaar Viscari,Îlos IV,2023-02-10 14:52:09.250



--- dim.Employees ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Employees,893,5


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max
4,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:07.963000, 2023-02-10 14:52:...",,2023-02-10 14:52:07.963,2023-02-10 14:52:07.963
3,data_security_rule,string,0,0.0,6,"[Sales for all accounts, cost & margin only fo...",99.287794,NaT,NaT
2,employee_email,string,0,0.0,867,"[aaneta.gibson@spaceparts.co, aaron.rogacz@spa...",27.293393,NaT,NaT
1,employee_name,string,0,0.0,867,"[Aaneta Gibson, Aaron Rogacz, Abilio Decker]",13.431131,NaT,NaT
0,role,string,0,0.0,8,"[Business Line Leader, Business Line Leader, S...",16.899216,NaT,NaT


Unnamed: 0,role,employee_name,employee_email,data_security_rule,dwcreateddate
0,Business Line Leader,Aaneta Gibson,aaneta.gibson@spaceparts.co,"Sales for all accounts, cost & margin only for...",2023-02-10 14:52:07.963
1,Business Line Leader,Aaron Rogacz,aaron.rogacz@spaceparts.co,"Sales for all accounts, cost & margin only for...",2023-02-10 14:52:07.963
2,Station Sales Managers,Abilio Decker,abilio.decker@spaceparts.co,All accounts in Stations for which they are re...,2023-02-10 14:52:07.963
3,Account Manager,Adam Alexander,adam.alexander@spaceparts.co,Sales for all stations in which they have a re...,2023-02-10 14:52:07.963
4,Account Manager,Adam Medina,adam.medina@spaceparts.co,Sales for all stations in which they have a re...,2023-02-10 14:52:07.963



--- dim.Exchange-Rate ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Exchange-Rate,57900,9


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,min,max,mean,avg_len
4,rate,Float64,0,0.0,774,"[0.8918, 0.9211, 0.9218]",0.3808,3.3874,1.121654,
0,rate_type,Float64,57900,100.0,0,[],,,,
7,exchange_rate_composite_key,Int64,57900,100.0,0,[],,,,
5,date,datetime64[ns],0,0.0,1930,"[2018-01-12 00:00:00, 2018-01-12 00:00:00, 201...",2018-01-01 00:00:00,2023-04-14 00:00:00,,
8,dwcreateddate,datetime64[ns],0,0.0,2,"[2023-02-10 14:52:09.007000, 2023-02-10 14:52:...",2023-02-10 14:52:09.007000,2023-02-10 14:52:09.243000,,
3,currency_system,string,0,0.0,6,"[Îlos System, Lakonía System, Lakonía System]",,,,11.8
1,from_currency,string,0,0.0,15,"[ILOS, LAK, LAK]",,,,3.4
6,month,string,0,0.0,64,"[Jan 18, Jan 18, Jan 18]",,,,6.0
2,to_currency,string,0,0.0,1,"[EUR, EUR, EUR]",,,,3.0


Unnamed: 0,rate_type,from_currency,to_currency,currency_system,rate,date,month,exchange_rate_composite_key,dwcreateddate
0,,ILOS,EUR,Îlos System,0.8918,2018-01-12,Jan 18,,2023-02-10 14:52:09.007
1,,LAK,EUR,Lakonía System,0.9211,2018-01-12,Jan 18,,2023-02-10 14:52:09.007
2,,LAK,EUR,Lakonía System,0.9218,2018-01-12,Jan 18,,2023-02-10 14:52:09.007
3,,MCR,EUR,Sol System,1.7991,2018-01-12,Jan 18,,2023-02-10 14:52:09.007
4,,MCR,EUR,Sol System,1.836,2018-01-12,Jan 18,,2023-02-10 14:52:09.007



--- dim.Invoice-DocType ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Invoice-DocType,5,6


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
2,doc_type_ordinal,Int64,0,0.0,5,"[1, 2, 3]",,1,5,3.0
4,group_ordinal,Int64,0,0.0,3,"[1, 1, 2]",,1,3,1.8
5,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:08.297000, 2023-02-10 14:52:...",,2023-02-10 14:52:08.297000,2023-02-10 14:52:08.297000,
0,billing_document_type_code,string,0,0.0,5,"[F2, L2, YW]",2.0,,,
3,group,string,0,0.0,3,"[Invoice, Invoice, Adjustment]",7.8,,,
1,text,string,0,0.0,5,"[Normal Invoice, Express Order, Warranty]",9.2,,,


Unnamed: 0,billing_document_type_code,text,doc_type_ordinal,group,group_ordinal,dwcreateddate
0,F2,Normal Invoice,1,Invoice,1,2023-02-10 14:52:08.297
1,L2,Express Order,2,Invoice,1,2023-02-10 14:52:08.297
2,YW,Warranty,3,Adjustment,2,2023-02-10 14:52:08.297
3,YR,Return,4,Adjustment,2,2023-02-10 14:52:08.297
4,Z2,Other,5,Other,3,2023-02-10 14:52:08.297



--- dim.Order-DocType ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Order-DocType,4,6


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
2,doc_type_ordinal,Int64,0,0.0,4,"[1, 2, 3]",,1,4,2.5
4,group_ordinal,Int64,0,0.0,2,"[2, 2, 1]",,1,2,1.5
5,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:07.390000, 2023-02-10 14:52:...",,2023-02-10 14:52:07.390000,2023-02-10 14:52:07.390000,
3,group,string,0,0.0,2,"[Cancellation, Cancellation, Order]",8.5,,,
0,sales_order_document_type_code,string,0,0.0,4,"[GLIT, YLS, YLF]",3.25,,,
1,text,string,0,0.0,4,"[Goods Lost in Transit, Cancelled Order, Expre...",15.75,,,


Unnamed: 0,sales_order_document_type_code,text,doc_type_ordinal,group,group_ordinal,dwcreateddate
0,GLIT,Goods Lost in Transit,1,Cancellation,2,2023-02-10 14:52:07.390
1,YLS,Cancelled Order,2,Cancellation,2,2023-02-10 14:52:07.390
2,YLF,Express Order,3,Order,1,2023-02-10 14:52:07.390
3,YOR,Standard Order,4,Order,1,2023-02-10 14:52:07.390



--- dim.Order-Status ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Order-Status,6,6


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
4,order_status_grouping_ordinal,Int64,0,0.0,4,"[1, 2, 2]",,1,6,3.0
2,order_status_ordinal,Int64,0,0.0,6,"[1, 2, 3]",,1,6,3.5
5,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:08.540000, 2023-02-10 14:52:...",,2023-02-10 14:52:08.540000,2023-02-10 14:52:08.540000,
0,order_status_code,string,0,0.0,6,"[Cancelled, Late Open Order, Open Delivery]",11.5,,,
3,order_status_group,string,0,0.0,4,"[Cancelled, Open, Open]",6.0,,,
1,order_status_text,string,0,0.0,6,"[Cancelled Order, Late Order, WIP Order]",12.166667,,,


Unnamed: 0,order_status_code,order_status_text,order_status_ordinal,order_status_group,order_status_grouping_ordinal,dwcreateddate
0,Cancelled,Cancelled Order,1,Cancelled,1,2023-02-10 14:52:08.540
1,Late Open Order,Late Order,2,Open,2,2023-02-10 14:52:08.540
2,Open Delivery,WIP Order,3,Open,2,2023-02-10 14:52:08.540
3,Future Open Order,Future Order,4,Open,2,2023-02-10 14:52:08.540
4,Overdue,Overdue Order,5,Overdue,5,2023-02-10 14:52:08.540



--- dim.Products ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Products,256293,18


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
8,weight_tonnes,Float64,0,0.0,14364,"[0.378, 0.0751, 0.482]",,0.0,42.998,0.7891469
9,maximum_temperature_k,Int64,0,0.0,2295,"[2058, 2257, 2077]",,903,4099,2039.388
5,product_key,Int64,0,0.0,256293,"[1000101, 1000102, 1000103]",,1000101,1256393,1128247.0
11,tolerance_g,Int64,0,0.0,49,"[51, 58, 54]",,24,157,51.02957
10,velocity_tolerance_meters_second,Int64,0,0.0,92,"[7, 6, 16]",,0,131,18.06905
17,dwcreateddate,datetime64[ns],0,0.0,15,"[2023-02-10 14:52:07.873000, 2023-02-10 14:52:...",,2023-02-10 14:52:07.873000,2023-02-10 14:52:16.610000,
13,color,string,0,0.0,12,"[Blue, Teal, Gray]",4.608776,,,
16,material,string,0,0.0,256045,"[SERIES NW-86 Liquid Fuel Fuselage MK1 (Blue),...",50.474906,,,
12,mk,string,0,0.0,3,"[MK1, MK1, MK2]",3.0,,,
15,nameplate,string,0,0.0,23,"[Frey and Sons Inc, Ellis-Johnson PLC, Cumming...",21.606255,,,


Unnamed: 0,sub_brand_name,ship_class_for_part,product_name,product_business_line_leader,part_fit_grading,product_key,subtype,type,weight_tonnes,maximum_temperature_k,velocity_tolerance_meters_second,tolerance_g,mk,color,production_series,nameplate,material,dwcreateddate
0,Krakatoa,Civilian Transports,Liquid Fuel Fuselage,Shannon Trujillo,Original Equipment,1000101,Liquid Fuel Tanks & Adapters,Fuel & Piping,0.378,2058,7,51,MK1,Blue,SERIES NW-86,Frey and Sons Inc,SERIES NW-86 Liquid Fuel Fuselage MK1 (Blue),2023-02-10 14:52:07.873
1,Plasma Solutions,Civilian Transports,Light Nose Plating,Britt Martin,Original Equipment,1000102,Light Armor Plating,Hull & Shields,0.0751,2257,6,58,MK1,Teal,SERIES YU-51,Ellis-Johnson PLC,SERIES YU-51 Light Nose Plating MK1 (Teal),2023-02-10 14:52:07.873
2,GateRite,Civilian Transports,Cubic Octagonal Strut,Luigi Rudolf,Original Equipment,1000103,"Beams, Panels & Radial",Structural,0.482,2077,16,54,MK2,Gray,SERIES XC-16,"Cummings, Valdez and Blair Inc",SERIES XC-16 Cubic Octagonal Strut MK2 (Gray),2023-02-10 14:52:07.873
3,GateRite,Civilian Transports,Probe-Bodyne Hecs,Heather Kosak,Original Equipment,1000104,Cockpit,Modules,0.311,1237,18,47,MK2,Navy,SERIES IV-06,"Pope, Bass and Long PLC",SERIES IV-06 Probe-Bodyne Hecs MK2 (Navy),2023-02-10 14:52:07.873
4,Plasma Solutions,Civilian Transports,Thick Fuselage Plating,Britt Martin,Original Equipment,1000105,Thick Armor Plating,Hull & Shields,0.7226,1855,16,51,MK2,Green,SERIES KH-05,"Pope, Bass and Long PLC",SERIES KH-05 Thick Fuselage Plating MK2 (Green),2023-02-10 14:52:07.873



--- dim.Regions ---


Unnamed: 0,table_name,row_count,column_count
0,dim.Regions,181,12


Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
6,tax_rate,Float64,0,0.0,21,"[0.02, 0.0, 0.02]",,0.0,0.4,0.071685
11,dwcreateddate,datetime64[ns],0,0.0,1,"[2023-02-10 14:52:09.130000, 2023-02-10 14:52:...",,2023-02-10 14:52:09.130000,2023-02-10 14:52:09.130000,
0,all,string,0,0.0,1,"[All, All, All]",3.0,,,
2,interplanetary_region,string,0,0.0,15,"[Îlos VI-VII Region, Transit Gate Region, Îlos...",16.138122,,,
4,station,string,0,0.0,181,"[Affligem Station, AG Refueling & Supply Stati...",14.237569,,,
8,station_sales_managers,string,0,0.0,122,"[Nicoletta Burcardo, Fermín Serraglio, Madelei...",13.917127,,,
5,station_type,string,0,0.0,3,"[Station, Station, Station]",6.519337,,,
1,system,string,0,0.0,6,"[Îlos System, Transit Gate, Îlos System]",11.364641,,,
9,system_regional_managers,string,0,0.0,15,"[Nicholas Basa, Amy Blankenship, Nicholas Basa]",14.696133,,,
7,system_sales_directors,string,0,0.0,5,"[Victor Canales, Garrett Novotná, Victor Canales]",12.552486,,,


Unnamed: 0,all,system,interplanetary_region,territory,station,station_type,tax_rate,system_sales_directors,station_sales_managers,system_regional_managers,territory_directors,dwcreateddate
0,All,Îlos System,Îlos VI-VII Region,Îlos VII,Affligem Station,Station,0.02,Victor Canales,Nicoletta Burcardo,Nicholas Basa,Patrick Larson,2023-02-10 14:52:09.130
1,All,Transit Gate,Transit Gate Region,Arcadia Gate,AG Refueling & Supply Station,Station,0.0,Garrett Novotná,Fermín Serraglio,Amy Blankenship,Michael Huerta,2023-02-10 14:52:09.130
2,All,Îlos System,Îlos VI-VII Region,Îlos VII,Andromedus Station,Station,0.02,Victor Canales,Madeleine Valdez,Nicholas Basa,Patrick Larson,2023-02-10 14:52:09.130
3,All,Sol System,Sol V Region,Io,Ao Substation,Station,0.04,Martine Roś,Shade Tabaxii,Alyssa Ariño,Agnès Poirier,2023-02-10 14:52:09.130
4,All,Îlos System,Îlos IV Region,Îlos IV,Apollo Station,Station,0.2,Victor Canales,Felisa Long,Christopher Jędraszczyk,Evita Calgari,2023-02-10 14:52:09.130


In [22]:
# Une todos los perfiles de columnas en un solo catálogo
catalog_all = pd.concat([
    dim_brands_cols, dim_budget_rate_cols, dim_customers_cols, dim_employees_cols,
    dim_exchange_rate_cols, dim_invoice_dtype_cols, dim_order_dtype_cols,
    dim_order_status_cols, dim_products_cols, dim_regions_cols
], ignore_index=True)

# Resumen por tabla
summary_all = pd.concat([
    dim_brands_sum, dim_budget_rate_sum, dim_customers_sum, dim_employees_sum,
    dim_exchange_rate_sum, dim_invoice_dtype_sum, dim_order_dtype_sum,
    dim_order_status_sum, dim_products_sum, dim_regions_sum
], ignore_index=True)

In [23]:
catalog_all

Unnamed: 0,column,dtype,nulls,null_pct,distinct,example_values,avg_len,min,max,mean
0,flagship,string,0,0.0,3,"[Other Brand, Other Brand, Other Brand]",12.500000,NaT,NaT,
1,class,string,0,0.0,2,"[Private Brand, Private Brand, Private Brand]",14.800000,NaT,NaT,
2,type,string,0,0.0,4,"[Value, Standard, Value]",6.650000,NaT,NaT,
3,brand,string,0,0.0,14,"[ASAN, Bruis, FixIt Co.]",6.550000,NaT,NaT,
4,sub_brand,string,0,0.0,20,"[ASAN Terran Systems, Bruis, FixIt Co.]",10.950000,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...
79,system_sales_directors,string,0,0.0,5,"[Victor Canales, Garrett Novotná, Victor Canales]",12.552486,,,
80,station_sales_managers,string,0,0.0,122,"[Nicoletta Burcardo, Fermín Serraglio, Madelei...",13.917127,,,
81,system_regional_managers,string,0,0.0,15,"[Nicholas Basa, Amy Blankenship, Nicholas Basa]",14.696133,,,
82,territory_directors,string,0,0.0,46,"[Patrick Larson, Michael Huerta, Patrick Larson]",13.541436,,,


In [24]:
summary_all

Unnamed: 0,table_name,row_count,column_count
0,dim.Brands,20,7
1,dim.Budget-Rate,15,5
2,dim.Customers,3911,10
3,dim.Employees,893,5
4,dim.Exchange-Rate,57900,9
5,dim.Invoice-DocType,5,6
6,dim.Order-DocType,4,6
7,dim.Order-Status,6,6
8,dim.Products,256293,18
9,dim.Regions,181,12
