# Sprint 2: Schema Design

#### Initial configuration and conection to duckdb and ducklake

In [1]:
import duckdb
import pandas as pd
import os
import glob

# --- 1. Configuration & Paths ---
RAW_DATA_PATH = '../data/raw/'
LAKEHOUSE_PATH = '../data/lakehouse'
METADATA_PATH = os.path.join(LAKEHOUSE_PATH, 'metadata.duckdb')

# Create the base directory if it doesn't exist
os.makedirs(LAKEHOUSE_PATH, exist_ok=True)

# --- 2. Initialize DuckDB & Load DuckLake Extension ---
# Connect to in-memory DuckDB (Compute Layer)
con = duckdb.connect(database=':memory:')

print("--- Initializing DuckLake Extension ---")

# ‚úÖ ACTIVATE DUCKLAKE: This downloads/installs the extension if missing
# and loads it into the current session.
try:
    con.execute("INSTALL ducklake;")
    con.execute("LOAD ducklake;")
    print("‚úÖ Extension 'ducklake' loaded successfully.")
except Exception as e:
    print(f"‚ùå Error loading 'ducklake'. Make sure the extension is available in your environment.\nError: {e}")

--- Initializing DuckLake Extension ---
‚úÖ Extension 'ducklake' loaded successfully.


#### Attach the Catalog and Schema Management

In [2]:
# --- 3. Attach the Catalog ---
# We attach the persistent storage. 
# Note: Depending on your specific DuckLake version, the syntax for ATTACH might vary slightly.
# This assumes standard syntax where we point to the metadata file.
con.execute(f"ATTACH '{METADATA_PATH}' AS lakehouse")
print(f"‚úÖ Lakehouse catalog attached at: {METADATA_PATH}")

# --- 4. Schema Management ---
# Create logical schemas within the managed catalog
schemas = ['bronze', 'silver', 'gold']
for schema in schemas:
    con.execute(f"CREATE SCHEMA IF NOT EXISTS lakehouse.{schema}")
print(f"‚úÖ Schemas ready: {', '.join(schemas)}")

‚úÖ Lakehouse catalog attached at: ../data/lakehouse\metadata.duckdb
‚úÖ Schemas ready: bronze, silver, gold


#### Ingestion of mobility data

In [None]:
# --- 5. Ingestion: Mobility Data (Bronze Layer) ---
mitma_raw_glob_path = os.path.join(RAW_DATA_PATH, 'mitma', '*_Viajes_municipios.csv.gz')
mobility_files = glob.glob(mitma_raw_glob_path)

print(f"\n--- Ingesting Mobility Data ---")
if not mobility_files:
    print("‚ùå No mobility files found!")
else:
    print(f"-> Found {len(mobility_files)} files.")
    
    # Use CREATE TABLE to let DuckLake manage the data
    # This creates a transaction, writes the Parquet files, and updates metadata.
    query_mobility = f"""
        CREATE OR REPLACE TABLE lakehouse.bronze.mobility_sample_week AS
        SELECT 
            *,
            CURRENT_TIMESTAMP AS ingestion_timestamp,
            'https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad' AS source_url
        FROM read_csv_auto({mobility_files}, filename=true, all_varchar=true);
    """
    con.execute(query_mobility)
    print(f"‚úÖ Transformed & Ingested: lakehouse.bronze.mobility_sample_week")


--- Ingesting Mobility Data ---
-> Found 7 files.


#### Data and Schema Preview of Mobility Files

In [None]:
# --- INSPECTION & METADATA CHECK ---
print("\n--- üîç INSPECTION: Mobility Table ---")

# 1. Content Preview
# Verify that 'origen' and 'destino' columns look like municipal codes (5 digits)
print("\n[1] Data Preview (First 5 rows):")
con.execute("SELECT * FROM lakehouse.bronze.mobility_sample_week LIMIT 5").df()


--- üîç INSPECTION: Mobility Table ---

[1] Data Preview (First 5 rows):


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km,filename,ingestion_timestamp,source_url
0,20230508,0,1001,01001,2-10,frecuente,casa,no,no,1,<10,,,9.831,42.73,../data/raw/mitma\\20230508_Viajes_municipios....,2025-11-23 19:45:01.383545+01:00,https://www.transportes.gob.es/ministerio/proy...
1,20230508,0,1001,01009_AM,0.5-2,frecuente,casa,no,no,1,>15,,,3.622,3.093,../data/raw/mitma\\20230508_Viajes_municipios....,2025-11-23 19:45:01.383545+01:00,https://www.transportes.gob.es/ministerio/proy...
2,20230508,0,1001,01009_AM,2-10,casa,frecuente,no,no,1,<10,,,2.685,20.619,../data/raw/mitma\\20230508_Viajes_municipios....,2025-11-23 19:45:01.383545+01:00,https://www.transportes.gob.es/ministerio/proy...
3,20230508,0,1001,01009_AM,2-10,frecuente,casa,no,no,1,>15,,,3.435,23.035,../data/raw/mitma\\20230508_Viajes_municipios....,2025-11-23 19:45:01.383545+01:00,https://www.transportes.gob.es/ministerio/proy...
4,20230508,0,1001,01058_AM,2-10,frecuente,casa,no,no,1,>15,,,1.806,16.771,../data/raw/mitma\\20230508_Viajes_municipios....,2025-11-23 19:45:01.383545+01:00,https://www.transportes.gob.es/ministerio/proy...


In [None]:
# 2. Schema Check
# Confirm column names and ensure types are currently VARCHAR (as expected for Bronze)
print("\n[2] Schema (Columns & Types):")
con.execute("DESCRIBE lakehouse.bronze.mobility_sample_week").df()


[2] Schema (Columns & Types):


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,VARCHAR,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,VARCHAR,YES,,,
9,residencia,VARCHAR,YES,,,


In [None]:
# 3. Quality Profile
# Check for 100% nulls or weird values. This might take a moment.
print("\n[3] Data Quality Profile (Nulls & Unique Values):")
con.execute("SUMMARIZE lakehouse.bronze.mobility_sample_week").df()


[3] Data Quality Profile (Nulls & Unique Values):


Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,fecha,VARCHAR,20230508,20230514,7,,,,,,84401339,0.0
1,periodo,VARCHAR,00,23,24,,,,,,84401339,0.0
2,origen,VARCHAR,01001,externo,2307,,,,,,84401339,0.0
3,destino,VARCHAR,01001,externo,2307,,,,,,84401339,0.0
4,distancia,VARCHAR,0.5-2,>50,4,,,,,,84401339,0.0
5,actividad_origen,VARCHAR,casa,trabajo_estudio,4,,,,,,84401339,0.0
6,actividad_destino,VARCHAR,casa,trabajo_estudio,4,,,,,,84401339,0.0
7,estudio_origen_posible,VARCHAR,no,si,2,,,,,,84401339,0.0
8,estudio_destino_posible,VARCHAR,no,si,2,,,,,,84401339,0.0
9,residencia,VARCHAR,01,52,52,,,,,,84401339,0.0


#### Ingesting other tables

In [None]:
# --- 6. Ingestion: Auxiliary Tables (Refactorizado con Linaje) ---

# Define URLs 
URL_MITMA = "https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad"
URL_INE = "https://www.ine.es/"
URL_CNIG = "https://centrodedescargas.cnig.es/CentroDescargas/index.jsp"

def ingest_dimension(table_name, filename, source_url, folder='mitma', sep=';', encoding='utf-8'):
    path = os.path.join(RAW_DATA_PATH, folder, filename)
    
    if os.path.exists(path):
        # Escape single quotes in the URL for SQL safety
        safe_url = source_url.replace("'", "''")
        
        # Read the CSV and append audit columns
        # We use robust typing (try_cast/all_varchar) to prevent failures if any numeric column contains irregular characters
        con.execute(f"""
            CREATE OR REPLACE TABLE lakehouse.bronze.{table_name} AS
            SELECT 
                *,
                CURRENT_TIMESTAMP AS ingestion_timestamp,
                '{safe_url}' AS source_url
            FROM read_csv_auto('{path}', all_varchar=true, sep='{sep}', encoding='{encoding}');
        """)
        print(f"‚úÖ Ingested: lakehouse.bronze.{table_name} (Source: {source_url})")
    else:
        print(f"‚ö†Ô∏è Missing file: {filename}")

print("\n--- Ingesting Dictionaries & Dimensions ---")

# 1. Nombres de Distritos (MITMA)
# Fuente: Open Data Movilidad
ingest_dimension('zoning_municipalities', 'nombres_municipios.csv', source_url=URL_MITMA, folder='mitma')

# 2. Poblaci√≥n por Distrito (MITMA)
# Fuente: Open Data Movilidad
ingest_dimension('population_municipalities', 'poblacion_municipios.csv', source_url=URL_MITMA, folder='mitma')

# 3. Relaci√≥n Zonificaci√≥n MITMA <-> INE
# Fuente: Open Data Movilidad
ingest_dimension('mapping_ine_mitma', 'relacion_ine_zonificacionMitma.csv', source_url=URL_MITMA, folder='mitma')

# 4. Renta Media (INE)
# Fuente: Instituto Nacional de Estad√≠stica
ingest_dimension('ine_rent_municipalities', 'ine_renta.csv', source_url=URL_INE, folder='ine', sep=';')

# 5. Coordenadas Municipales (IGN/CNIG)
# Fuente: Centro de Descargas del CNIG
ingest_dimension('municipal_coordinates', 'municipios_coordenadas.csv', source_url=URL_CNIG, folder='ine', sep=';')


--- Ingesting Dictionaries & Dimensions ---
‚úÖ Ingested: lakehouse.bronze.zoning_municipalities (Source: https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad)
‚úÖ Ingested: lakehouse.bronze.population_municipalities (Source: https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad)
‚úÖ Ingested: lakehouse.bronze.mapping_ine_mitma (Source: https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad)
‚úÖ Ingested: lakehouse.bronze.ine_rent_municipalities (Source: https://www.ine.es/)
‚úÖ Ingested: lakehouse.bronze.municipal_coordinates (Source: https://centrodedescargas.cnig.es/CentroDescargas/index.jsp)


#### Inspection: Rent Table

In [3]:
# Define the table we want to inspect
target_table = "lakehouse.bronze.ine_rent_municipalities"

print(f"\n--- üîç INSPECTING: {target_table} ---")

# 1. Content Preview
# Check if the columns were separated correctly (look for separate columns, not one big text blob)
# Also verify the 'source_url' is correct
print("\n[1] Content Preview (First 5 rows):")
con.execute(f"SELECT * FROM {target_table} LIMIT 5").df()


--- üîç INSPECTING: lakehouse.bronze.ine_rent_municipalities ---

[1] Content Preview (First 5 rows):


Unnamed: 0,Municipios,Distritos,Secciones,Indicadores de renta media,Periodo,Total,ingestion_timestamp,source_url
0,01001 Alegr√≠a-Dulantzi,,,Renta neta media por persona,2023,16.429,2025-11-23 19:52:37.787423+01:00,https://www.ine.es/
1,01001 Alegr√≠a-Dulantzi,,,Renta neta media por persona,2022,15.116,2025-11-23 19:52:37.787423+01:00,https://www.ine.es/
2,01001 Alegr√≠a-Dulantzi,,,Renta neta media por persona,2021,14.647,2025-11-23 19:52:37.787423+01:00,https://www.ine.es/
3,01001 Alegr√≠a-Dulantzi,,,Renta neta media por persona,2020,13.969,2025-11-23 19:52:37.787423+01:00,https://www.ine.es/
4,01001 Alegr√≠a-Dulantzi,,,Renta neta media por persona,2019,14.299,2025-11-23 19:52:37.787423+01:00,https://www.ine.es/


In [None]:
# 2. Schema Metadata (Structure)
# Shows column names and types. Since we used 'all_varchar=true', everything should be VARCHAR.
print("\n[2] Schema Metadata (Columns & Types):")
con.execute(f"DESCRIBE {target_table}").df()


[2] Schema Metadata (Columns & Types):


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Municipios,VARCHAR,YES,,,
1,Distritos,VARCHAR,YES,,,
2,Secciones,VARCHAR,YES,,,
3,Indicadores de renta media,VARCHAR,YES,,,
4,Periodo,VARCHAR,YES,,,
5,Total,VARCHAR,YES,,,
6,ingestion_timestamp,TIMESTAMP WITH TIME ZONE,YES,,,
7,source_url,VARCHAR,YES,,,


In [4]:
# 3. Quality Profile (Statistics)
# Check 'approx_unique' to see how many municipalities have data
# Check 'null_percentage' to ensure the ingestion didn't fail silently
print("\n[3] Quality Statistics (Nulls & Uniques):")
con.execute(f"SUMMARIZE {target_table}").df()


[3] Quality Statistics (Nulls & Uniques):


Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Municipios,VARCHAR,01001 Alegr√≠a-Dulantzi,52001 Melilla,8369,,,,,,3009312,0.0
1,Distritos,VARCHAR,0100101 Alegr√≠a-Dulantzi distrito 01,5200108 Melilla distrito 08,12412,,,,,,3009312,14.6
2,Secciones,VARCHAR,0100101001 Alegr√≠a-Dulantzi secci√≥n 01001,5200108015 Melilla secci√≥n 08015,35092,,,,,,3009312,33.48
3,Indicadores de renta media,VARCHAR,Media de la renta por unidad de consumo,Renta neta media por persona,6,,,,,,3009312,0.0
4,Periodo,VARCHAR,2015,2023,9,,,,,,3009312,0.0
5,Total,VARCHAR,.,99.994,85954,,,,,,3009312,2.83
6,ingestion_timestamp,TIMESTAMP WITH TIME ZONE,2025-11-23 19:52:37.787423+01,2025-11-23 19:52:37.787423+01,1,2025-11-23 19:52:37.787423+01,,2025-11-23 19:52:37.787423+01,2025-11-23 19:52:37.787423+01,2025-11-23 19:52:37.787423+01,3009312,0.0
7,source_url,VARCHAR,https://www.ine.es/,https://www.ine.es/,1,,,,,,3009312,0.0


#### Final Check: table names and schema name

In [5]:
# --- 7. Final Check ---
print("\n--- Current Lakehouse State (Bronze Layer) ---")

# We use the internal system function 'duckdb_tables()'
# This function sees EVERYTHING connected to the current session, regardless of the extension used.
query_check = """
    SELECT table_name, schema_name
    FROM duckdb_tables()
    WHERE database_name = 'lakehouse' 
      AND schema_name = 'bronze';
"""
df_result = con.execute(query_check).df()
print(df_result)


--- Current Lakehouse State (Bronze Layer) ---
                  table_name schema_name
0    ine_rent_municipalities      bronze
1          mapping_ine_mitma      bronze
2       mobility_sample_week      bronze
3      municipal_coordinates      bronze
4  population_municipalities      bronze
5      zoning_municipalities      bronze


In [6]:
con.close()