# Sprint 1: Schema Design and Prototyping

#### Initial configuration and conection to duckdb

In [None]:
import duckdb
import pandas as pd
import os
import glob

# --- Configuration ---
# Define paths to your raw data and the future lakehouse layers
RAW_DATA_PATH = '../data/raw'
LAKEHOUSE_PATH = '../data/lakehouse'
LAKEHOUSE_DB_PATH = os.path.join(LAKEHOUSE_PATH, 'lakehouse.duckdb')

# Create the lakehouse directories if they don't exist
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'bronze'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'silver'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'gold'), exist_ok=True)

# --- DuckDB Connection ---
con = duckdb.connect(database=LAKEHOUSE_DB_PATH, read_only=False)

print(f"DuckDB is now connected to the persistent database file at:")
print(f"-> {LAKEHOUSE_DB_PATH}")

DuckDB is now connected to the persistent database file at:
-> ../data/lakehouse\lakehouse.duckdb


---

#### Step 1: Data Exploration


## 1. Data Exploration: Inspecting the Raw Files

#### Explore a MITMA Mobility File

In [5]:
# Path to one of the daily mobility files
mobility_file_path = os.path.join(RAW_DATA_PATH, 'mitma', '20230508_Viajes_distritos.csv.gz')

# Use DuckDB to directly read and describe the gzipped CSV
# The 'read_csv_auto' function is powerful and can infer types, headers, etc.
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{mobility_file_path}')
    LIMIT 5;
"""
df_mobility_sample = con.execute(query).df()

print("--- Sample of Mobility Data ---")
display(df_mobility_sample)

print("\n--- Schema of Mobility Data ---")
# Let's get the column names and data types as inferred by DuckDB
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{mobility_file_path}');"
df_mobility_schema = con.execute(query_desc).df()
display(df_mobility_schema)

--- Sample of Mobility Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,2.521,2.703
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,7.997
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,6.208
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,9.03,11.528
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,False,1,<10,,,2.685,30.125



--- Schema of Mobility Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,BIGINT,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,BOOLEAN,YES,,,
9,residencia,VARCHAR,YES,,,


#### Explore the MITMA Zoning File

In [6]:
# Path to the district names file
zoning_file_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')

# Load and inspect the zoning file
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{zoning_file_path}')
    LIMIT 5;
"""
df_zoning_sample = con.execute(query).df()

print("--- Sample of Zoning Data ---")
display(df_zoning_sample)

print("\n--- Schema of Zoning Data ---")
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{zoning_file_path}');"
df_zoning_schema = con.execute(query_desc).df()
display(df_zoning_schema)

--- Sample of Zoning Data ---


Unnamed: 0,ID,name
0,01001,Alegría-Dulantzi
1,01002,Amurrio
2,01004_AM,Artziniega agregacion de municipios
3,01009_AM,Asparrena agregacion de municipios
4,01010,Ayala/Aiara



--- Schema of Zoning Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,ID,VARCHAR,YES,,,
1,name,VARCHAR,YES,,,


#### Explore the INE Economic File

In [7]:
# Path to the INE GDP file
ine_file_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_provincial_gdp_2000-2022.csv') # Or whatever you named it

# This file is semicolon-separated, so we tell DuckDB explicitly
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{ine_file_path}', sep=';')
    LIMIT 5;
"""
df_ine_sample = con.execute(query).df()

print("--- Sample of INE Economic Data ---")
display(df_ine_sample)

--- Sample of INE Economic Data ---


Unnamed: 0,Provincias,Ramas de actividad,Periodo,Total
0,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2022 (P),9.485.962
1,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2021,8.853.382
2,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2020,8.010.434
3,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2019,8.627.212
4,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2018,8.285.269


---

#### Step 2: Data Ingestion

---
### 2.1 Bronze ingestion

#### 2.1.1 Mobility files

In [None]:

# --- 1. Define File Paths ---
# Source: All gzipped CSV files for the week in the raw/mitma directory
mitma_raw_glob_path = os.path.join(RAW_DATA_PATH, 'mitma', '*_Viajes_distritos.csv.gz')
# Destination: A single Parquet file in the bronze layer
bronze_mobility_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mobility_sample_week.parquet')

# --- 2. Find all the raw mobility files ---
mobility_files = glob.glob(mitma_raw_glob_path)
# It's good practice to print the files you've found to ensure it's working
print("Found the following mobility files to ingest:")
for f in mobility_files:
    print(f" - {os.path.basename(f)}")

# --- 3. Construct and Execute the Ingestion Query ---
# The query reads all CSVs at once, adds metadata, and copies the result to a Parquet file.
# DuckDB's read_csv_auto can take a list of files.
# We also use 'filename=true' to automatically add a column with the source filename.
ingestion_query = f"""--sql
    COPY (
        SELECT 
            *,
            CURRENT_TIMESTAMP AS ingestion_timestamp
        FROM read_csv_auto({mobility_files}, filename=true, all_varchar=true) -- <--- THIS IS THE FIX
    ) TO '{bronze_mobility_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);
"""

# Execute the query
con.execute(ingestion_query)

print(f"\n✅ Successfully ingested {len(mobility_files)} files into a single Bronze Parquet file:")
print(f"   -> {bronze_mobility_path}")

Found the following mobility files to ingest:
 - 20230508_Viajes_distritos.csv.gz
 - 20230509_Viajes_distritos.csv.gz
 - 20230510_Viajes_distritos.csv.gz
 - 20230511_Viajes_distritos.csv.gz
 - 20230512_Viajes_distritos.csv.gz
 - 20230513_Viajes_distritos.csv.gz
 - 20230514_Viajes_distritos.csv.gz

✅ Successfully ingested 7 files into a single Bronze Parquet file:
   -> ../data/lakehouse\bronze\mobility_sample_week.parquet


In [None]:
# --- 4. Verification ---
# Let's read back from the new Parquet file to verify it was created correctly.
print("\n--- Verifying the Bronze Data ---")
verification_query = f"SELECT * FROM '{bronze_mobility_path}' LIMIT 5;"
bronze_sample_df = con.execute(verification_query).df()
display(bronze_sample_df)

print("\n--- Verifying the Bronze Schema ---")
schema_query = f"DESCRIBE FROM '{bronze_mobility_path}';"
bronze_schema_df = con.execute(schema_query).df()
display(bronze_schema_df)


--- Verifying the Bronze Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km,filename,ingestion_timestamp
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,2.521,2.703,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 15:49:04.378473+01:00
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,7.997,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 15:49:04.378473+01:00
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,6.208,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 15:49:04.378473+01:00
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,9.03,11.528,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 15:49:04.378473+01:00
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,no,1,<10,,,2.685,30.125,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 15:49:04.378473+01:00



--- Verifying the Bronze Schema ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,VARCHAR,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,VARCHAR,YES,,,
9,residencia,VARCHAR,YES,,,


### 2.1.2 Supporting MITMA and INE Data into Bronze

In [10]:
# --- Ingest nombres_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'zoning_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested zoning names to: {dest_path}")

# --- Ingest poblacion_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'poblacion_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'population_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested population data to: {dest_path}")

# --- Ingest relacion_ine_zonificacionMitma.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'relacion_ine_zonificacionMitma.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mapping_ine_mitma.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE-MITMA mapping to: {dest_path}")

# --- Ingest INE GDP data --- (semicolon-separated)
source_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_provincial_gdp_2000-2022.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'gdp_provinces.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true, sep=';')) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE GDP data to: {dest_path}")

✅ Ingested zoning names to: ../data/lakehouse\bronze\zoning_districts.parquet
✅ Ingested population data to: ../data/lakehouse\bronze\population_districts.parquet
✅ Ingested INE-MITMA mapping to: ../data/lakehouse\bronze\mapping_ine_mitma.parquet
✅ Ingested INE GDP data to: ../data/lakehouse\bronze\gdp_provinces.parquet


In [None]:
import glob

# --- 1. Create the 'bronze' schema if it doesn't exist ---
# This command creates the "drawer" in our database.
print("--- Creating database schemas (if they don't exist) ---")
con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")
print("  - Schema 'bronze' is ready.")

# --- 2. Find all the Parquet files ---
bronze_files = glob.glob(os.path.join(LAKEHOUSE_PATH, 'bronze', '*.parquet'))

# --- 3. Register the files as views INSIDE the new schema ---
print("\n--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---")
for file_path in bronze_files:
    # Sanitize the filename to create a valid view name
    view_name = os.path.basename(file_path).replace('.parquet', '')
    
    query = f"CREATE OR REPLACE VIEW bronze.{view_name} AS SELECT * FROM read_parquet('{file_path}');"
    con.execute(query)
    print(f"  - View 'bronze.{view_name}' created.")
    
print("\n✅ All Bronze files are now visible to external tools under the 'bronze' schema.")
con.close()

--- Creating database schemas (if they don't exist) ---
  - Schema 'bronze' is ready.

--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---


  - View 'bronze.gdp_provinces' created.
  - View 'bronze.mapping_ine_mitma' created.
  - View 'bronze.mobility_sample_week' created.
  - View 'bronze.population_districts' created.
  - View 'bronze.zoning_districts' created.

✅ All Bronze files are now visible to external tools under the 'bronze' schema.


---
### 2.2 Silver ingestion