# Sprint 1: Schema Design and Prototyping

#### Initial configuration and conection to duckdb

In [65]:
import duckdb
import pandas as pd
import os
import glob

# --- Configuration ---
# Define paths to your raw data and the future lakehouse layers
RAW_DATA_PATH = '../data/raw'
LAKEHOUSE_PATH = '../data/lakehouse'
LAKEHOUSE_DB_PATH = os.path.join(LAKEHOUSE_PATH, 'lakehouse.duckdb')

# Create the lakehouse directories if they don't exist
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'bronze'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'silver'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'gold'), exist_ok=True)

# --- DuckDB Connection ---
con = duckdb.connect(database=LAKEHOUSE_DB_PATH, read_only=False)

print(f"DuckDB is now connected to the persistent database file at:")
print(f"-> {LAKEHOUSE_DB_PATH}")

DuckDB is now connected to the persistent database file at:
-> ../data/lakehouse\lakehouse.duckdb


---

#### Step 1: Data Exploration


## 1. Data Exploration: Inspecting the Raw Files

#### Explore a MITMA Mobility File

In [2]:
# Path to one of the daily mobility files
mobility_file_path = os.path.join(RAW_DATA_PATH, 'mitma', '20230508_Viajes_distritos.csv.gz')

# Use DuckDB to directly read and describe the gzipped CSV
# The 'read_csv_auto' function is powerful and can infer types, headers, etc.
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{mobility_file_path}')
    LIMIT 5;
"""
df_mobility_sample = con.execute(query).df()

print("--- Sample of Mobility Data ---")
display(df_mobility_sample)

print("\n--- Schema of Mobility Data ---")
# Let's get the column names and data types as inferred by DuckDB
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{mobility_file_path}');"
df_mobility_schema = con.execute(query_desc).df()
display(df_mobility_schema)

--- Sample of Mobility Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,2.521,2.703
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,7.997
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,6.208
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,9.03,11.528
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,False,1,<10,,,2.685,30.125



--- Schema of Mobility Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,BIGINT,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,BOOLEAN,YES,,,
9,residencia,VARCHAR,YES,,,


#### Explore the MITMA Zoning File

In [3]:
# Path to the district names file
zoning_file_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')

# Load and inspect the zoning file
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{zoning_file_path}')
    LIMIT 5;
"""
df_zoning_sample = con.execute(query).df()

print("--- Sample of Zoning Data ---")
display(df_zoning_sample)

print("\n--- Schema of Zoning Data ---")
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{zoning_file_path}');"
df_zoning_schema = con.execute(query_desc).df()
display(df_zoning_schema)

--- Sample of Zoning Data ---


Unnamed: 0,ID,name
0,01001,Alegría-Dulantzi
1,01002,Amurrio
2,01004_AM,Artziniega agregacion de municipios
3,01009_AM,Asparrena agregacion de municipios
4,01010,Ayala/Aiara



--- Schema of Zoning Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,ID,VARCHAR,YES,,,
1,name,VARCHAR,YES,,,


#### Explore the INE Economic File

In [13]:
# Path to the INE GDP file
ine_file_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_renta_distritos.csv') # Or whatever you named it

# This file is semicolon-separated, so we tell DuckDB explicitly
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{ine_file_path}', sep=';')
    LIMIT 5;
"""
df_ine_sample = con.execute(query).df()

print("--- Sample of INE Economic Data ---")
display(df_ine_sample)

--- Sample of INE Economic Data ---


Unnamed: 0,Municipios,Distritos,Secciones,Indicadores de renta media,Periodo,Total
0,01001 Alegría-Dulantzi,,,Renta neta media por persona,2023,16.429
1,01001 Alegría-Dulantzi,,,Renta neta media por persona,2022,15.116
2,01001 Alegría-Dulantzi,,,Renta neta media por persona,2021,14.647
3,01001 Alegría-Dulantzi,,,Renta neta media por persona,2020,13.969
4,01001 Alegría-Dulantzi,,,Renta neta media por persona,2019,14.299


---

#### Step 2: Data Ingestion

---
### 2.1 Bronze ingestion

#### 2.1.1 Mobility files

In [23]:

# --- 1. Define File Paths ---
# Source: All gzipped CSV files for the week in the raw/mitma directory
mitma_raw_glob_path = os.path.join(RAW_DATA_PATH, 'mitma', '*_Viajes_distritos.csv.gz')
# Destination: A single Parquet file in the bronze layer
bronze_mobility_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mobility_sample_week.parquet')

# --- 2. Find all the raw mobility files ---
mobility_files = glob.glob(mitma_raw_glob_path)
# It's good practice to print the files you've found to ensure it's working
print("Found the following mobility files to ingest:")
for f in mobility_files:
    print(f" - {os.path.basename(f)}")

# --- 3. Construct and Execute the Ingestion Query ---
# The query reads all CSVs at once, adds metadata, and copies the result to a Parquet file.
# DuckDB's read_csv_auto can take a list of files.
# We also use 'filename=true' to automatically add a column with the source filename.
ingestion_query = f"""--sql
    COPY (
        SELECT 
            *,
            CURRENT_TIMESTAMP AS ingestion_timestamp
        FROM read_csv_auto({mobility_files}, filename=true, all_varchar=true) -- <--- THIS IS THE FIX
    ) TO '{bronze_mobility_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);
"""

# Execute the query
con.execute(ingestion_query)

print(f"\n✅ Successfully ingested {len(mobility_files)} files into a single Bronze Parquet file:")
print(f"   -> {bronze_mobility_path}")

Found the following mobility files to ingest:
 - 20230508_Viajes_distritos.csv.gz
 - 20230509_Viajes_distritos.csv.gz
 - 20230510_Viajes_distritos.csv.gz
 - 20230511_Viajes_distritos.csv.gz
 - 20230512_Viajes_distritos.csv.gz
 - 20230513_Viajes_distritos.csv.gz
 - 20230514_Viajes_distritos.csv.gz

✅ Successfully ingested 7 files into a single Bronze Parquet file:
   -> ../data/lakehouse\bronze\mobility_sample_week.parquet


In [10]:
# --- 4. Verification ---
# Let's read back from the new Parquet file to verify it was created correctly.
print("\n--- Verifying the Bronze Data ---")
verification_query = f"SELECT * FROM '{bronze_mobility_path}' LIMIT 5;"
bronze_sample_df = con.execute(verification_query).df()
display(bronze_sample_df)

print("\n--- Verifying the Bronze Schema ---")
schema_query = f"DESCRIBE FROM '{bronze_mobility_path}';"
bronze_schema_df = con.execute(schema_query).df()
display(bronze_schema_df)


--- Verifying the Bronze Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km,filename,ingestion_timestamp
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,2.521,2.703,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-16 12:29:39.357110+01:00
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,7.997,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-16 12:29:39.357110+01:00
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,6.208,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-16 12:29:39.357110+01:00
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,9.03,11.528,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-16 12:29:39.357110+01:00
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,no,1,<10,,,2.685,30.125,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-16 12:29:39.357110+01:00



--- Verifying the Bronze Schema ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,VARCHAR,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,VARCHAR,YES,,,
9,residencia,VARCHAR,YES,,,


### 2.1.2 Supporting MITMA and INE Data into Bronze

In [24]:
# --- Ingest nombres_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'zoning_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested zoning names to: {dest_path}")

# --- Ingest poblacion_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'poblacion_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'population_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested population data to: {dest_path}")

# --- Ingest relacion_ine_zonificacionMitma.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'relacion_ine_zonificacionMitma.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mapping_ine_mitma.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE-MITMA mapping to: {dest_path}")

# --- Ingest INE GDP data --- (semicolon-separated)
source_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_renta_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'ine_rent_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true, sep=';')) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE GDP data to: {dest_path}")

✅ Ingested zoning names to: ../data/lakehouse\bronze\zoning_districts.parquet
✅ Ingested population data to: ../data/lakehouse\bronze\population_districts.parquet
✅ Ingested INE-MITMA mapping to: ../data/lakehouse\bronze\mapping_ine_mitma.parquet
✅ Ingested INE GDP data to: ../data/lakehouse\bronze\ine_rent_districts.parquet


In [25]:
import glob

# --- 1. Create the 'bronze' schema if it doesn't exist ---
# This command creates the "drawer" in our database.
print("--- Creating database schemas (if they don't exist) ---")
con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")
print("  - Schema 'bronze' is ready.")

# --- 2. Find all the Parquet files ---
bronze_files = glob.glob(os.path.join(LAKEHOUSE_PATH, 'bronze', '*.parquet'))

# --- 3. Register the files as views INSIDE the new schema ---
print("\n--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---")
for file_path in bronze_files:
    # Sanitize the filename to create a valid view name
    view_name = os.path.basename(file_path).replace('.parquet', '')
    
    query = f"CREATE OR REPLACE VIEW bronze.{view_name} AS SELECT * FROM read_parquet('{file_path}');"
    con.execute(query)
    print(f"  - View 'bronze.{view_name}' created.")
    
print("\n✅ All Bronze files are now visible to external tools under the 'bronze' schema.")

--- Creating database schemas (if they don't exist) ---
  - Schema 'bronze' is ready.

--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---
  - View 'bronze.ine_rent_districts' created.
  - View 'bronze.mapping_ine_mitma' created.
  - View 'bronze.mobility_sample_week' created.
  - View 'bronze.population_districts' created.
  - View 'bronze.zoning_districts' created.

✅ All Bronze files are now visible to external tools under the 'bronze' schema.


---
### 2.2 Silver layer

In [26]:
# --- Setup: Create the 'silver' schema ---
print("--- Creating 'silver' schema if it doesn't exist ---")
con.execute("CREATE SCHEMA IF NOT EXISTS silver;")
print("  - Schema 'silver' is ready.")

--- Creating 'silver' schema if it doesn't exist ---
  - Schema 'silver' is ready.


Cleaned version of mobility data

In [None]:
# Clean the mobility data
print("Cleaning mobility data...")

con.execute("""
-- Create cleaned version of mobility data
CREATE OR REPLACE VIEW silver.cleaned_mobility AS
SELECT
    -- Fix dates (convert 20230508 → 2023-05-08)
    CAST(
        SUBSTR(fecha, 1, 4) || '-' || 
        SUBSTR(fecha, 5, 2) || '-' || 
        SUBSTR(fecha, 7, 2) 
    AS DATE) as trip_date,
    
    -- Fix hour (ensure it's a number)
    CAST(periodo AS INTEGER) as hour,
    
    -- Clean zone IDs
    TRIM(CAST(origen AS VARCHAR)) as origin_zone_id,
    TRIM(CAST(destino AS VARCHAR)) as destination_zone_id,
    
    -- Clean distance (handle messy values)
     CASE
        WHEN TRY_CAST(viajes AS BIGINT) > 0 THEN
            TRY_CAST(viajes_km AS DOUBLE) / TRY_CAST(viajes AS BIGINT)
        ELSE 0 -- If there are no trips, the average distance is 0
    END as distance_km,
    
    -- Clean trips (ensure it's a number)
    CAST(viajes AS INTEGER) as trips_count,
    
    -- Add extra data
    filename,
    ingestion_timestamp

FROM bronze.mobility_sample_week
WHERE 
    -- Remove bad records
    fecha IS NOT NULL 
    AND origen IS NOT NULL 
    AND destino IS NOT NULL 
    AND viajes IS NOT NULL;
""")

# STEP 2: Check our results
print("✓ Cleaning complete!")
print("\nFirst 5 cleaned rows:")
result = con.execute("SELECT * FROM silver.cleaned_mobility ORDER BY distance_km DESC LIMIT 5").df()
print(result)

print("\nData types:")
schema = con.execute("DESCRIBE silver.cleaned_mobility").df()
print(schema)

Cleaning mobility data...
✓ Cleaning complete!

First 5 cleaned rows:
   trip_date  hour origin_zone_id destination_zone_id  distance_km  \
0 2023-05-10     9          38009            08018_AM     3457.994   
1 2023-05-13    18        3802402             0820006     3394.862   
2 2023-05-13    17        3802402             0830102     3392.586   
3 2023-05-09     6          38006             0801907     3307.771   
4 2023-05-13    15        0801502               38006     3284.082   

   trips_count                                           filename  \
0            1  ../data/raw\\mitma\\20230510_Viajes_distritos....   
1            1  ../data/raw\\mitma\\20230513_Viajes_distritos....   
2            1  ../data/raw\\mitma\\20230513_Viajes_distritos....   
3            1  ../data/raw\\mitma\\20230509_Viajes_distritos....   
4            1  ../data/raw\\mitma\\20230513_Viajes_distritos....   

               ingestion_timestamp  
0 2025-11-16 14:00:52.389967+01:00  
1 2025-11-16 14:00:5

In [29]:
print("=== DATA QUALITY CHECKS ===")

# Check for any remaining data issues
print("\n1. Distance values summary:")
dist_check = con.execute("""
    SELECT 
        COUNT(*) as total_records,
        COUNT(distance_km) as records_with_distance,
        AVG(distance_km) as avg_distance,
        MIN(distance_km) as min_distance,
        MAX(distance_km) as max_distance
    FROM silver.cleaned_mobility
""").df()
print(dist_check)

print("\n2. Sample of unique zone IDs (check for formatting):")
zones_check = con.execute("""
    SELECT DISTINCT origin_zone_id 
    FROM silver.cleaned_mobility 
    LIMIT 10
""").df()
print(zones_check)

print("\n3. Date range:")
date_range = con.execute("""
    SELECT MIN(trip_date) as start_date, MAX(trip_date) as end_date 
    FROM silver.cleaned_mobility
""").df()
print(date_range)


=== DATA QUALITY CHECKS ===

1. Distance values summary:
   total_records  records_with_distance  avg_distance  min_distance  \
0      134726205              134726205     13.925455        0.3755   

   max_distance  
0      3457.994  

2. Sample of unique zone IDs (check for formatting):
  origin_zone_id
0          08102
1        0830504
2          08086
3        0818401
4       08262_AM
5        0807305
6        0811304
7        0821703
8          08105
9        0810602

3. Date range:
  start_date   end_date
0 2023-05-08 2023-05-14


Clean Population Data

In [19]:
print("Cleaning population data...")

# First, let's check what the raw population data looks like
print("\nSample of raw population data:")
pop_sample = con.execute("SELECT * FROM bronze.poblacion_distritos LIMIT 5").df()
print(pop_sample)

print("\nColumn names in population data:")
pop_columns = con.execute("PRAGMA table_info(bronze.population_districts)").df()
print(pop_columns)


Cleaning population data...

Sample of raw population data:
    column0  column1
0     01001   2925.0
1     01002  10307.0
2  01004_AM   3005.0
3  01009_AM   4599.0
4     01010   2951.0

Column names in population data:
   cid     name     type  notnull dflt_value     pk
0    0  column0  VARCHAR    False       None  False
1    1  column1  VARCHAR    False       None  False


In [30]:
print("Cleaning population data...")

# Create cleaned population table
con.execute("""
CREATE OR REPLACE VIEW silver.cleaned_population AS
SELECT
    -- Clean zone IDs (remove any whitespace)
    TRIM(CAST(column0 AS VARCHAR)) as zone_id,
    
    -- Clean population count (handle 'NA' values and convert to integer)
    CASE 
        WHEN column1 = 'NA' THEN NULL
        WHEN TRIM(column1) = '' THEN NULL
        ELSE CAST(CAST(column1 AS DOUBLE) AS INTEGER)
    END as population_count

FROM bronze.population_districts
WHERE 
    -- Remove records with missing zone IDs
    column0 IS NOT NULL 
    AND column1 IS NOT NULL
    AND TRIM(column0) != '';  -- Remove empty zone IDs
""")

# Verify the results
print("✓ Population data cleaned!")
print("\nFirst 5 cleaned population rows:")
result = con.execute("SELECT * FROM silver.cleaned_population LIMIT 5").df()
print(result)

print("\nData types:")
schema = con.execute("DESCRIBE silver.cleaned_population").df()
print(schema)

print("\nPopulation data quality summary:")
quality_check = con.execute("""
    SELECT 
        COUNT(*) as total_records,
        COUNT(population_count) as records_with_population,
        COUNT(*) - COUNT(population_count) as records_with_null_population,
        AVG(population_count) as avg_population
    FROM silver.cleaned_population
""").df()
print(quality_check)


Cleaning population data...
✓ Population data cleaned!

First 5 cleaned population rows:
    zone_id  population_count
0     01001              2925
1     01002             10307
2  01004_AM              3005
3  01009_AM              4599
4     01010              2951

Data types:
        column_name column_type null   key default extra
0           zone_id     VARCHAR  YES  None    None  None
1  population_count     INTEGER  YES  None    None  None

Population data quality summary:
   total_records  records_with_population  records_with_null_population  \
0           3792                     3743                            49   

   avg_population  
0    12659.659898  


Clean INE rent data

In [38]:
print("Creating VIEW for cleaned INE district rent data...")

# This query transforms the raw rent data into a clean, structured format.
# It filters for the correct indicator and year, and cleans the key columns.
cleaned_rent_query = """
CREATE OR REPLACE VIEW silver.cleaned_district_rent AS
SELECT
    -- Use a regular expression to extract the numeric part from the start of the 'Distritos' string.
    -- This is robust and handles cases where the name might have numbers.
    REGEXP_EXTRACT(Distritos, '(\d+)') AS district_code,
        
    -- Cast the year to an integer.
    CAST(Periodo AS INTEGER) AS year,
    
    -- Clean the rent value: remove '.' separator and cast to a number safely.
    TRY_CAST(REPLACE(Total, '.', '') AS INTEGER) AS avg_net_rent_eur

FROM bronze.ine_rent_districts
WHERE
    -- Filter to keep only the specific indicator and year we need.
    "Indicadores de renta media" = 'Renta neta media por persona'
    AND Periodo = '2023'
    AND Secciones IS NULL
    AND Distritos IS NOT NULL; -- Important this one or it will take '' as one districtwith all its rows (8139)
"""

con.execute(cleaned_rent_query)
print("✓ View 'silver.cleaned_district_rent' created successfully.")

# --- Verification ---
print("\n--- Verifying the 'silver.cleaned_district_rent' view ---")
print("   (Showing the 10 districts with the highest average rent)")

# This query checks if the data types and values look correct.
verification_query = """--sql
    SELECT *
    FROM silver.cleaned_district_rent
    WHERE avg_net_rent_eur IS NOT NULL
    ORDER BY avg_net_rent_eur DESC
    LIMIT 10;
"""
display(con.execute(verification_query).df())

verification_query = """--sql
    SELECT COUNT(*)
    FROM silver.cleaned_district_rent
    GROUP BY district_code
    ORDER BY COUNT(*) DESC
    LIMIT 5;
"""
display(con.execute(verification_query).df())

# Also, let's verify the schema to ensure the data types are correct
print("\n--- Verifying the final schema ---")
display(con.execute("DESCRIBE silver.cleaned_district_rent;").df())

Creating VIEW for cleaned INE district rent data...
✓ View 'silver.cleaned_district_rent' created successfully.

--- Verifying the 'silver.cleaned_district_rent' view ---
   (Showing the 10 districts with the highest average rent)


  REGEXP_EXTRACT(Distritos, '(\d+)') AS district_code,


Unnamed: 0,district_code,year,avg_net_rent_eur
0,807705,2023,35886
1,2006902,2023,32257
2,2811501,2023,32039
3,2807905,2023,32019
4,801905,2023,31710
5,807704,2023,31405
6,807707,2023,31255
7,4802006,2023,30762
8,3125501,2023,30242
9,2807904,2023,29687


Unnamed: 0,count_star()
0,1
1,1
2,1
3,1
4,1



--- Verifying the final schema ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,district_code,VARCHAR,YES,,,
1,year,INTEGER,YES,,,
2,avg_net_rent_eur,INTEGER,YES,,,


Clean mapping_ine_districts

In [42]:
# --- STEP 5: Create a View for the Cleaned INE-to-MITMA District Mapping ---
print("Creating VIEW for cleaned INE-to-MITMA district mapping...")

# This query selects and cleans the essential columns from the mapping file.
# It ensures both district codes are treated as VARCHAR to handle special characters.
cleaned_mapping_query = """
CREATE OR REPLACE VIEW silver.mapping_ine_mitma_districts AS
SELECT
    -- Trim whitespace and cast to VARCHAR to ensure consistency
    TRIM(CAST(distrito_ine AS VARCHAR)) AS ine_district_code,
    TRIM(CAST(distrito_mitma AS VARCHAR)) AS mitma_district_code
    
FROM bronze.mapping_ine_mitma
WHERE 
    -- Add a quality filter to remove rows where either key is missing
    distrito_ine IS NOT NULL AND TRIM(distrito_ine) != ''
    AND distrito_mitma IS NOT NULL AND TRIM(distrito_mitma) != '';
"""

con.execute(cleaned_mapping_query)
print("✓ View 'silver.mapping_ine_mitma_districts' created successfully.")

# --- Verification ---
print("\n--- Verifying the 'silver.mapping_ine_mitma_districts' view ---")

# Let's check a few rows to ensure it looks correct
print("\nSample of the mapping data:")
display(con.execute("SELECT * FROM silver.mapping_ine_mitma_districts LIMIT 10;").df())

print("\nChecking for special '_AM' codes:")
special_codes_query = """
SELECT *
FROM silver.mapping_ine_mitma_districts
WHERE mitma_district_code LIKE '%_AM'
LIMIT 5;
"""
display(con.execute(special_codes_query).df())

# Also, let's verify the schema
print("\n--- Verifying the final schema ---")
display(con.execute("DESCRIBE silver.mapping_ine_mitma_districts;").df())

Creating VIEW for cleaned INE-to-MITMA district mapping...
✓ View 'silver.mapping_ine_mitma_districts' created successfully.

--- Verifying the 'silver.mapping_ine_mitma_districts' view ---

Sample of the mapping data:


Unnamed: 0,ine_district_code,mitma_district_code
0,100101,01001
1,100101,01001
2,100201,01002
3,100201,01002
4,100201,01002
5,100201,01002
6,100201,01002
7,100201,01002
8,100201,01002
9,100301,01058_AM



Checking for special '_AM' codes:


Unnamed: 0,ine_district_code,mitma_district_code
0,100301,01058_AM
1,100401,01004_AM
2,100601,01047_AM
3,100801,01058_AM
4,100901,01009_AM



--- Verifying the final schema ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,ine_district_code,VARCHAR,YES,,,
1,mitma_district_code,VARCHAR,YES,,,


Clean zoning_districs

In [43]:
print("Creating a pass-through VIEW for zoning districts...")

# Even though the data is already clean, this view serves as a stable
# interface for the Silver layer, adhering to our architecture.
# It renames columns to our standard convention and ensures data types.
silver_zoning_query = """
CREATE OR REPLACE VIEW silver.zoning_districts AS
SELECT
    CAST("ID" AS VARCHAR) AS district_id,
    CAST("name" AS VARCHAR) AS district_name
FROM bronze.zoning_districts;
"""

con.execute(silver_zoning_query)
print("✓ View 'silver.zoning_districts' created successfully.")

# --- Verification ---
print("\n--- Verifying the 'silver.zoning_districts' view ---")
display(con.execute("SELECT * FROM silver.zoning_districts LIMIT 5;").df())
display(con.execute("DESCRIBE silver.zoning_districts;").df())

Creating a pass-through VIEW for zoning districts...
✓ View 'silver.zoning_districts' created successfully.

--- Verifying the 'silver.zoning_districts' view ---


Unnamed: 0,district_id,district_name
0,01001,Alegría-Dulantzi
1,01002,Amurrio
2,01004_AM,Artziniega agregacion de municipios
3,01009_AM,Asparrena agregacion de municipios
4,01010,Ayala/Aiara


Unnamed: 0,column_name,column_type,null,key,default,extra
0,district_id,VARCHAR,YES,,,
1,district_name,VARCHAR,YES,,,


Combine cleaned tables into silver_integrated_od

In [None]:
print("Creating integrated OD table with demographics...")

# Create the integrated table by joining mobility and population data
con.execute("""--sql
CREATE OR REPLACE VIEW silver.silver_integrated_od AS
SELECT
    -- Mobility data
    m.trip_date,
    m.hour,
    m.origin_zone_id,
    m.destination_zone_id,
    m.distance_km,
    m.trips_count,
    
    -- Population data for ORIGIN zone
    orig_pop.population_count as origin_population,
    
    -- Population data for DESTINATION zone  
    dest_pop.population_count as destination_population,

FROM silver.cleaned_mobility m
LEFT JOIN silver.cleaned_population orig_pop 
    ON m.origin_zone_id = orig_pop.zone_id
LEFT JOIN silver.cleaned_population dest_pop 
    ON m.destination_zone_id = dest_pop.zone_id
""")

# Verify the results
print("✓ Integrated table created!")
print("\nFirst 5 rows of integrated data:")
result = con.execute("SELECT * FROM silver.silver_integrated_od LIMIT 5").df()
print(result)

print("\nJoin quality check:")
join_quality = con.execute("""
    SELECT 
        COUNT(*) as total_trips,
        COUNT(origin_population) as trips_with_origin_population,
        COUNT(destination_population) as trips_with_destination_population
    FROM silver.silver_integrated_od
""").df()
print(join_quality)


Creating integrated OD table with demographics...
✓ Integrated table created!

First 5 rows of integrated data:
   trip_date  hour origin_zone_id destination_zone_id  distance_km  \
0 2023-05-08     3       01009_AM               01001     0.901000   
1 2023-05-08    18       01009_AM               01001     1.332833   
2 2023-05-08    19       01009_AM               01001     1.034667   
3 2023-05-08    20       01009_AM               01001     1.280889   
4 2023-05-08     7       01009_AM               01001    10.041667   

   trips_count  origin_population  destination_population  
0            3               4599                    2925  
1            6               4599                    2925  
2            6               4599                    2925  
3            9               4599                    2925  
4            3               4599                    2925  

Join quality check:
   total_trips  trips_with_origin_population  \
0    134726205                     134

Combine cleaned tables into silver_integrated_od

In [None]:
print("Creating VIEW 'silver.zone_metrics' with correct district code mapping for rent data...")

# This query has been UPDATED to use the mapping view to translate MITMA district codes
# to INE district codes before joining with the rent data.
zone_metrics_query = """--sql
CREATE OR REPLACE VIEW silver.silver_zone_metrics AS

-- Step 1: Get all unique origin-destination pairs from the mobility data (using MITMA codes).
WITH od_pairs AS (
    SELECT
        origin_zone_id AS mitma_origin_code,
        destination_zone_id AS mitma_destination_code,
        AVG(distance_km) AS avg_distance_km
    FROM silver.cleaned_mobility
    GROUP BY 1, 2
),

-- Step 2: Translate MITMA codes to INE codes using the mapping view.
mapped_od_pairs AS (
    SELECT
        od.mitma_origin_code,
        od.mitma_destination_code,
        od.avg_distance_km,
        map_origin.ine_district_code AS ine_origin_code,
        map_dest.ine_district_code AS ine_destination_code
    FROM od_pairs AS od
    -- Join to translate the origin code
    LEFT JOIN silver.mapping_ine_mitma_districts AS map_origin
        ON od.mitma_origin_code = map_origin.mitma_district_code
    -- Join to translate the destination code
    LEFT JOIN silver.mapping_ine_mitma_districts AS map_dest
        ON od.mitma_destination_code = map_dest.mitma_district_code
)

-- Step 3: Join the mapped pairs with the attribute tables using the correct codes for each join.
SELECT
    mop.mitma_origin_code AS origin_zone_id,
    mop.mitma_destination_code AS destination_zone_id,
    mop.avg_distance_km,
    
    -- Metrics for the ORIGIN zone
    pop_origin.population_count AS origin_population,
    -- Join rent using the translated INE code
    rent_origin.avg_net_rent_eur AS origin_avg_rent_eur,
    
    -- Metrics for the DESTINATION zone
    pop_dest.population_count AS destination_population,
    -- Join rent using the translated INE code
    rent_dest.avg_net_rent_eur AS destination_avg_rent_eur

FROM mapped_od_pairs AS mop

-- Join population using the original MITMA code
LEFT JOIN silver.cleaned_population AS pop_origin 
    ON mop.mitma_origin_code = pop_origin.zone_id
    
-- Join rent using the translated INE code
LEFT JOIN silver.cleaned_district_rent AS rent_origin 
    ON mop.ine_origin_code = rent_origin.district_code

-- Join population using the original MITMA code
LEFT JOIN silver.cleaned_population AS pop_dest 
    ON mop.mitma_destination_code = pop_dest.zone_id

-- Join rent using the translated INE code
LEFT JOIN silver.cleaned_district_rent AS rent_dest 
    ON mop.ine_destination_code = rent_dest.district_code;
"""

con.execute(zone_metrics_query)
print("✓ View 'silver.zone_metrics' (with correct mapping) created successfully.")

# --- Verification ---
# Let's also specifically check a row where the rent is NOT NULL to prove the join worked
print("\nSample of rows with successful rent joins:")
successful_join_query = "SELECT * FROM silver.zone_metrics WHERE origin_avg_rent_eur IS NOT NULL LIMIT 5;"
display(con.execute(successful_join_query).df())

Creating VIEW 'silver.zone_metrics' with correct district code mapping for rent data...
✓ View 'silver.zone_metrics' (with correct mapping) created successfully.

Sample of rows with successful rent joins:


Unnamed: 0,origin_zone_id,destination_zone_id,avg_distance_km,origin_population,origin_avg_rent_eur,destination_population,destination_avg_rent_eur
0,07031,0701301,13.69617,38224,13662,5170,14277
1,07023_AM,0701502,10.966274,6392,17547,7696,14280
2,0704301,0702201,13.686144,3066,14344,3758,13255
3,0700202,07023_AM,11.693411,5659,15135,6392,17547
4,07012_AM,0702702,8.218982,3768,16087,13474,13781


## 5. Gold Layer Aggregation

In [18]:
# --- Setup: Create the 'gold' schema ---
print("--- Creating 'gold' schema if it doesn't exist ---")
con.execute("CREATE SCHEMA IF NOT EXISTS gold;")
print("  - Schema 'gold' is ready.")

--- Creating 'gold' schema if it doesn't exist ---
  - Schema 'gold' is ready.


In [24]:
# This query answers Business Question 1.
# It has been UPDATED to read from silver.silver_integrated_od and
# to derive the province_code from the origin_zone_id.

gold_hourly_patterns_query = """
CREATE OR REPLACE VIEW gold.hourly_mobility_patterns AS
WITH daily_patterns AS (
    SELECT
        EXTRACT(isodow FROM trip_date) AS day_of_week,
        hour,
        origin_zone_id,
        destination_zone_id,
        trips_count
    FROM silver.silver_integrated_od
)
SELECT
    CASE 
        WHEN day_of_week <= 5 THEN 'weekday'
        ELSE 'weekend'
    END AS day_type,
    hour,
    origin_zone_id,
    destination_zone_id,
    AVG(trips_count) AS avg_trips
FROM daily_patterns
GROUP BY 1, 2, 3, 4;
"""
con.execute(gold_hourly_patterns_query)
print("✅ View 'gold.hourly_mobility_patterns' created successfully.")

# --- Corrected Verification ---
# We will use a subquery to get the province_name for filtering. This is a bit
# more complex but correctly uses the data we have.
print("\n--- Verifying the 'gold.hourly_mobility_patterns' view (Corrected) ---")
madrid_pattern_query = """
-- We need a lookup from province code to province name for our filter
WITH province_lookup AS (
    SELECT DISTINCT
        SUBSTRING("Provincias", 1, 2) AS province_code,
        SUBSTRING("Provincias", 4) AS province_name
    FROM bronze.gdp_provinces
    WHERE "Periodo" = '2022' AND "Provincias" NOT LIKE 'Total%' AND LENGTH("Provincias") > 3
)
SELECT
    gp.hour,
    SUM(gp.avg_trips) as total_avg_trips_in_madrid
FROM gold.hourly_mobility_patterns AS gp
-- Join to the lookup table using the province code from the origin_zone_id
JOIN province_lookup AS pl ON SUBSTRING(gp.origin_zone_id, 1, 2) = pl.province_code
WHERE pl.province_name = 'Madrid'
  AND gp.day_type = 'weekday'
GROUP BY gp.hour
ORDER BY gp.hour;
"""
display(con.execute(madrid_pattern_query).df())

✅ View 'gold.hourly_mobility_patterns' created successfully.

--- Verifying the 'gold.hourly_mobility_patterns' view (Corrected) ---


Unnamed: 0,hour,total_avg_trips_in_madrid


In [25]:
# This query answers Business Question 2.
# UPDATED to use your existing Silver views.

gold_gravity_model_query = """
-- First, create a lookup for provincial GDP
CREATE OR REPLACE VIEW gold.province_economics AS
SELECT
    SUBSTRING("Provincias", 1, 2) AS province_code,
    SUBSTRING("Provincias", 4) AS province_name,
    TRY_CAST(REPLACE(REPLACE(Total, '.', ''), ',', '.') AS DOUBLE) AS gdp_euros
FROM bronze.gdp_provinces
WHERE "Periodo" = '2022'
  AND "Provincias" NOT LIKE 'Total%'
  AND LENGTH("Provincias") > 3;

-- Now, build the main gravity model input view
CREATE OR REPLACE VIEW gold.gravity_model_inputs AS
WITH od_summary AS (
    SELECT
        origin_zone_id,
        destination_zone_id,
        SUM(trips_count) AS total_actual_trips,
        AVG(distance_km) AS avg_distance_km,
        -- Get population from the already-joined data
        ANY_VALUE(origin_population) AS origin_population -- Use ANY_VALUE as it's the same for all trips in the group
    FROM silver.silver_integrated_od
    GROUP BY 1, 2
)
SELECT
    od.origin_zone_id,
    od.destination_zone_id,
    od.total_actual_trips,
    od.avg_distance_km,
    od.origin_population,
    -- Get Destination Economic Activity (Ej) by joining our new economics view
    econ.gdp_euros AS destination_province_gdp
FROM od_summary AS od
LEFT JOIN gold.province_economics AS econ
    ON SUBSTRING(od.destination_zone_id, 1, 2) = econ.province_code;
"""
con.execute(gold_gravity_model_query)
print("✅ Views 'gold.province_economics' and 'gold.gravity_model_inputs' created successfully.")

# --- Corrected Verification ---
print("\n--- Verifying the 'gold.gravity_model_inputs' view ---")
display(con.execute("""
    SELECT *
    FROM gold.gravity_model_inputs
    WHERE total_actual_trips > 100
    ORDER BY total_actual_trips DESC
    LIMIT 10;
""").df())

✅ Views 'gold.province_economics' and 'gold.gravity_model_inputs' created successfully.

--- Verifying the 'gold.gravity_model_inputs' view ---


Unnamed: 0,origin_zone_id,destination_zone_id,total_actual_trips,avg_distance_km,origin_population,destination_province_gdp
0,2807908,2807908,2442194.0,3.434795,247327,
1,801910,801910,2144993.0,2.5499,238245,
2,2807911,2807911,1890866.0,2.675785,258064,
3,801902,801902,1886197.0,2.516749,264353,
4,2807916,2807916,1804357.0,3.060398,192809,
5,4902,4902,1771652.0,7.559942,84005,
6,2807913,2807913,1559203.0,2.680747,238577,
7,704004,704004,1494921.0,4.893642,151206,
8,28106,28106,1340829.0,3.024372,131689,
9,2807910,2807910,1330275.0,2.852121,239693,


In [67]:
con.close()