# Sprint 1: Schema Design and Prototyping

#### Initial configuration and conection to duckdb

In [62]:
import duckdb
import pandas as pd
import os
import glob

# --- Configuration ---
# Define paths to your raw data and the future lakehouse layers
RAW_DATA_PATH = '../data/raw'
LAKEHOUSE_PATH = '../data/lakehouse'
LAKEHOUSE_DB_PATH = os.path.join(LAKEHOUSE_PATH, 'lakehouse.duckdb')

# Create the lakehouse directories if they don't exist
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'bronze'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'silver'), exist_ok=True)
os.makedirs(os.path.join(LAKEHOUSE_PATH, 'gold'), exist_ok=True)

# --- DuckDB Connection ---
con = duckdb.connect(database=LAKEHOUSE_DB_PATH, read_only=False)

print(f"DuckDB is now connected to the persistent database file at:")
print(f"-> {LAKEHOUSE_DB_PATH}")

DuckDB is now connected to the persistent database file at:
-> ../data/lakehouse\lakehouse.duckdb


---

#### Step 1: Data Exploration


## 1. Data Exploration: Inspecting the Raw Files

#### Explore a MITMA Mobility File

In [14]:
# Path to one of the daily mobility files
mobility_file_path = os.path.join(RAW_DATA_PATH, 'mitma', '20230508_Viajes_distritos.csv.gz')

# Use DuckDB to directly read and describe the gzipped CSV
# The 'read_csv_auto' function is powerful and can infer types, headers, etc.
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{mobility_file_path}')
    LIMIT 5;
"""
df_mobility_sample = con.execute(query).df()

print("--- Sample of Mobility Data ---")
display(df_mobility_sample)

print("\n--- Schema of Mobility Data ---")
# Let's get the column names and data types as inferred by DuckDB
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{mobility_file_path}');"
df_mobility_schema = con.execute(query_desc).df()
display(df_mobility_schema)

--- Sample of Mobility Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,2.521,2.703
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,7.997
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,6.162,6.208
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,False,1,<10,,,9.03,11.528
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,False,1,<10,,,2.685,30.125



--- Schema of Mobility Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,BIGINT,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,BOOLEAN,YES,,,
9,residencia,VARCHAR,YES,,,


#### Explore the MITMA Zoning File

In [15]:
# Path to the district names file
zoning_file_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')

# Load and inspect the zoning file
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{zoning_file_path}')
    LIMIT 5;
"""
df_zoning_sample = con.execute(query).df()

print("--- Sample of Zoning Data ---")
display(df_zoning_sample)

print("\n--- Schema of Zoning Data ---")
query_desc = f"DESCRIBE SELECT * FROM read_csv_auto('{zoning_file_path}');"
df_zoning_schema = con.execute(query_desc).df()
display(df_zoning_schema)

--- Sample of Zoning Data ---


Unnamed: 0,ID,name
0,01001,Alegría-Dulantzi
1,01002,Amurrio
2,01004_AM,Artziniega agregacion de municipios
3,01009_AM,Asparrena agregacion de municipios
4,01010,Ayala/Aiara



--- Schema of Zoning Data ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,ID,VARCHAR,YES,,,
1,name,VARCHAR,YES,,,


#### Explore the INE Economic File

In [16]:
# Path to the INE GDP file
ine_file_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_provincial_gdp_2000-2022.csv') # Or whatever you named it

# This file is semicolon-separated, so we tell DuckDB explicitly
query = f"""--sql
    SELECT *
    FROM read_csv_auto('{ine_file_path}', sep=';')
    LIMIT 5;
"""
df_ine_sample = con.execute(query).df()

print("--- Sample of INE Economic Data ---")
display(df_ine_sample)

--- Sample of INE Economic Data ---


Unnamed: 0,Provincias,Ramas de actividad,Periodo,Total
0,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2022 (P),9.485.962
1,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2021,8.853.382
2,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2020,8.010.434
3,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2019,8.627.212
4,Albacete,PRODUCTO INTERIOR BRUTO A PRECIOS DE MERCADO,2018,8.285.269


---

#### Step 2: Data Ingestion

---
### 2.1 Bronze ingestion

#### 2.1.1 Mobility files

In [17]:

# --- 1. Define File Paths ---
# Source: All gzipped CSV files for the week in the raw/mitma directory
mitma_raw_glob_path = os.path.join(RAW_DATA_PATH, 'mitma', '*_Viajes_distritos.csv.gz')
# Destination: A single Parquet file in the bronze layer
bronze_mobility_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mobility_sample_week.parquet')

# --- 2. Find all the raw mobility files ---
mobility_files = glob.glob(mitma_raw_glob_path)
# It's good practice to print the files you've found to ensure it's working
print("Found the following mobility files to ingest:")
for f in mobility_files:
    print(f" - {os.path.basename(f)}")

# --- 3. Construct and Execute the Ingestion Query ---
# The query reads all CSVs at once, adds metadata, and copies the result to a Parquet file.
# DuckDB's read_csv_auto can take a list of files.
# We also use 'filename=true' to automatically add a column with the source filename.
ingestion_query = f"""--sql
    COPY (
        SELECT 
            *,
            CURRENT_TIMESTAMP AS ingestion_timestamp
        FROM read_csv_auto({mobility_files}, filename=true, all_varchar=true) -- <--- THIS IS THE FIX
    ) TO '{bronze_mobility_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);
"""

# Execute the query
con.execute(ingestion_query)

print(f"\n✅ Successfully ingested {len(mobility_files)} files into a single Bronze Parquet file:")
print(f"   -> {bronze_mobility_path}")

Found the following mobility files to ingest:
 - 20230508_Viajes_distritos.csv.gz
 - 20230509_Viajes_distritos.csv.gz
 - 20230510_Viajes_distritos.csv.gz
 - 20230511_Viajes_distritos.csv.gz
 - 20230512_Viajes_distritos.csv.gz
 - 20230513_Viajes_distritos.csv.gz
 - 20230514_Viajes_distritos.csv.gz

✅ Successfully ingested 7 files into a single Bronze Parquet file:
   -> ../data/lakehouse\bronze\mobility_sample_week.parquet


In [18]:
# --- 4. Verification ---
# Let's read back from the new Parquet file to verify it was created correctly.
print("\n--- Verifying the Bronze Data ---")
verification_query = f"SELECT * FROM '{bronze_mobility_path}' LIMIT 5;"
bronze_sample_df = con.execute(verification_query).df()
display(bronze_sample_df)

print("\n--- Verifying the Bronze Schema ---")
schema_query = f"DESCRIBE FROM '{bronze_mobility_path}';"
bronze_schema_df = con.execute(schema_query).df()
display(bronze_schema_df)


--- Verifying the Bronze Data ---


Unnamed: 0,fecha,periodo,origen,destino,distancia,actividad_origen,actividad_destino,estudio_origen_posible,estudio_destino_posible,residencia,renta,edad,sexo,viajes,viajes_km,filename,ingestion_timestamp
0,20230508,3,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,2.521,2.703,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 16:07:44.032928+01:00
1,20230508,18,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,7.997,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 16:07:44.032928+01:00
2,20230508,19,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,6.162,6.208,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 16:07:44.032928+01:00
3,20230508,20,01009_AM,1001,0.5-2,frecuente,casa,no,no,1,<10,,,9.03,11.528,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 16:07:44.032928+01:00
4,20230508,7,01009_AM,1001,10-50,frecuente,casa,no,no,1,<10,,,2.685,30.125,../data/raw\\mitma\\20230508_Viajes_distritos....,2025-11-14 16:07:44.032928+01:00



--- Verifying the Bronze Schema ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,fecha,VARCHAR,YES,,,
1,periodo,VARCHAR,YES,,,
2,origen,VARCHAR,YES,,,
3,destino,VARCHAR,YES,,,
4,distancia,VARCHAR,YES,,,
5,actividad_origen,VARCHAR,YES,,,
6,actividad_destino,VARCHAR,YES,,,
7,estudio_origen_posible,VARCHAR,YES,,,
8,estudio_destino_posible,VARCHAR,YES,,,
9,residencia,VARCHAR,YES,,,


### 2.1.2 Supporting MITMA and INE Data into Bronze

In [None]:
# --- Ingest nombres_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'nombres_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'zoning_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested zoning names to: {dest_path}")

# --- Ingest poblacion_distritos.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'poblacion_distritos.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'population_districts.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested population data to: {dest_path}")

# --- Ingest relacion_ine_zonificacionMitma.csv ---
source_path = os.path.join(RAW_DATA_PATH, 'mitma', 'relacion_ine_zonificacionMitma.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'mapping_ine_mitma.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true)) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE-MITMA mapping to: {dest_path}")

# --- Ingest INE GDP data --- (semicolon-separated)
source_path = os.path.join(RAW_DATA_PATH, 'ine', 'ine_provincial_gdp_2000-2022.csv')
dest_path = os.path.join(LAKEHOUSE_PATH, 'bronze', 'gdp_provinces.parquet')
query = f"COPY (SELECT * FROM read_csv_auto('{source_path}', all_varchar=true, sep=';')) TO '{dest_path}' (FORMAT PARQUET, OVERWRITE_OR_IGNORE 1);"
con.execute(query)
print(f"✅ Ingested INE GDP data to: {dest_path}")

✅ Ingested zoning names to: ../data/lakehouse\bronze\zoning_districts.parquet
✅ Ingested population data to: ../data/lakehouse\bronze\population_districts.parquet
✅ Ingested INE-MITMA mapping to: ../data/lakehouse\bronze\mapping_ine_mitma.parquet
✅ Ingested INE GDP data to: ../data/lakehouse\bronze\gdp_provinces.parquet


In [20]:
import glob

# --- 1. Create the 'bronze' schema if it doesn't exist ---
# This command creates the "drawer" in our database.
print("--- Creating database schemas (if they don't exist) ---")
con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")
print("  - Schema 'bronze' is ready.")

# --- 2. Find all the Parquet files ---
bronze_files = glob.glob(os.path.join(LAKEHOUSE_PATH, 'bronze', '*.parquet'))

# --- 3. Register the files as views INSIDE the new schema ---
print("\n--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---")
for file_path in bronze_files:
    # Sanitize the filename to create a valid view name
    view_name = os.path.basename(file_path).replace('.parquet', '')
    
    query = f"CREATE OR REPLACE VIEW bronze.{view_name} AS SELECT * FROM read_parquet('{file_path}');"
    con.execute(query)
    print(f"  - View 'bronze.{view_name}' created.")
    
print("\n✅ All Bronze files are now visible to external tools under the 'bronze' schema.")

--- Creating database schemas (if they don't exist) ---
  - Schema 'bronze' is ready.

--- Registering Bronze Parquet files as persistent VIEWS in DuckDB ---
  - View 'bronze.gdp_provinces' created.
  - View 'bronze.mapping_ine_mitma' created.
  - View 'bronze.mobility_sample_week' created.
  - View 'bronze.population_districts' created.
  - View 'bronze.zoning_districts' created.

✅ All Bronze files are now visible to external tools under the 'bronze' schema.


---
### 2.2 Silver layer

In [None]:
# --- Setup: Create the 'silver' schema ---
print("--- Creating 'silver' schema if it doesn't exist ---")
con.execute("CREATE SCHEMA IF NOT EXISTS silver;")
print("  - Schema 'silver' is ready.")

--- Creating 'silver' schema if it doesn't exist ---
  - Schema 'silver' is ready.


Cleaned version of mobility data

In [56]:
# Clean the mobility data
print("Cleaning mobility data...")

con.execute("""
-- Create cleaned version of mobility data
CREATE OR REPLACE TABLE silver.cleaned_mobility AS
SELECT
    -- Fix dates (convert 20230508 → 2023-05-08)
    CAST(
        SUBSTR(fecha, 1, 4) || '-' || 
        SUBSTR(fecha, 5, 2) || '-' || 
        SUBSTR(fecha, 7, 2) 
    AS DATE) as trip_date,
    
    -- Fix hour (ensure it's a number)
    CAST(periodo AS INTEGER) as hour,
    
    -- Clean zone IDs
    TRIM(CAST(origen AS VARCHAR)) as origin_zone_id,
    TRIM(CAST(destino AS VARCHAR)) as destination_zone_id,
    
    -- Clean distance (handle messy values)
    CASE 
        WHEN distancia ~ '^[0-9]+\\.?[0-9]*$' THEN CAST(distancia AS DOUBLE)  -- Normal numbers
        WHEN distancia = '0.5-2' THEN 1.25  -- Specific case: use average
        WHEN distancia = '2-10' THEN 6.0    -- Specific case: use average  
        WHEN distancia = '10-50' THEN 30.0  -- Specific case: use average
        WHEN distancia = '>50' THEN 75.0    -- Specific case: estimate
        ELSE NULL  -- Can't convert? Set to NULL
    END as distance_km,
    
    -- Clean trips (ensure it's a number)
    CAST(viajes AS INTEGER) as trips_count,
    
    -- Keep other columns as-is for now
    actividad_origen as raw_origin_activity,
    actividad_destino as raw_destination_activity,
    filename,
    ingestion_timestamp

FROM bronze.mobility_sample_week
WHERE 
    -- Remove bad records
    fecha IS NOT NULL 
    AND origen IS NOT NULL 
    AND destino IS NOT NULL 
    AND viajes IS NOT NULL
""")

# STEP 2: Check our results
print("✓ Cleaning complete!")
print("\nFirst 5 cleaned rows:")
result = con.execute("SELECT * FROM silver.cleaned_mobility LIMIT 5").df()
print(result)

print("\nData types:")
schema = con.execute("DESCRIBE silver.cleaned_mobility").df()
print(schema)

Cleaning mobility data...
✓ Cleaning complete!

First 5 cleaned rows:
   trip_date  hour origin_zone_id destination_zone_id  distance_km  \
0 2023-05-08     3       01009_AM               01001         1.25   
1 2023-05-08    18       01009_AM               01001         1.25   
2 2023-05-08    19       01009_AM               01001         1.25   
3 2023-05-08    20       01009_AM               01001         1.25   
4 2023-05-08     7       01009_AM               01001        30.00   

   trips_count raw_origin_activity raw_destination_activity  \
0            3           frecuente                     casa   
1            6           frecuente                     casa   
2            6           frecuente                     casa   
3            9           frecuente                     casa   
4            3           frecuente                     casa   

                                            filename  \
0  ../data/raw\\mitma\\20230508_Viajes_distritos....   
1  ../data/raw\\mi

In [57]:
print("=== DATA QUALITY CHECKS ===")

# Check for any remaining data issues
print("\n1. Distance values summary:")
dist_check = con.execute("""
    SELECT 
        COUNT(*) as total_records,
        COUNT(distance_km) as records_with_distance,
        AVG(distance_km) as avg_distance,
        MIN(distance_km) as min_distance,
        MAX(distance_km) as max_distance
    FROM silver.cleaned_mobility
""").df()
print(dist_check)

print("\n2. Sample of unique zone IDs (check for formatting):")
zones_check = con.execute("""
    SELECT DISTINCT origin_zone_id 
    FROM silver.cleaned_mobility 
    LIMIT 10
""").df()
print(zones_check)

print("\n3. Date range:")
date_range = con.execute("""
    SELECT MIN(trip_date) as start_date, MAX(trip_date) as end_date 
    FROM silver.cleaned_mobility
""").df()
print(date_range)


=== DATA QUALITY CHECKS ===

1. Distance values summary:
   total_records  records_with_distance  avg_distance  min_distance  \
0      134726205              134726205     13.968322          1.25   

   max_distance  
0          75.0  

2. Sample of unique zone IDs (check for formatting):
  origin_zone_id
0          15009
1       15047_AM
2          15087
3        1503003
4        1505803
5        1508201
6        2804902
7          15067
8          15042
9          28022

3. Date range:
  start_date   end_date
0 2023-05-08 2023-05-14


Clean Population Data

In [58]:


print("Cleaning population data...")

# First, let's check what the raw population data looks like
print("\nSample of raw population data:")
pop_sample = con.execute("SELECT * FROM bronze.population_districts LIMIT 5").df()
print(pop_sample)

print("\nColumn names in population data:")
pop_columns = con.execute("PRAGMA table_info(bronze.population_districts)").df()
print(pop_columns)


Cleaning population data...

Sample of raw population data:
    column0  column1
0     01001   2925.0
1     01002  10307.0
2  01004_AM   3005.0
3  01009_AM   4599.0
4     01010   2951.0

Column names in population data:
   cid     name     type  notnull dflt_value     pk
0    0  column0  VARCHAR    False       None  False
1    1  column1  VARCHAR    False       None  False


In [59]:
print("Cleaning population data...")

# Create cleaned population table
con.execute("""
CREATE OR REPLACE TABLE silver.cleaned_population AS
SELECT
    -- Clean zone IDs (remove any whitespace)
    TRIM(CAST(column0 AS VARCHAR)) as zone_id,
    
    -- Clean population count (handle 'NA' values and convert to integer)
    CASE 
        WHEN column1 = 'NA' THEN NULL
        WHEN TRIM(column1) = '' THEN NULL
        ELSE CAST(CAST(column1 AS DOUBLE) AS INTEGER)
    END as population_count

FROM bronze.population_districts
WHERE 
    -- Remove records with missing zone IDs
    column0 IS NOT NULL 
    AND column1 IS NOT NULL
    AND TRIM(column0) != ''  -- Remove empty zone IDs
""")

# Verify the results
print("✓ Population data cleaned!")
print("\nFirst 5 cleaned population rows:")
result = con.execute("SELECT * FROM silver.cleaned_population LIMIT 5").df()
print(result)

print("\nData types:")
schema = con.execute("DESCRIBE silver.cleaned_population").df()
print(schema)

print("\nPopulation data quality summary:")
quality_check = con.execute("""
    SELECT 
        COUNT(*) as total_records,
        COUNT(population_count) as records_with_population,
        COUNT(*) - COUNT(population_count) as records_with_null_population,
        AVG(population_count) as avg_population
    FROM silver.cleaned_population
""").df()
print(quality_check)


Cleaning population data...
✓ Population data cleaned!

First 5 cleaned population rows:
    zone_id  population_count
0     01001              2925
1     01002             10307
2  01004_AM              3005
3  01009_AM              4599
4     01010              2951

Data types:
        column_name column_type null   key default extra
0           zone_id     VARCHAR  YES  None    None  None
1  population_count     INTEGER  YES  None    None  None

Population data quality summary:
   total_records  records_with_population  records_with_null_population  \
0           3792                     3743                            49   

   avg_population  
0    12659.659898  


Combine cleaned tables

In [60]:
print("Creating integrated OD table with demographics...")

# Create the integrated table by joining mobility and population data
con.execute("""
CREATE OR REPLACE TABLE silver.silver_integrated_od AS
SELECT
    -- Mobility data
    m.trip_date,
    m.hour,
    m.origin_zone_id,
    m.destination_zone_id,
    m.distance_km,
    m.trips_count,
    
    -- Population data for ORIGIN zone
    orig_pop.population_count as origin_population,
    
    -- Population data for DESTINATION zone  
    dest_pop.population_count as destination_population,
    
    -- Keep raw values for reference
    m.raw_origin_activity,
    m.raw_destination_activity,
    m.filename

FROM silver.cleaned_mobility m
LEFT JOIN silver.cleaned_population orig_pop 
    ON m.origin_zone_id = orig_pop.zone_id
LEFT JOIN silver.cleaned_population dest_pop 
    ON m.destination_zone_id = dest_pop.zone_id
""")

# Verify the results
print("✓ Integrated table created!")
print("\nFirst 5 rows of integrated data:")
result = con.execute("SELECT * FROM silver.silver_integrated_od LIMIT 5").df()
print(result)

print("\nJoin quality check:")
join_quality = con.execute("""
    SELECT 
        COUNT(*) as total_trips,
        COUNT(origin_population) as trips_with_origin_population,
        COUNT(destination_population) as trips_with_destination_population
    FROM silver.silver_integrated_od
""").df()
print(join_quality)


Creating integrated OD table with demographics...
✓ Integrated table created!

First 5 rows of integrated data:
   trip_date  hour origin_zone_id destination_zone_id  distance_km  \
0 2023-05-08     3       01009_AM               01001         1.25   
1 2023-05-08    18       01009_AM               01001         1.25   
2 2023-05-08    19       01009_AM               01001         1.25   
3 2023-05-08    20       01009_AM               01001         1.25   
4 2023-05-08     7       01009_AM               01001        30.00   

   trips_count  origin_population  destination_population raw_origin_activity  \
0            3               4599                    2925           frecuente   
1            6               4599                    2925           frecuente   
2            6               4599                    2925           frecuente   
3            9               4599                    2925           frecuente   
4            3               4599                    2925     

Create parquet files

In [None]:
# Ensure silver directory exists
os.makedirs('data/lakehouse/silver', exist_ok=True)

print("Exporting Silver tables to Parquet files...")

# Export each table to Parquet
tables = ['cleaned_mobility', 'cleaned_population', 'silver_integrated_od']

for table in tables:
    parquet_path = f'../data/lakehouse/silver/{table}.parquet'
    con.execute(f"""
    COPY silver.{table} 
    TO '{parquet_path}' 
    (FORMAT PARQUET)
    """)
    print(f"✓ Exported silver.{table} to {parquet_path}")

# Verify the files were created
print("\nChecking created Parquet files:")
import glob
parquet_files = glob.glob('../data/lakehouse/silver/*.parquet')
for file in parquet_files:
    file_size = os.path.getsize(file) / (1024*1024)
    print(f"  - {os.path.basename(file)} ({file_size:.2f} MB)")



Exporting Silver tables to Parquet files...
✓ Exported silver.cleaned_mobility to ../data/lakehouse/silver/cleaned_mobility.parquet
✓ Exported silver.cleaned_population to ../data/lakehouse/silver/cleaned_population.parquet
✓ Exported silver.silver_integrated_od to ../data/lakehouse/silver/silver_integrated_od.parquet

Checking created Parquet files:


Final check

In [None]:
print("=== SILVER LAYER COMPLETION CHECK ===")

# Check tables exist
print("1. Tables in silver schema:")
tables = con.execute("SHOW TABLES FROM silver").df()
print(tables)

# Check Parquet files exist
print("\n2. Parquet files in ../data/lakehouse/silver/:")
parquet_files = glob.glob('../data/lakehouse/silver/*.parquet')
for file in parquet_files:
    size_mb = os.path.getsize(file) / (1024*1024)
    print(f"   - {os.path.basename(file)} ({size_mb:.1f} MB)")

# Verify record counts match
print("\n3. Data consistency check:")
for table in ['cleaned_mobility', 'cleaned_population', 'silver_integrated_od']:
    count = con.execute(f"SELECT COUNT(*) as count FROM silver.{table}").df()
    print(f"   - {table}: {count.iloc[0]['count']:,} records")

=== SILVER LAYER COMPLETION CHECK ===
1. Tables in silver schema:
                   name
0      cleaned_mobility
1    cleaned_population
2  silver_integrated_od

2. Parquet files in ../data/lakehouse/silver/:
   - cleaned_mobility.parquet (275.3 MB)
   - cleaned_population.parquet (0.0 MB)
   - silver_integrated_od.parquet (372.7 MB)

3. Data consistency check:
   - cleaned_mobility: 134,726,205 records
   - cleaned_population: 3,792 records
   - silver_integrated_od: 134,726,205 records
