# Experimenting with processing this file. Still need to figure out how to structure this file
In summary:
- Some column types should be fixed. For example: `sac_type` should not be `Integer64`, `bu_use` should be `Int8`, and `civic_no` should be `Int32`.

In [1]:
#!/usr/bin/env python
# coding: utf-8
import gc
import glob
import os
import sys

import buckaroo
import duckdb
from IPython.core.interactiveshell import InteractiveShell
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.engine import URL

# Enable multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"
# Show all columns
pd.set_option('display.max_columns', None)

# Connection settings are read from the environment; os.environ.get() yields
# None for any variable that is not set.
DATABASE = os.environ.get("POSTGRES_DB")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

# Build the URL from its components instead of formatting a string, so that
# credentials containing URL-reserved characters (e.g. "@", ":" or "/") are
# escaped correctly rather than corrupting the connection string.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=USER,
        password=PASSWORD,
        host="db",
        port=5432,
        database=DATABASE,
    )
)

Buckaroo has been enabled as the default DataFrame viewer.  To return to default dataframe visualization use `from buckaroo import disable; disable()`


In [39]:
# Root folder of the extracted National Address Register files; the vintage
# sub-folder (e.g. 2024-06) is appended where the CSVs are globbed.
input_folder = '/data/national_address_register/extracted'

# 1. Process 2024-06 vintage

In [8]:
# glob.glob() returns matches in arbitrary order, so the lists are sorted to
# keep the load order (and the resulting row order) reproducible between runs.
nar_addresses_csvs = sorted(glob.glob(f'{input_folder}/2024-06/Addresses/*.csv'))
nar_locations_csvs = sorted(glob.glob(f'{input_folder}/2024-06/Locations/*.csv'))
# Text encoding handed to pd.read_csv for every file
encoding = 'utf-8'

In [12]:
def process_nar_locations_csvs(csvs_to_process, encoding):
    """
    Read National Address Register location CSVs into a single dataframe.

    1. Reads subset of fields for National Address Register locations
       (LOC_GUID, REPPOINT_LATITUDE, REPPOINT_LONGITUDE)
    2. Lowercases the column names
    3. Appends all of the processed CSVs as one dataframe with a fresh
       0..n-1 index

    Parameters
    ----------
    csvs_to_process : iterable of str
        Paths of the location CSV files to read.
    encoding : str
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of every CSV, in the order the files were given.

    Raises
    ------
    ValueError
        If ``csvs_to_process`` is empty (for example when a glob pattern
        matched no files).
    """
    # Materialise once so generators are supported and emptiness can be checked
    csv_paths = list(csvs_to_process)
    if not csv_paths:
        raise ValueError(
            "No National Address Register location CSVs to process"
        )

    columns_to_read = ['LOC_GUID', 'REPPOINT_LATITUDE', 'REPPOINT_LONGITUDE']

    dataframes_to_concatenate = []
    for filename in csv_paths:
        print(f"Processing {filename}")
        nar_location_df = pd.read_csv(
            filename,
            encoding=encoding,
            usecols=columns_to_read,
        )
        # Lowercase columns
        nar_location_df.columns = [x.lower() for x in nar_location_df.columns]
        dataframes_to_concatenate.append(nar_location_df)

    print("Concatenating all dataframes into one")
    # ignore_index avoids repeating each file's 0..k index in the result and
    # matches process_nar_addresses_csvs
    nar_locations_df = pd.concat(dataframes_to_concatenate, ignore_index=True)

    return nar_locations_df

def process_nar_addresses_csvs(csvs_to_process, encoding):
    """
    Read National Address Register address CSVs into a single dataframe.

    1. Reads subset of fields for National Address Register addresses
    2. Lowercases the column names
    3. Appends all of the processed CSVs as one dataframe with a fresh
       0..n-1 index

    Parameters
    ----------
    csvs_to_process : iterable of str
        Paths of the address CSV files to read.
    encoding : str
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of every CSV, in the order the files were given. ``civic_no``
        is read as nullable ``Int32`` and ``bu_use`` as nullable ``Int8``;
        the ``bg_dls_*`` columns are read as ``object``.

    Raises
    ------
    ValueError
        If ``csvs_to_process`` is empty (for example when a glob pattern
        matched no files).
    """
    # Materialise once so generators are supported and emptiness can be checked
    csv_paths = list(csvs_to_process)
    if not csv_paths:
        raise ValueError(
            "No National Address Register address CSVs to process"
        )

    columns_to_read = [
        'LOC_GUID',
        'ADDR_GUID',
        'APT_NO_LABEL',
        'CIVIC_NO',
        'CIVIC_NO_SUFFIX',
        'OFFICIAL_STREET_NAME',
        'OFFICIAL_STREET_TYPE',
        'OFFICIAL_STREET_DIR',
        'MAIL_STREET_NAME',
        'MAIL_STREET_TYPE',
        # NOTE(review): "STEET" (sic) is kept as-is; it presumably mirrors the
        # header in the source CSVs, and downstream SQL reads mail_steet_dir.
        'MAIL_STEET_DIR',
        'MAIL_MUN_NAME',
        'MAIL_POSTAL_CODE',
        'BG_DLS_LSD',
        'BG_DLS_QTR',
        'BG_DLS_SCTN',
        'BG_DLS_RNG',
        'BG_DLS_MRD',
        # Removing since REPPOINT_LATITUDE and REPPOINT_LONGITUDE seem to have same purpose
        #'BG_X',
        #'BG_Y',
        'BU_USE',
        'BU_N_CIVIC_ADD',
    ]
    # Nullable integer dtypes keep missing values without falling back to
    # float; the BG_DLS_* fields are read as object so their text is kept.
    # NOTE(review): PROV_CODE and BG_DLS_TWNSHP are not in columns_to_read, so
    # these two entries should have no effect - confirm whether BG_DLS_TWNSHP
    # was meant to be read.
    column_dtypes = {
        "CIVIC_NO": "Int32",
        "PROV_CODE": object,
        "BU_USE": "Int8",
        "BG_DLS_LSD": object,
        "BG_DLS_QTR": object,
        "BG_DLS_SCTN": object,
        "BG_DLS_TWNSHP": object,
        "BG_DLS_RNG": object,
        "BG_DLS_MRD": object,
    }

    dataframes_to_concatenate = []
    for filename in csv_paths:
        print(f"Processing {filename}")
        nar_address_df = pd.read_csv(
            filename,
            encoding=encoding,
            usecols=columns_to_read,
            dtype=column_dtypes,
        )
        # Lowercase columns
        nar_address_df.columns = [x.lower() for x in nar_address_df.columns]
        dataframes_to_concatenate.append(nar_address_df)

    print("Concatenating all dataframes into one")
    nar_addresses_df = pd.concat(dataframes_to_concatenate, ignore_index=True)

    return nar_addresses_df

In [13]:
# Load every 2024-06 location and address CSV into one dataframe each; both
# loaders print the name of each file as it is read.
nar_locations = process_nar_locations_csvs(nar_locations_csvs, encoding)
nar_addresses = process_nar_addresses_csvs(nar_addresses_csvs, encoding)

Processing /data/national_address_register/extracted/2024-06/Locations/Location_10.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_11.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_12.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_13.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_1.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_2.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_3.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_4.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_1.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_2.csv
Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_

# TODO
- Look into why there are locations with an empty reppoint_latitude and reppoint_longitude.
    - There are 84,285 records that have an empty reppoint_latitude and reppoint_longitude.

In [14]:
print("Combining nar_addresses and nar_locations")
# The query refers to the nar_addresses / nar_locations dataframes by their
# Python variable names. Each address is matched to its location on loc_guid,
# and locations missing either coordinate are excluded because both values
# are needed to build a point geometry afterwards.
nar_addresses_combined = duckdb.sql("""
SELECT a.addr_guid, a.apt_no_label, a.civic_no, a.civic_no_suffix, a.official_street_name, a.mail_street_name, a.official_street_type, a.mail_street_type,
       a.official_street_dir AS official_street_direction, a.mail_steet_dir AS mail_street_direction, a.mail_postal_code, a.mail_mun_name AS mail_municipality_name,
       a.bu_n_civic_add, a.bu_use,
       a.bg_dls_lsd, a.bg_dls_qtr, a.bg_dls_sctn, a.bg_dls_rng, a.bg_dls_mrd,
       b.reppoint_latitude, b.reppoint_longitude
FROM nar_addresses AS a
INNER JOIN nar_locations AS b
    ON a.loc_guid = b.loc_guid
WHERE b.reppoint_latitude IS NOT NULL
  AND b.reppoint_longitude IS NOT NULL
""").df()

# The source dataframes are no longer needed once the join is materialised;
# drop the references and collect to release their memory.
del nar_addresses
del nar_locations
gc.collect()

Combining nar_addresses and nar_locations


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

616

In [15]:
# Turn the combined addresses into a geodataframe: one point per row built
# from reppoint_longitude (x) and reppoint_latitude (y), tagged as EPSG:4326.
gdf = gpd.GeoDataFrame(
    data=nar_addresses_combined,
    geometry=gpd.points_from_xy(
        x=nar_addresses_combined["reppoint_longitude"],
        y=nar_addresses_combined["reppoint_latitude"],
    ),
    crs="EPSG:4326",
)

In [16]:
# The coordinates were already used to build the geometry column, so the raw
# columns are removed; inplace=True mutates gdf instead of returning a new
# object.
print("Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe")
gdf.drop(columns=["reppoint_latitude", "reppoint_longitude"], 
         inplace=True)

Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe


In [17]:
# Drop the notebook's reference to the joined dataframe and run a collection
# pass; gc.collect() returns the number of unreachable objects it found.
del nar_addresses_combined
gc.collect()

0

In [35]:
print("Loading geodataframe to PostgreSQL as bronze.nar_2024_06")
# if_exists="replace" makes this cell re-runnable: the default ("fail") raises
# when bronze.nar_2024_06 already exists, whereas the silver step that follows
# already rebuilds its table from scratch. Rows are written in batches of
# 150,000 (chunksize).
gdf.to_postgis(name="nar_2024_06",
               schema='bronze',
               con=engine,
               if_exists="replace",
               chunksize=150000)

Loading geodataframe to PostgreSQL as bronze.nar_2024_06


In [36]:
# The geodataframe is not used after the load above; drop the reference and
# run a collection pass.
del gdf
gc.collect()

3477

## Link to 2021 geographies
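There are 10 records that were not linked to 2021 geographies. Because the query below keeps only addresses whose point intersects a row of `silver.db_2021_digital`, those records are absent from `silver.nar_2024_06`.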

In [37]:
# SQL script that rebuilds silver.nar_2024_06 from scratch:
#   1. drops the table if it already exists,
#   2. recreates it by attaching the attributes of every
#      silver.db_2021_digital row whose geom intersects an address point of
#      bronze.nar_2024_06 (comma join + WHERE, i.e. addresses that intersect
#      no row are not carried over; DISTINCT removes exact duplicate rows),
#   3. adds a GiST index on the geom column.
sql = """
DROP TABLE IF EXISTS silver.nar_2024_06;
CREATE TABLE silver.nar_2024_06 AS
SELECT DISTINCT
    b.country_dguid,
    b.country_en_name,
    b.country_fr_name,
    b.country_en_abbreviation,
    b.country_fr_abbreviation,
    b.grc_dguid,
    b.grc_en_name,
    b.grc_fr_name,
    b.pr_dguid,
    b.pr_en_name,
    b.pr_fr_name,
    b.pr_en_abbreviation,
    b.pr_fr_abbreviation,
    b.pr_iso_code,
    b.car_dguid,
    b.car_en_name,
    b.car_fr_name,
    b.er_dguid,
    b.er_name,
    b.cd_dguid,
    b.cd_name,
    b.cd_type,
    b.ccs_dguid,
    b.ccs_name,
    b.cma_dguid,
    b.cma_p_dguid,
    b.cma_name,
    b.cma_type,
    b.csd_dguid,
    b.csd_name,
    b.csd_type,
    b.sac_type,
    b.sac_code,
    b.fed_dguid,
    b.fed_name,
    b.fed_en_name,
    b.fed_fr_name,
    b.ct_dguid,
    b.ada_dguid,
    b.da_dguid,
    b.db_dguid,
    a.addr_guid,
    a.apt_no_label,
    a.civic_no,
    a.civic_no_suffix,
    a.official_street_name, 
    a.mail_street_name, 
    a.official_street_type,
    a.mail_street_type,
    a.official_street_direction,
    a.mail_street_direction,
    a.mail_postal_code,
    a.mail_municipality_name,
    a.bu_n_civic_add,
    a.bu_use,
    a.bg_dls_lsd,
    a.bg_dls_qtr,
    a.bg_dls_sctn,
    a.bg_dls_rng,
    a.bg_dls_mrd,
    a.geometry AS geom
FROM bronze.nar_2024_06 AS a,
     silver.db_2021_digital AS b
WHERE ST_Intersects(a.geometry, b.geom);

-- Create spatial index
CREATE INDEX nar_2024_06_geom_idx ON silver.nar_2024_06 USING gist (geom) WITH (
    fillfactor = 100
);
"""

In [38]:
# Send the whole script in one execute() call and commit explicitly; the
# connection is closed when the with-block exits.
with engine.connect() as conn:
    conn.execute(text(sql))
    conn.commit()

<sqlalchemy.engine.cursor.CursorResult at 0x7f523a52d940>