In [5]:
#!/usr/bin/env python
# coding: utf-8
import os
import sys

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

placenames_2021_csv = "/home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv"

DATABASE = os.environ.get("POSTGRES_DB")
HOST = os.environ.get("WAREHOUSE_PG_HOST")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

#engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}")

print(f"Reading {placenames_2021_csv}")
placenames = pd.read_csv(filepath_or_buffer=placenames_2021_csv,
                         encoding='latin-1',
                         usecols=['PNdguid', 'PNname', 'PNsource', 'PNrplat', 'PNrplong'])

placenames.rename(columns={
    'PNdguid': 'pn_dguid',
    'PNname': 'pn_name',
    'PNsource': 'pn_source',
    'PNrplat': 'latitude',
    'PNrplong': 'longitude'
}, inplace=True)

Reading /home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv


In [7]:
placenames[placenames['pn_dguid'] == '2021S0515005422'].to_records()

rec.array([(4269, '2021S0515005422', 'Cascapédia\x96Saint-Jules', 1, 48.25, -65.9166667)],
          dtype=[('index', '<i8'), ('pn_dguid', 'O'), ('pn_name', 'O'), ('pn_source', '<i8'), ('latitude', '<f8'), ('longitude', '<f8')])

In [27]:
special_unicodes = []
for record in placenames.to_records():
    pn_dguid = record[1]
    pn_name = record[2]
    if r'\x' in repr(pn_name):
        special_unicodes.append((pn_dguid, pn_name))

In [32]:
#print(special_unicodes)
len(special_unicodes)
#dguids_affected = [x[0] for x in special_unicodes]

19

In [30]:
dguids_affected

['2021S0515005422',
 '2021S0515007864',
 '2021S0515017557',
 '2021S0515019487',
 '2021S0515019731',
 '2021S0515022795',
 '2021S0515024311',
 '2021S0515028429',
 '2021S0515030028',
 '2021S0515030168',
 '2021S0515030432',
 '2021S0515031197',
 '2021S0515031295',
 '2021S0515031660',
 '2021S0515032370',
 '2021S0515038300',
 '2021S0515038389',
 '2021S0515040448',
 '2021S0515040522']

In [16]:
r'\x' in r'Cascapédia\x96Saint-Jules'

True

In [None]:
print("Creating geodataframe from placenames file")
gdf = gpd.GeoDataFrame(
    placenames, 
    geometry=gpd.points_from_xy(placenames.longitude,
                                placenames.latitude),
    crs="EPSG:4326"
)

print("Dropping 'latitude', 'longitude' from geodataframe")
gdf.drop(columns=["latitude", "longitude"], 
         inplace=True)

print(f"Loading geodatframe to PostgreSQL as statcan_pn_2021_tmp")
gdf.to_postgis(name=f"statcan_pn_2021_tmp", 
               con=engine,
               chunksize=150000,
               if_exists='replace')

print("Creating statcan_pn_2021")
sql = """
DROP TABLE IF EXISTS statcan_pn_2021;

CREATE TABLE statcan_pn_2021 AS
SELECT 
db.country_dguid,
db.country_en_name, 
db.country_fr_name,
db.country_en_abbreviation,
db.country_fr_abbreviation,
db.grc_dguid,
db.grc_en_name,
db.grc_fr_name,
db.pr_dguid,
db.pr_en_name,
db.pr_fr_name,
db.pr_en_abbreviation,
db.pr_fr_abbreviation,
db.pr_iso_code,
db.car_dguid,
db.car_en_name,
db.car_fr_name,
db.er_dguid,
db.er_name,
db.cd_dguid,
db.cd_name,
db.cd_type,
db.ccs_dguid,
db.ccs_name,
db.cma_dguid,
db.cma_p_dguid,
db.cma_name,
db.cma_type,
db.csd_dguid,
db.csd_name,
db.csd_type,
db.sac_type,
db.sac_code,
db.fed_dguid,
db.fed_name,
db.fed_en_name,
db.fed_fr_name,
db.ct_dguid,
db.ada_dguid,
db.da_dguid,
db.db_dguid,
placenames.pn_dguid,
placenames.pn_name,
placenames.pn_source,
placenames.geometry as geom
FROM statcan_pn_2021_tmp as placenames,
     statcan_db_2021 as db
WHERE ST_Intersects(placenames.geometry, db.geom);

CREATE INDEX statcan_pn_2021_geom_idx ON
statcan_pn_2021 
	USING GIST(geom) WITH (FILLFACTOR = 100);
"""

with engine.connect() as conn:
    conn.execute(text(sql))
    conn.commit()