In [10]:
import gc
import glob
import os

import duckdb
from IPython.core.interactiveshell import InteractiveShell  
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Enable multiple outputs per cell
# (every top-level expression in a cell is displayed, not only the last one)
InteractiveShell.ast_node_interactivity = "all"
# Show all columns
pd.set_option('display.max_columns', None)

# Directory that the exported parquet files are written to
data_dir = '/data/census_of_population/output/2021/tabular'

# PostgreSQL DB
# Connection settings are read from the environment. os.environ.get returns None for an
# unset variable, which would be interpolated into the URL below as the text "None".
# NOTE(review): the password is interpolated as-is; confirm it never contains characters
# that need URL-escaping (for example '@' or '/').
DATABASE = os.environ.get("POSTGRES_DB")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

engine = create_engine(f"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}")

# DuckDB
# Connection used for the dguid lookup tables; the spatial extension is installed and loaded
con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")

# Datasets
- 1.0 Canada, provinces, territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)
- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)
- 3.0 Economic regions (ERs)
- 4.0 Population centres (POPCTRs)
- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**
- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**
- 7.0 Designated places (DPLs)
- 8.0 Aggregate dissemination areas (ADAs)
- 9.0 Forward sortation areas (FSAs)
- 10.0 Health regions (HRs)
   - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`
   - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm
- 11.0 Dissemination Blocks (DBs)

In [2]:
def process_cop_csv(csvs_to_process):
    """
    Read Census of Population CSV files and return them as one wide dataframe.

    For every file:
    1. Reads a subset of fields (DGUID, CHARACTERISTIC_ID and the total / men+ / women+ counts)
    2. Pivots on CHARACTERISTIC_ID, giving one row per dguid
    3. Flattens the column MultiIndex into names such as `count_total_1`

    The processed CSVs are then appended as one dataframe indexed by `dguid`.
    A dguid that occurs in several files occurs several times in the result.

    Parameters:
        csvs_to_process: iterable of paths to latin-1 encoded CSV files

    Returns:
        pandas.DataFrame indexed by `dguid`

    Raises:
        ValueError: if `csvs_to_process` yields no files (typically a glob pattern
            that matched nothing), or from DataFrame.pivot when a file repeats a
            (dguid, CHARACTERISTIC_ID) pair.
    """
    dataframes_to_concatenate = []
    for filename in csvs_to_process:
        print(f"Processing {filename}")
        cop_df = pd.read_csv(
            filename,
            encoding='latin-1',
            usecols=['DGUID',
                     'CHARACTERISTIC_ID',
                     'C1_COUNT_TOTAL',
                     'C2_COUNT_MEN+',
                     'C3_COUNT_WOMEN+'
                     ],
            dtype={'CHARACTERISTIC_ID': np.int16},
        )
        cop_df = cop_df.rename(columns={
            'C1_COUNT_TOTAL': 'count_total',
            'C2_COUNT_MEN+': 'count_men',
            'C3_COUNT_WOMEN+': 'count_women',
            'DGUID': 'dguid'
        })

        cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')

        # Flatten the hierarchical index: (count_total, 1) -> count_total_1
        # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176
        cop_df.columns = [
            f"{measure}_{characteristic_id}"
            for measure, characteristic_id in cop_df.columns
        ]
        dataframes_to_concatenate.append(cop_df)

    # Checked after the loop so that any iterable (not only a list) is supported.
    # Without this, pd.concat fails with the far less helpful "No objects to concatenate".
    if not dataframes_to_concatenate:
        raise ValueError(
            "No CSV files to process: csvs_to_process is empty. "
            "Check that the glob pattern / extraction directory is correct."
        )

    print("Concatenating all dataframes into one")
    cop_df = pd.concat(dataframes_to_concatenate)

    return cop_df

In [3]:
def drop_na_columns(dataframe):
    """
    Delete, in place, the columns where there are no values.
    There are cases where there are values for the count_total
    columns, but no values for the count_men and count_women columns

    Parameters:
        dataframe: pandas.DataFrame to prune; it is modified in place

    Returns:
        None. This is deliberate: the notebook displays every top-level expression,
        so returning the dataframe would render it after each call.
    """
    # A column is empty when every value is missing. Checking isna() directly needs a
    # single pass per column and, unlike min()/max(), does not compare values, so it also
    # works for object columns holding values that cannot be ordered against each other.
    # Columns are visited one at a time to avoid building a frame-sized boolean mask.
    columns_to_drop = [
        field for field, values in dataframe.items() if values.isna().all()
    ]

    if columns_to_drop:
        print("Dropping columns that don't have values")
        dataframe.drop(columns=columns_to_drop, inplace=True)

In [21]:
def convert_to_lowest_type(df):
    """
    Return a new dataframe whose columns use the most compact suitable dtypes.

    The columns are first converted with `convert_dtypes` (string and boolean
    conversion switched off). Every column that ends up as Int64 is then downcast
    to the smallest integer dtype that holds its values; for example a column with
    a maximum value of 32,000 becomes int16.
    """
    converted = df.convert_dtypes(convert_string=False, convert_boolean=False)

    # Only Int64 columns are downcast. Float64 is left alone on purpose: downcasting
    # caused issues with decimal places (65.4 turned into 65.4000015258789).
    int64_columns = [
        name for name, dtype in converted.dtypes.items() if str(dtype) == 'Int64'
    ]
    for name in int64_columns:
        converted[name] = pd.to_numeric(converted[name], downcast='integer')

    return converted

# Start processing
## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)

In [5]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order. Sort them so the regional
# CSVs are always processed and concatenated in the same order; the later de-duplication keeps
# the "last" record per dguid, which depends on that order.
csvs_to_process = sorted(glob.glob("/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*"))
cop_df = process_cop_csv(csvs_to_process)

Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv
Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv
Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv
Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv
Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv
Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv
Concatenating all dataframes into one


# Remove duplicates
- For example, Canada (dguid 2021A000011124) is included 6 times (once per CSV), so we need to keep a single record per dguid

In [None]:
# Show the record count and the Canada rows before de-duplicating.
# The bare expressions below are displayed because every top-level expression in a cell is shown.
print(f"Number of records before {len(cop_df)}")
print("Before:")
cop_df[cop_df.index == '2021A000011124']

# Get unique records
# Group on the dguid index and keep one row per dguid via GroupBy.last()
# NOTE(review): last() is taken per column, so the kept row may combine values from
# several duplicate rows if they differ in where they are missing - confirm the duplicates are identical
cop_df = cop_df.groupby(cop_df.index).last()
print(f"Number of records after {len(cop_df)}")
cop_df[cop_df.index == '2021A000011124']

Get unique records

# Split the Census of Population dataframe by geographic level

In [None]:
# Get the dguid per level of geography
# One DuckDB lookup table per level (country, PR, CD, CSD, DA) is created from the published
# boundary parquet files. Each table is dropped first so CREATE TABLE does not fail if it already exists.
con.sql("""
DROP TABLE IF EXISTS country_2021;
CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';

DROP TABLE IF EXISTS pr_2021;
CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';

DROP TABLE IF EXISTS cd_2021;
CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';

DROP TABLE IF EXISTS csd_2021;
CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';

DROP TABLE IF EXISTS da_2021;
CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';
""")
# con.commit() is a top-level expression, so its return value (the connection) is echoed in the cell output
con.commit()

# Convert the duckdb tables to pandas dataframe
country_dguid = con.sql("SELECT * FROM country_2021").to_df()
pr_dguid = con.sql("SELECT * FROM pr_2021").to_df()
cd_dguid = con.sql("SELECT * FROM cd_2021").to_df()
csd_dguid = con.sql("SELECT * FROM csd_2021").to_df()
da_dguid = con.sql("SELECT * FROM da_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the census rows whose dguid appears in that level's lookup
cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')
cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')
cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')
cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')
cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')

# Release the combined dataframe and the lookups; only the per-level dataframes are needed from here on
del(cop_df)
del(country_dguid)
del(pr_dguid)
del(cd_dguid)
del(csd_dguid)
del(da_dguid)
gc.collect()

<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

0

# Convert dataframe columns to lowest dtype

In [9]:
# Each dataframe is replaced by its converted version one at a time, so the
# previous version of a level can be released before the next level is converted
cop_country = convert_to_lowest_type(cop_country)
cop_pr = convert_to_lowest_type(cop_pr)
cop_cd = convert_to_lowest_type(cop_cd)
cop_csd = convert_to_lowest_type(cop_csd)
cop_da = convert_to_lowest_type(cop_da)

# Delete columns where there are no values

## Number of columns before

In [11]:
# Column count of every geographic level before the empty columns are dropped
print(f"Country- length: {len(cop_country.columns)}")
print(f"PR- length: {len(cop_pr.columns)}")
print(f"CD- length: {len(cop_cd.columns)}")
print(f"CSD- length: {len(cop_csd.columns)}")
print(f"DA- length: {len(cop_da.columns)}")

Country- length: 7893
PR- length: 7893
CD- length: 7893
CSD- length: 7893
DA- length: 7893


In [12]:
# drop_na_columns modifies each dataframe in place, so nothing is reassigned here
drop_na_columns(cop_country)
drop_na_columns(cop_pr)
drop_na_columns(cop_cd)
drop_na_columns(cop_csd)
drop_na_columns(cop_da)

Dropping columns that don't have values
Dropping columns that don't have values
Dropping columns that don't have values
Dropping columns that don't have values
Dropping columns that don't have values


## Number of columns after

In [13]:
# Column count of every geographic level after the empty columns were dropped
print(f"Country- length: {len(cop_country.columns)}")
print(f"PR- length: {len(cop_pr.columns)}")
print(f"CD- length: {len(cop_cd.columns)}")
print(f"CSD- length: {len(cop_csd.columns)}")
print(f"DA- length: {len(cop_da.columns)}")

Country- length: 7433
PR- length: 7433
CD- length: 7433
CSD- length: 7433
DA- length: 7431


In [27]:
# Write one zstd-compressed parquet file per geographic level into data_dir.
# For every level, reset_index() turns the dguid index back into a column, which is then
# renamed to the level-specific key (for example pr_dguid) before the export.

# Country
cop_country = cop_country.reset_index()
cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)
cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')

# Provinces and Territories
cop_pr = cop_pr.reset_index()
cop_pr.rename(columns={'dguid': 'pr_dguid'}, inplace=True)
cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')

# Census Divisions
cop_cd = cop_cd.reset_index()
cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)
cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')

# Census Subdivisions
cop_csd = cop_csd.reset_index()
cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)
cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')

# Dissemination Areas
cop_da = cop_da.reset_index()
cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)
cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')

In [28]:
# Release the per-level dataframes now that they have been exported
del cop_country, cop_pr, cop_cd, cop_csd, cop_da
gc.collect()

0

## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)

# TODO: Finish processing CMA

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup tables of the previous section are dropped first, then the CMA and CT
# lookups are created from the published boundary parquet files
con.sql("""
DROP TABLE IF EXISTS country_2021;
DROP TABLE IF EXISTS pr_2021;
DROP TABLE IF EXISTS cd_2021;
DROP TABLE IF EXISTS csd_2021;
DROP TABLE IF EXISTS da_2021;

DROP TABLE IF EXISTS cma_2021;
CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';

DROP TABLE IF EXISTS ct_2021;
CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
cma_dguid = con.sql("SELECT * FROM cma_2021").to_df()
ct_dguid = con.sql("SELECT * FROM ct_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# There's going to be missing links
# TODO: cop_cma is built but not yet converted or exported (CMA processing is unfinished)
cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')
cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')

# Both lookups are no longer needed once the joins are done; previously cma_dguid was
# never released, unlike the lookup frames of every other section
del(cma_dguid)
del(ct_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_ct = convert_to_lowest_type(cop_ct)

# Drop NA columns
print(f"CT - Number of Columns BEFORE: {len(cop_ct.columns)}")
drop_na_columns(cop_ct)
print(f"CT - Number of Columns AFTER: {len(cop_ct.columns)}")

# Export
# Census Tracts
# reset_index() turns the dguid index back into a column before it is renamed to ct_dguid
cop_ct = cop_ct.reset_index()
cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)
cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')

del(cop_ct)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

244615

## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)
This file also includes Provinces and Territories and Country

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup tables of the previous section are dropped first, then the ER lookup is
# created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS cma_2021;
DROP TABLE IF EXISTS ct_2021;

DROP TABLE IF EXISTS er_2021;
CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
er_dguid = con.sql("SELECT * FROM er_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the economic region rows of the CSV
cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')

del(er_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_er = convert_to_lowest_type(cop_er)

# Drop NA columns
print(f"ER - Number of Columns BEFORE: {len(cop_er.columns)}")
drop_na_columns(cop_er)
# The label used to read "CT" (copied from the census tract section); these are ER columns
print(f"ER - Number of Columns AFTER: {len(cop_er.columns)}")

# Export
# Economic Regions
# reset_index() turns the dguid index back into a column before it is renamed to er_dguid
cop_er = cop_er.reset_index()
cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)
cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')

del(cop_er)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

## 4.0 Process Population centres (POPCTRs)
### There are 1026 DGUIDs in the Census of Population data, but there should be 1030
The Census of Population data uses `pop_ctr_dguid` rather than `pop_ctr_p_dguid`, so there is no way to differentiate between Ottawa and Gatineau, which share pop_ctr_dguid 2021S05100616.

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# SELECT DISTINCT: a population centre that spans provinces (for example Ottawa / Gatineau)
# shares one pop_ctr_dguid across its parts, so the boundary file can list the same
# pop_ctr_dguid more than once. A repeated key in the lookup would make the inner join
# below emit the matching census row once per repetition.
# NOTE(review): assumes the boundary file holds one row per part (pop_ctr_p_dguid) - confirm
con.sql("""
DROP TABLE IF EXISTS er_2021;

DROP TABLE IF EXISTS pop_ctr_2021;
CREATE TABLE pop_ctr_2021 AS SELECT DISTINCT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
pop_ctr_dguid = con.sql("SELECT * FROM pop_ctr_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the population centre rows of the CSV
cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')

del(pop_ctr_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)

# Drop NA columns
print(f"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}")
drop_na_columns(cop_pop_ctr)
print(f"POP CTR- Number of Columns AFTER: {len(cop_pop_ctr.columns)}")

# Export
# Population Centres
# reset_index() turns the dguid index back into a column before it is renamed to pop_ctr_dguid
cop_pop_ctr = cop_pop_ctr.reset_index()
cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)
cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')

del(cop_pop_ctr)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup table of the previous section is dropped first, then the FED (2013 Representation
# Order) lookup is created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS pop_ctr_2021;

DROP TABLE IF EXISTS fed_2013;
CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021_2013.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
fed_dguid = con.sql("SELECT * FROM fed_2013").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the FED rows of the CSV
cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')

# Release the lookup and the unfiltered dataframe before the dtype conversion
del(fed_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_fed = convert_to_lowest_type(cop_fed)

# Drop NA columns
print(f"FED - Number of Columns BEFORE: {len(cop_fed.columns)}")
drop_na_columns(cop_fed)
print(f"FED - Number of Columns AFTER: {len(cop_fed.columns)}")

# Export
# reset_index() turns the dguid index back into a column before it is renamed to fed_dguid
cop_fed = cop_fed.reset_index()
cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)
cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')

del(cop_fed)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

0

FED - Number of Columns BEFORE: 7893
Dropping columns that don't have values
FED - Number of Columns AFTER: 7433


0

## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)
There should be 343 2023 FEDs

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# Only the lookup table of the previous section is dropped: the 2023 FED boundary lookup is
# not used yet, and its statements are kept (commented out) as a record of the intended approach
con.sql("""
DROP TABLE IF EXISTS fed_2013;

/*
DROP TABLE IF EXISTS fed_2023;
CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';
*/
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
#fed_dguid = con.sql("SELECT * FROM fed_2023").to_df()

# Join the Census of Population dataframe to each geographic level
#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')
# Without a boundary lookup, the FED rows are selected by dguid instead. reset_index() makes
# dguid a regular column so it can be filtered.
cop_df = cop_df.reset_index()
# A DGUID starts with its four-digit vintage, so the 2023 Representation Order records are
# the ones that START with "2023". Matching "2023" anywhere in the string could also select
# a record of another vintage whose geographic code happens to contain those digits.
cop_df = cop_df[cop_df['dguid'].str.startswith("2023")]

# Convert columns to lowest dtypes
cop_fed = convert_to_lowest_type(cop_df)

del(cop_df)
gc.collect()

# Drop NA columns
print(f"FED - Number of Columns BEFORE: {len(cop_fed.columns)}")
drop_na_columns(cop_fed)
print(f"FED - Number of Columns AFTER: {len(cop_fed.columns)}")

# Export
# dguid is already a column here (see reset_index above), so it only needs to be renamed
cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)
cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')

del(cop_fed)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

31

FED - Number of Columns BEFORE: 7894
Dropping columns that don't have values
FED - Number of Columns AFTER: 7427


0

## 7.0 Process Designated places (DPLs)
There should be 1685 DPLs

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The DPL lookup is created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS dpl_2021;
CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
dpl_dguid = con.sql("SELECT * FROM dpl_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the designated place rows of the CSV
cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')

# Release the lookup and the unfiltered dataframe before the dtype conversion
del(dpl_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_dpl = convert_to_lowest_type(cop_dpl)

# Drop NA columns
print(f"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}")
drop_na_columns(cop_dpl)
print(f"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}")

# Export
# reset_index() turns the dguid index back into a column before it is renamed to dpl_dguid
cop_dpl = cop_dpl.reset_index()
cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)
cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')

del(cop_dpl)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

0

DPL - Number of Columns BEFORE: 7893
Dropping columns that don't have values
DPL - Number of Columns AFTER: 7433


## 8.0 Process Aggregate dissemination areas (ADAs)
There should be 5433 ADAs

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup table of the previous section is dropped first, then the ADA lookup is
# created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS dpl_2021;
DROP TABLE IF EXISTS ada_2021;
CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
ada_dguid = con.sql("SELECT * FROM ada_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the aggregate dissemination area rows of the CSV
cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')

# Release the lookup and the unfiltered dataframe before the dtype conversion
del(ada_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_ada = convert_to_lowest_type(cop_ada)

# Drop NA columns
print(f"ADA - Number of Columns BEFORE: {len(cop_ada.columns)}")
drop_na_columns(cop_ada)
print(f"ADA - Number of Columns AFTER: {len(cop_ada.columns)}")

# Export
# reset_index() turns the dguid index back into a column before it is renamed to ada_dguid
cop_ada = cop_ada.reset_index()
cop_ada.rename(columns={'dguid': 'ada_dguid'}, inplace=True)
cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')

del(cop_ada)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

## 9.0 Process Forward sortation areas (FSAs)
There should be 1643 FSAs

In [None]:
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup table of the previous section is dropped first, then the FSA lookup is
# created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS ada_2021;
DROP TABLE IF EXISTS fsa_2021;
CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
fsa_dguid = con.sql("SELECT * FROM fsa_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# The inner join keeps only the forward sortation area rows of the CSV
cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')

# Release the lookup and the unfiltered dataframe before the dtype conversion
del(fsa_dguid)
del(cop_df)
gc.collect()

# Convert columns to lowest dtypes
cop_fsa = convert_to_lowest_type(cop_fsa)

# Drop NA columns
print(f"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}")
drop_na_columns(cop_fsa)
print(f"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}")

# Export
# reset_index() turns the dguid index back into a column before it is renamed to fsa_dguid
cop_fsa = cop_fsa.reset_index()
cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)
cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')

del(cop_fsa)
gc.collect()

Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv
Concatenating all dataframes into one


<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>

684

FSA - Number of Columns BEFORE: 7893
Dropping columns that don't have values
FSA - Number of Columns AFTER: 7429


0

## 10.0 Process Health regions (HRs) and Local health integration networks
Start looking here https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X

In [None]:
# NOTE: this cell is unfinished. It loads the census CSV and the health region lookup,
# but nothing is joined, converted or exported here.
csvs_to_process = glob.glob("/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*")
cop_df = process_cop_csv(csvs_to_process)

# Get the dguid per level of geography
# The lookup table of the previous section is dropped first, then the HR lookup is
# created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS fsa_2021;
DROP TABLE IF EXISTS hr_2022;
CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
hr_dguid = con.sql("SELECT * FROM hr_2022").to_df()

## 11.0 Dissemination Blocks (DBs)

In [25]:
# Dissemination block counts are read from the PostgreSQL table silver.gaf_2021 rather than
# from a Census of Population CSV. The three columns are aliased to count_total_1,
# count_total_4 and count_total_5.
# NOTE(review): presumably these aliases match characteristic ids 1, 4 and 5 of the other
# exports - confirm
sql = """
SELECT db_dguid, 
db_pop_2021 AS count_total_1, 
db_total_private_dwell_2021 AS count_total_4,
db_usual_residents_dwellings_2021 AS count_total_5
FROM silver.gaf_2021;
"""

# db_dguid becomes the index of the dataframe
cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')

# Get the dguid per level of geography
# The DB lookup is created from the published boundary parquet file
con.sql("""
DROP TABLE IF EXISTS db_2021;
CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';
""")
con.commit()

# Convert the duckdb tables to pandas dataframe
db_dguid = con.sql("SELECT * FROM db_2021").to_df()

# Join the Census of Population dataframe to each geographic level
# on='db_dguid' refers to the index of cop_df (set through index_col above); the inner join
# keeps only the rows whose db_dguid appears in the boundary lookup
cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')

# Convert columns to lowest dtypes
cop_db = convert_to_lowest_type(cop_db)

# Export
# reset_index() turns the db_dguid index back into a column; no rename is needed here
cop_db = cop_db.reset_index()
cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')

<duckdb.duckdb.DuckDBPyConnection at 0x7f6df0455b30>