# Download Census Shapefiles

> These are Census address features (address ranges along street edges); download the Shapefile(s) from https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/


__Artifacts Generated__
<p/>

1. Volume - `<catalog>.census.census_data/address_block_shapefiles`
1. Table - `<catalog>.census.shape_address_block`

--- 
__Author:__ Michael Johns <mjohns@databricks.com>  | _Last Modified: 07 FEB 2025_

## Setup

In [0]:
from pyspark.databricks.sql import functions as dbf
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.sql import Window

import os



In [0]:
# Session-level Spark settings for this notebook, applied in one pass.
# AQE reference: https://spark.apache.org/docs/latest/sql-performance-tuning.html#adaptive-query-execution
spark_conf_overrides = {
  "spark.sql.shuffle.partitions": 10_000,                # <- default is 200
  "spark.databricks.optimizer.adaptive.enabled": False,  # <- default is true
}
for conf_key, conf_value in spark_conf_overrides.items():
  spark.conf.set(conf_key, conf_value)

__Configure Database + Username__

> Note: Adjust this to your own specified [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#managing-unity-catalog-metastores) Schema.

In [0]:
# Unity Catalog catalog + schema this notebook works in; adjust both to your own.
catalog_name, db_name = "mjohns", "census"

# Make them the session defaults for the SQL that follows.
sql(f"use catalog {catalog_name}")
sql(f"use schema {db_name}")

DataFrame[]

In [0]:
%sql show tables

database,tableName,isTemporary
census,ga_address_block,False


__Setup `ETL_DIR`__

> Note: Adjust this to your own specified [Volume](https://docs.databricks.com/en/ingestion/add-data/upload-to-volume.html#upload-files-to-a-unity-catalog-volume) (under a schema).

In [0]:
# Volume directory that receives the files downloaded by this notebook.
# Built from `catalog_name` / `db_name` (set in the configuration cell) so the
# path follows the configured catalog and schema instead of a hard-coded copy.
ETL_DIR = f'/Volumes/{catalog_name}/{db_name}/census_data/address_block_shapefiles'
os.environ['ETL_DIR'] = ETL_DIR  # exported so shell cells can reference $ETL_DIR

dbutils.fs.mkdirs(ETL_DIR)
print(f"...ETL_DIR: '{ETL_DIR}' (create)")

...ETL_DIR: '/Volumes/mjohns/census/census_data/address_block_shapefiles' (create)


In [0]:
ls $ETL_DIR/..

[0m[34;42maddress_block_shapefiles[0m/


## Get All GA Addresses (Shapefiles)
<p/>

* Look for the pattern https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/tl_rd22_13*.zip (13 is the state FIPS code for Georgia)

In [0]:
# State number used to filter the listing for `tl_rd22_<state_num>*` ZIP names ("13" selects the GA files).
state_num = "13"

__Make `address_features` directory.__

In [0]:
# Create the `address_features` sub-directory under ETL_DIR.
dbutils.fs.mkdirs(f"{ETL_DIR}/address_features")

Out[11]: True

### Get List of Shapefile ZIPs

In [0]:
%sh 
## Fetch the ADDRFEAT directory listing and save it as address_features.txt in the
## shell's working directory; `-O` overwrites any copy left by an earlier run.
wget -O address_features.txt "https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/"

--2023-10-23 14:35:14--  https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/
Resolving www2.census.gov (www2.census.gov)... 104.86.227.69, 2600:1409:12:2b9::208c, 2600:1409:12:286::208c
Connecting to www2.census.gov (www2.census.gov)|104.86.227.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘address_features.txt’

     0K .......... .......... .......... .......... .......... 4.30M
    50K .......... .......... .......... .......... .......... 4.01M
   100K .......... .......... .......... .......... .......... 15.2M
   150K .......... .......... .......... .......... .......... 23.0M
   200K .......... .......... .......... .......... .......... 10.9M
   250K .......... .......... .......... .......... .......... 18.9M
   300K .......... .......... .......... .......... .......... 25.1M
   350K .......... .......... .......... .......... .......... 20.1M
   400K .......... .......... .......... .......... ......

In [0]:
# Copy the listing from the driver's local filesystem (`file:` scheme) into ETL_DIR.
# NOTE(review): assumes the %sh cell wrote to /databricks/driver - confirm on your cluster.
dbutils.fs.cp("file:/databricks/driver/address_features.txt", ETL_DIR)

Out[22]: True

In [0]:
ls $ETL_DIR

[0m[34;42maddress_features[0m/  [01;32maddress_features.txt[0m*


__Figure out which rows are within the `<table>` tag and extract the filenames.__

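> Because the listing is a single small file that is read as one partition, `monotonically_increasing_id()` yields consistent, file-ordered `row_num` values here. This is not guaranteed in general (e.g., when a read is split across multiple partitions).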

In [0]:
def _first_row_num(tag):
  """Return the `row_num` of the first listing line whose trimmed text equals `tag`.

  `row_num` comes from `monotonically_increasing_id()` over the text file, the
  same numbering later cells use to slice the listing.

  Raises:
    ValueError: if no line of the listing matches `tag`.
  """
  listing_path = f"{ETL_DIR}/address_features.txt"
  match = (
    spark.read.text(listing_path)
    .withColumn("row_num", F.monotonically_increasing_id())
    .filter(F.trim("value") == tag)
    .select("row_num")
    .first()
  )
  if match is None:
    # Fail with a readable message instead of an IndexError on an empty result.
    raise ValueError(f"'{tag}' not found in '{listing_path}'")
  return match[0]

tbl_start_row = _first_row_num('<table>')
tbl_end_row = _first_row_num('</table>')

print(f"tbl_start_row: {tbl_start_row}, tbl_end_row: {tbl_end_row}")

tbl_start_row: 237, tbl_end_row: 3463


In [0]:
# For each line, keep the text after the last `href="` and before the first
# `">` that follows it, i.e. the linked file name.
href_expr = F.substring_index(F.substring_index("value", 'href="', -1), '">', 1)

# Only lines strictly between the <table> and </table> rows are considered,
# then narrowed to ZIP names for the selected state.
listing_df = (
  spark.read.text(f"{ETL_DIR}/address_features.txt")
  .withColumn("row_num", F.monotonically_increasing_id())
  .filter((F.col("row_num") > tbl_start_row) & (F.col("row_num") < tbl_end_row))
  .withColumn("href", href_expr)
  .filter(F.col("href").startswith(f"tl_rd22_{state_num}"))
  .select("href")
)
state_files = [row["href"] for row in listing_df.collect()]

print(f"len state files? {len(state_files):,}")
state_files[:5]

len state files? 159
Out[26]: ['tl_rd22_13001_addrfeat.zip',
 'tl_rd22_13003_addrfeat.zip',
 'tl_rd22_13005_addrfeat.zip',
 'tl_rd22_13007_addrfeat.zip',
 'tl_rd22_13009_addrfeat.zip']

### Download Shapefile ZIPs (159)

> This could be done in parallel, but it is kept on the driver for now so as not to overload the Census server with requests.

__Note: writing locally to driver, then copying to volume with `dbutils`.__

In [0]:
import pathlib
import requests

# Base URL of the Census ADDRFEAT layer; each ZIP name is appended to it.
ADDRFEAT_URL = "https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT"
# Seconds to wait on the server before giving up, so a stalled request cannot hang the cell.
REQUEST_TIMEOUT_SECS = 60

vol_path = pathlib.Path(f"{ETL_DIR}/address_features")
local_path = pathlib.Path("address_features")
local_path.mkdir(parents=True, exist_ok=True)
local_path_str = local_path.resolve().as_posix()

# Download sequentially on the driver; ZIPs already present in the volume are skipped.
for idx, zip_name in enumerate(state_files):
  idx_str = str(idx).rjust(4)

  if (vol_path / zip_name).exists():
    print(f"{idx_str} --> '{zip_name}' exists...skipping")
    continue

  print(f"{idx_str} --> '{zip_name}'")
  resp = requests.get(f"{ADDRFEAT_URL}/{zip_name}", timeout=REQUEST_TIMEOUT_SECS)
  # Stop on HTTP errors rather than saving an error page under a .zip name.
  resp.raise_for_status()
  # Write to the driver's local disk first; a later cell copies the directory to the volume.
  with open(local_path / zip_name, 'wb') as zip_out:
    zip_out.write(resp.content)

   0 --> 'tl_rd22_13001_addrfeat.zip' exists...skipping
   1 --> 'tl_rd22_13003_addrfeat.zip' exists...skipping
   2 --> 'tl_rd22_13005_addrfeat.zip' exists...skipping
   3 --> 'tl_rd22_13007_addrfeat.zip' exists...skipping
   4 --> 'tl_rd22_13009_addrfeat.zip' exists...skipping
   5 --> 'tl_rd22_13011_addrfeat.zip' exists...skipping
   6 --> 'tl_rd22_13013_addrfeat.zip' exists...skipping
   7 --> 'tl_rd22_13015_addrfeat.zip' exists...skipping
   8 --> 'tl_rd22_13017_addrfeat.zip' exists...skipping
   9 --> 'tl_rd22_13019_addrfeat.zip' exists...skipping
  10 --> 'tl_rd22_13021_addrfeat.zip' exists...skipping
  11 --> 'tl_rd22_13023_addrfeat.zip' exists...skipping
  12 --> 'tl_rd22_13025_addrfeat.zip' exists...skipping
  13 --> 'tl_rd22_13027_addrfeat.zip' exists...skipping
  14 --> 'tl_rd22_13029_addrfeat.zip' exists...skipping
  15 --> 'tl_rd22_13031_addrfeat.zip' exists...skipping
  16 --> 'tl_rd22_13033_addrfeat.zip' exists...skipping
  17 --> 'tl_rd22_13035_addrfeat.zip' exists...s

In [0]:
# Recursively copy the locally downloaded ZIPs (driver disk, `file:` scheme) into the volume's `address_features` directory.
dbutils.fs.cp(f"file:{local_path_str}", f"{ETL_DIR}/address_features", recurse=True)

Out[39]: True

In [0]:
ls -lh $ETL_DIR/address_features

total 366M
-rwxrwxrwx 1 nobody nogroup  1.8M Oct 23 14:53 [0m[01;32mtl_rd22_13001_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  888K Oct 23 14:53 [01;32mtl_rd22_13003_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  814K Oct 23 14:53 [01;32mtl_rd22_13005_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  447K Oct 23 14:53 [01;32mtl_rd22_13007_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  1.8M Oct 23 14:53 [01;32mtl_rd22_13009_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  1.4M Oct 23 14:53 [01;32mtl_rd22_13011_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  2.2M Oct 23 14:53 [01;32mtl_rd22_13013_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  4.5M Oct 23 14:53 [01;32mtl_rd22_13015_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  1.1M Oct 23 14:53 [01;32mtl_rd22_13017_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  1.6M Oct 23 14:53 [01;32mtl_rd22_13019_addrfeat.zip[0m*
-rwxrwxrwx 1 nobody nogroup  4.8M Oct 23 14:53 [01;32mtl_rd22_13021_addrfeat.zip[0m*
-rwxrwxrwx 1 nob