In [2]:
import bs4
import pandas as pd
import requests
import re

from urllib.parse import urljoin

## PPP source CSV discovery

In [3]:
ppp_base_url = 'https://data.sba.gov'
ppp_data_url = urljoin(ppp_base_url, 'dataset/ppp-foia')

page_content = requests.get(ppp_data_url).content

soup = bs4.BeautifulSoup(page_content, 'html.parser')

data_links = soup.find_all('a', attrs={'title': re.compile(r'\.csv$')})

csv_list = [{'url': urljoin(ppp_base_url, '/'.join([link.get('href'), 'download', link.get('title')])), 'filename': link.get('title')} for link in data_links]

csv_list

[{'url': 'https://data.sba.gov/dataset/ppp-foia/resource/c1275a03-c25c-488a-bd95-403c4b2fa036/download/public_150k_plus_240930.csv',
  'filename': 'public_150k_plus_240930.csv'},
 {'url': 'https://data.sba.gov/dataset/ppp-foia/resource/cff06664-1f75-4969-ab3d-6fa7d6b4c41e/download/public_up_to_150k_1_240930.csv',
  'filename': 'public_up_to_150k_1_240930.csv'},
 {'url': 'https://data.sba.gov/dataset/ppp-foia/resource/1e6b6629-a5aa-46e6-a442-6e67366d2362/download/public_up_to_150k_2_240930.csv',
  'filename': 'public_up_to_150k_2_240930.csv'},
 {'url': 'https://data.sba.gov/dataset/ppp-foia/resource/644c304a-f5ad-4cfa-b128-fe2cbcb7b26e/download/public_up_to_150k_3_240930.csv',
  'filename': 'public_up_to_150k_3_240930.csv'},
 {'url': 'https://data.sba.gov/dataset/ppp-foia/resource/98af633d-eb1b-4d4b-995d-330962e6c38d/download/public_up_to_150k_4_240930.csv',
  'filename': 'public_up_to_150k_4_240930.csv'},
 {'url': 'https://data.sba.gov/dataset/ppp-foia/resource/3b407e04-f269-47a0-a5fe-

In [4]:
import pandas as pd

first_csv = csv_list[0].get('url')
first_file = pd.read_csv(first_csv, encoding_errors='replace')
first_file.columns

KeyboardInterrupt: 

In [9]:
from io import StringIO

census_file = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA.csv'
csv = requests.get(census_file, verify=False).text
pd.read_csv(StringIO(csv)).head()

  pd.read_csv(StringIO(csv)).head()


Unnamed: 0,SUMLEV,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
0,50,1,1,Alabama,Autauga County,1,0,54571,26569,28002,...,607,538,57,48,26,32,9,11,19,10
1,50,1,1,Alabama,Autauga County,1,1,3579,1866,1713,...,77,56,9,5,4,1,0,0,2,1
2,50,1,1,Alabama,Autauga County,1,2,3991,2001,1990,...,64,66,2,3,2,7,2,3,2,0
3,50,1,1,Alabama,Autauga County,1,3,4290,2171,2119,...,51,57,13,7,5,5,2,1,1,1
4,50,1,1,Alabama,Autauga County,1,4,4290,2213,2077,...,48,44,7,5,0,2,2,1,3,1


In [11]:
from pathlib import Path
Path(census_file).name

'CC-EST2020-ALLDATA.csv'

In [17]:
xwalk_path = 'https://www2.census.gov/geo/docs/reference/state.txt'
resp = requests.get(xwalk_path, verify=False)
pd.read_csv(StringIO(resp.text), sep='|').head()



Unnamed: 0,STATE,STUSAB,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778


## Investigate county name matches

In [None]:
import duckdb

db_path = '../data/ppp_loan_analysis.duckdb'

with duckdb.connect(db_path) as db:
    query = '''
        SELECT DISTINCT
            p.project_state,
            p.project_county_name
        FROM bronze.paycheck_protection_loans p
        LEFT JOIN bronze.census_2020_estimates c
        p.project_state = c.
    '''
    db.sql()

## Cleaning result

Cleans state and county mismatches with regex reconciliation. Can be built into dbt view for reconciliation. Or use in a bridge table for exact matching across sources. 


In [None]:
query = '''
SELECT
	DISTINCT
    s.state_name,
	p.project_state,
	p.project_county_name,
	c.ctyname
FROM
	bronze.paycheck_protection_loans p
LEFT JOIN bronze.state_crosswalk s 
ON
	p.project_state = s.stusab
LEFT JOIN bronze.census_2020_estimates c 
ON
	s.state_name = c.stname
		AND LOWER(
				REGEXP_REPLACE(
					REGEXP_REPLACE(p.project_county_name, '^(ST |SAINTE)', 'SAINT', 'i'),
				'[ -]|''', '', 'g')
			) =
		LOWER(
			REGEXP_REPLACE(
				REGEXP_REPLACE(
					REGEXP_REPLACE(c.ctyname, ' (County|Parish|City and Borough|Borough|Municipality|Census Area)', '', 'i'), 
				'(St\. |Ste\. |^St )', 'SAINT', 'i'),
			'[ -]|''', '', 'g')
		)
WHERE
	c.ctyname IS NULL
	AND p.project_county_name IS NOT NULL
	AND p.project_state NOT IN ('PR', 'MP', 'AE', 'VI', 'AS', 'GU')
ORDER BY
	p.project_state
'''