In [1]:
print('hello world')

hello world


In [2]:
import polars as pl
import datetime as dt

In [3]:
import os
os.listdir("/home/jovyan")

['.profile',
 '.bash_logout',
 '.bashrc',
 'data',
 '.npm',
 '.conda',
 '.cache',
 '.wget-hsts',
 '.config',
 '.jupyter',
 '.local',
 '.ipython',
 'work']

In [4]:
states_path = '/home/jovyan/work/raw/states/'

# Acquiring counties

In [5]:
class CountyAcquisition:
    def __init__(self, out_dir='raw/states/'):
        self.out_dir = out_dir
        os.makedirs(out_dir, exist_ok=True)

    def run(self, in_file='2021_Gaz_counties_national.txt'):
        """Parse all US counties from here:
        https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2021_Gazetteer/2021_Gaz_counties_national.zip

        Args:
            in_file (str, optional): Input wget file. Defaults to '2021_Gaz_counties_national.txt'.
        """
        df = pl.read_csv(in_file, separator='\t')
        for c in list(df['USPS'].unique()):
            df2 = df.filter(pl.col("USPS") == c)
            os.makedirs(f'{self.out_dir}/{c}', exist_ok=True)
            df2.write_csv(f'{self.out_dir}/{c}/counties.csv')

In [6]:
ca = CountyAcquisition('/home/jovyan/work/raw/states/')
ca.run("/home/jovyan/data/2024_Gaz_counties_national.txt")

# Acquiring State

In [7]:
class StateAcquisition:
    def __init__(self, out_dir='raw/states/'):
        self.out_dir = out_dir
        os.makedirs(out_dir, exist_ok=True)

    def run(self, in_file='zip_code_database.csv'):
        """Parses a list of all ZIPS per state

        Args:
            in_file (str, optional): [description]. Defaults to 'zip_code_database.csv'.
        """
        df = pl.read_csv(in_file)
        for c in list(df['state'].unique()):
            df2 = df.filter(pl.col("state") == c)
            os.makedirs(f'{self.out_dir}/{c}', exist_ok=True)
            df2.write_csv(f'{self.out_dir}/{c}/zips.csv')

In [8]:
sa = StateAcquisition('/home/jovyan/work/raw/states/')
sa.run("/home/jovyan/data/zip_code_database.csv")

# Fetcing ACS Data

In [9]:
import requests
import json
import os
import logging
import polars as pl
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # For progress bar


logger = logging.getLogger(__name__)

class GeographicsAcquisition:
    def __init__(self, states_path: str):
        self.states_path = states_path

    def _save(self, data: dict, out_path: str):
        if data:
            with open(out_path, 'w') as f:
                json.dump(data, f)

    def _get_geographics(self, by: str, on: str) -> dict:
        """Gets Geographics data from CensusReporter.org."""
        years = ['2018', '2019', '2020', '2021', '2022', '2023', '2017', '2016', '2015']
        summary = '860' if by == 'zip' else '050'

        with requests.Session() as s:
            for year in years:  # Iterate through years efficiently
                url = f'http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/{year}/{summary}00US{on}.json'
                try:
                    r = s.get(url, timeout=2)
                    r.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                    return json.loads(r.content.decode())
                except requests.exceptions.RequestException as e:
                    logger.warning(f"Error fetching data for {on} in {year}: {e}")  # Log the warning and continue

        return None  # Return None if no data found for all years



    def process_state(self, state_code: str):
        state_path = os.path.join(self.states_path, state_code)
        zips_path = os.path.join(state_path, 'zips.csv')
        counties_path = os.path.join(state_path, 'counties.csv')
        out_dir = state_path

        os.makedirs(os.path.join(out_dir, 'counties'), exist_ok=True)
        os.makedirs(os.path.join(out_dir, 'zips'), exist_ok=True)



        try:
            zips_df = pl.read_csv(zips_path)
            counties_df = pl.read_csv(counties_path)
        except FileNotFoundError as e:
            logger.error(f"Error reading files for state {state_code}: {e}")
            return



        with ThreadPoolExecutor() as executor:
            # Process ZIP codes
            zip_futures = [executor.submit(self._process_geographics, 'zip', row['zip'], out_dir) for row in zips_df.iter_rows(named=True)]
            for future in tqdm(zip_futures, desc=f"Processing ZIPs for {state_code}", unit="zip"):
                future.result()  # Retrieve results (mostly for exception handling)


            # Process counties
            county_futures = [executor.submit(self._process_geographics, 'county', row['GEOID'], out_dir, county_name=row['NAME'].replace(' County', '')) for row in counties_df.iter_rows(named=True)]
            for future in tqdm(county_futures, desc=f"Processing Counties for {state_code}", unit="county"):
                future.result()


    def _process_geographics(self, by: str, on: str, out_dir: str, county_name:str=None):
        _out_file = f'zips/{on}.json' if by=='zip' else f'counties/{county_name}.json'
        _out_path = os.path.join(out_dir, _out_file)
        if not os.path.exists(_out_path):
            data = self._get_geographics(by=by, on=on)
            self._save(data, _out_path)


    def run(self):
        state_codes = [f.name for f in os.scandir(self.states_path) if f.is_dir()]
        for state_code in state_codes:
            self.process_state(state_code)

In [None]:

# Example usage:
states_path = "/home/jovyan/work/raw/states/"
ga = GeographicsAcquisition(states_path)
ga.run()

Processing ZIPs for WY:   0%|          | 0/195 [00:00<?, ?zip/s]Error fetching data for 82008 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/2018/86000US82008.json
Error fetching data for 82071 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/2018/86000US82071.json
Error fetching data for 82003 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/2018/86000US82003.json
Error fetching data for 82902 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/2018/86000US82902.json
Error fetching data for 82931 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/2018/86000US82931.json
Error fetching data for 82717 in 2018: 404 Client Error: Not Found for url: http://embed.censusreporter.or