In [1]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print('hello world')

hello world


In [2]:
import polars as pl
import datetime as dt

In [3]:
import os
# Show the entries of the notebook user's home directory (the cell output lists
# both the 'data' and 'work' folders that later cells read from and write to).
os.listdir("/home/jovyan")

['.profile',
 '.bash_logout',
 '.bashrc',
 'data',
 '.npm',
 '.conda',
 '.cache',
 '.wget-hsts',
 '.config',
 '.jupyter',
 '.local',
 '.ipython',
 'work']

In [12]:
# Root of the per-state output tree: later cells read '<states_path>CA/zips.csv' and
# '<states_path>CA/counties.csv' from it and pass it to GeographicsAcquisition as out_dir.
states_path = '/home/jovyan/work/raw/states/'

# Acquiring counties

In [5]:
class CountyAcquisition:
    """Splits a national Census Gazetteer counties file into one CSV per USPS code."""

    def __init__(self, out_dir='raw/states/'):
        # Make sure the output root exists before any per-state folder is added.
        os.makedirs(out_dir, exist_ok=True)
        self.out_dir = out_dir

    def run(self, in_file='2021_Gaz_counties_national.txt'):
        """Write ``<out_dir>/<USPS>/counties.csv`` for every distinct USPS value.

        The expected input is the tab-separated national counties file from:
        https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2021_Gazetteer/2021_Gaz_counties_national.zip

        Args:
            in_file (str, optional): Path to the unzipped Gazetteer text file.
                Defaults to '2021_Gaz_counties_national.txt'.
        """
        counties = pl.read_csv(in_file, separator='\t')
        for usps in counties['USPS'].unique():
            state_dir = f'{self.out_dir}/{usps}'
            os.makedirs(state_dir, exist_ok=True)
            # Keep only the rows belonging to this USPS code.
            state_counties = counties.filter(pl.col("USPS") == usps)
            state_counties.write_csv(f'{state_dir}/counties.csv')

In [6]:
# Split the 2024 national Gazetteer counties file into per-state CSVs.
# The output root comes from `states_path` so every cell writes to and reads from
# the same directory instead of repeating the literal path.
ca = CountyAcquisition(states_path)
ca.run("/home/jovyan/data/2024_Gaz_counties_national.txt")

# Acquiring State

In [9]:
class StateAcquisition:
    """Splits a ZIP-code database CSV into one ``zips.csv`` per state value."""

    def __init__(self, out_dir='raw/states/'):
        # Make sure the output root exists before any per-state folder is added.
        os.makedirs(out_dir, exist_ok=True)
        self.out_dir = out_dir

    def run(self, in_file='zip_code_database.csv'):
        """Write ``<out_dir>/<state>/zips.csv`` for every distinct ``state`` value.

        Args:
            in_file (str, optional): Path to a CSV that has a ``state`` column.
                Defaults to 'zip_code_database.csv'.
        """
        zip_table = pl.read_csv(in_file)
        for state in zip_table['state'].unique():
            state_dir = f'{self.out_dir}/{state}'
            os.makedirs(state_dir, exist_ok=True)
            # Keep only the rows belonging to this state.
            state_zips = zip_table.filter(pl.col("state") == state)
            state_zips.write_csv(f'{state_dir}/zips.csv')

In [10]:
# Split the ZIP-code database into per-state CSVs.
# The output root comes from `states_path` so every cell writes to and reads from
# the same directory instead of repeating the literal path.
sa = StateAcquisition(states_path)
sa.run("/home/jovyan/data/zip_code_database.csv")

# Fetching ACS Data

In [33]:
import requests
import json
import os 
import logging
logger = logging.getLogger(__name__)


class GeographicsAcquisition:
    """Downloads CensusReporter profile JSON for ZIP codes and counties.

    Results are written to ``<out_dir>/zips/<zip>.json`` and
    ``<out_dir>/counties/<county name>.json``. A file that already exists is
    never downloaded again.
    """

    # Profile vintages, tried in this order until one answers with HTTP 200.
    _YEARS = ('2018', '2019', '2020', '2021', '2022', '2023', '2017', '2016', '2015')
    _PROFILE_URL = (
        'http://embed.censusreporter.org.s3.amazonaws.com/1.0/data/profiles/'
        '{year}/{summary}00US{code}.json'
    )

    def __init__(self, out_dir: str):
        """Remember the output root and create its ``counties/`` and ``zips/`` folders.

        Args:
            out_dir (str): Directory under which the JSON files are stored.
        """
        self.out_dir = out_dir
        os.makedirs(os.path.join(out_dir, 'counties/'), exist_ok=True)
        os.makedirs(os.path.join(out_dir, 'zips/'), exist_ok=True)

    @staticmethod
    def _pad_code(code) -> str:
        """Return ``code`` as a 5-character, zero-padded string.

        CSV readers that infer dtypes turn codes such as ``00501`` or ``06001``
        into the integers 501 / 6001; the profile URLs need the full 5 digits.
        """
        return str(code).strip().zfill(5)

    def _save(self, data: dict, out_path: str):
        """Write ``data`` to ``out_path`` as JSON; do nothing when ``data`` is empty or None."""
        if not data:
            return
        with open(out_path, 'w') as f:
            json.dump(data, f)

    def _get_geographics(self, by: str, on, i=0) -> 'dict | None':
        """Gets Geographics data from CensusReporter.org

        Args:
            by (str): 'zip' selects summary level 860; any other value
                (e.g. 'county') selects summary level 050.
            on (str | int): Zipcode or county ID; zero-padded to 5 digits.
            i (int, optional): Index of the first vintage year to try. Defaults to 0.

        Returns:
            dict | None: JSON data for the chosen geographics, or None when no
            vintage year answered with HTTP 200.

        Raises:
            requests.RequestException: On connection problems or when a request
                exceeds the 2 second timeout.
        """
        summary = '860' if by == 'zip' else '050'
        code = self._pad_code(on)
        # One session for all attempts so the connection can be reused.
        with requests.Session() as s:
            for year in self._YEARS[i:]:
                url = self._PROFILE_URL.format(year=year, summary=summary, code=code)
                r = s.get(url, timeout=2)
                if r.status_code == 200:
                    return json.loads(r.content.decode())
        return None

    def _fetch_if_missing(self, by: str, on: str, out_file: str):
        """Download and store one profile unless ``<out_dir>/<out_file>`` already exists."""
        out_path = os.path.join(self.out_dir, out_file)
        # Avoid updates for now: existing files are left untouched.
        if not os.path.exists(out_path):
            self._save(self._get_geographics(by=by, on=on), out_path=out_path)

    def run(self, zips: 'pl.DataFrame', counties: 'pl.DataFrame'):
        """Fetch profiles for every row of ``zips`` and ``counties``.

        A failure on one row is logged and does not stop the remaining rows.

        Args:
            zips (pl.DataFrame): Frame with a ``zip`` column.
            counties (pl.DataFrame): Frame with ``GEOID`` and ``NAME`` columns.
        """
        for row in zips.iter_rows(named=True):
            try:
                if row['zip'] is None:
                    continue
                zip_code = self._pad_code(row['zip'])
                self._fetch_if_missing('zip', zip_code, f'zips/{zip_code}.json')
            except Exception as e:
                logger.exception(e)
        for row in counties.iter_rows(named=True):
            try:
                if row['GEOID'] is None:
                    continue
                # GEOID -> NAME: the request is keyed by GEOID, the file by county name.
                geoid = self._pad_code(row['GEOID'])
                county = row['NAME'].replace(' County', '')
                # NOTE(review): files are keyed by county name only, so two counties
                # with the same name would share one file if they use the same out_dir.
                self._fetch_if_missing('county', geoid, f'counties/{county}.json')
            except Exception as e:
                logger.exception(e)

In [34]:
# Load the California extracts and keep only the first five rows of each as a
# small sample for the download step; the two samples are then shown.
sample_zips = pl.read_csv(os.path.join(states_path, 'CA/zips.csv')).head(n=5)
sample_counties = pl.read_csv(os.path.join(states_path, 'CA/counties.csv')).head(n=5)

print(sample_zips)
print(sample_counties)

shape: (5, 15)
┌───────┬──────────┬──────────────┬─────────────┬───┬─────────┬──────────┬───────────┬─────────────┐
│ zip   ┆ type     ┆ decommission ┆ primary_cit ┆ … ┆ country ┆ latitude ┆ longitude ┆ irs_estimat │
│ ---   ┆ ---      ┆ ed           ┆ y           ┆   ┆ ---     ┆ ---      ┆ ---       ┆ ed_populati │
│ i64   ┆ str      ┆ ---          ┆ ---         ┆   ┆ str     ┆ f64      ┆ f64       ┆ on          │
│       ┆          ┆ i64          ┆ str         ┆   ┆         ┆          ┆           ┆ ---         │
│       ┆          ┆              ┆             ┆   ┆         ┆          ┆           ┆ i64         │
╞═══════╪══════════╪══════════════╪═════════════╪═══╪═════════╪══════════╪═══════════╪═════════════╡
│ 90001 ┆ STANDARD ┆ 0            ┆ Los Angeles ┆ … ┆ US      ┆ 33.97    ┆ -118.24   ┆ 50130       │
│ 90002 ┆ STANDARD ┆ 0            ┆ Los Angeles ┆ … ┆ US      ┆ 33.94    ┆ -118.24   ┆ 45910       │
│ 90003 ┆ STANDARD ┆ 0            ┆ Los Angeles ┆ … ┆ US      ┆ 33.96    ┆ -

In [35]:
# Fetch profiles for the sampled rows only; output goes under
# '<states_path>/zips/' and '<states_path>/counties/'.
ga = GeographicsAcquisition(states_path)
ga.run(sample_zips, sample_counties)