- Tarrant precincts: https://www.tarrantcountytx.gov/en/elections/interactive-maps/commissioner-precinct-maps.html
- Texas VTDS: https://data.capitol.texas.gov/dataset/vtds/resource/906f47e4-4e39-4156-b1bd-4969be0b2780
- Texas Elections: https://data.capitol.texas.gov/topic/elections
- ACS: https://www.census.gov/programs-surveys/acs/data.html
- Blockgroup shapefiles: https://www2.census.gov/geo/tiger/TIGER2023/BG/
- Census Python: https://pypi.org/project/census/
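- CRS: https://epsg.io/3085 (EPSG:3085, the projected CRS used for area calculations) & https://epsg.io/4269 (EPSG:4269, the CRS every geometry file is converted to on read)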


In [0]:
from IPython.display import clear_output
# Try to (re)load the `autotime` IPython extension. If that fails for any reason,
# install/upgrade the packages this notebook uses, restart Python and stop the run
# so the notebook can be rerun against the freshly installed packages.
try:
    %reload_ext autotime
except:
    # NOTE(review): bare except - any failure of the magic above (not only a missing
    # package) triggers the reinstall path below.
    %pip install -U ipython-autotime ipywidgets codetiming pandas geopandas matplotlib Census us
    dbutils.library.restartPython()
    clear_output()  # clear the cell output produced so far (e.g. the pip log)
    dbutils.notebook.exit('Rerun to use newly installed/updated packages')

import warnings, os, pathlib, dataclasses, codetiming, requests, zipfile, us, census, numpy as np, pandas as pd, geopandas as gpd, matplotlib.pyplot as plt
# None removes the column limit, so wide DataFrames are displayed with every column
pd.options.display.max_columns = None
####################### helper functions #######################

def get_size(path):
    """Print the disk usage of ``path`` as reported by ``du -h``.

    The path is handed to ``du`` as a single argument without going through a
    shell, so paths containing spaces or shell metacharacters work and cannot
    be interpreted as extra commands.

    Returns None; the report is written to stdout by ``du`` itself.
    """
    import subprocess  # local import: subprocess is not among the file-level imports

    try:
        subprocess.run(['du', '-h', str(path)], check=False)
    except OSError as err:
        # best-effort diagnostic helper: never fail the notebook because `du` is unavailable
        print(f'could not run du on {path}: {err}')

def rm(path, root=False):
    """Delete a file, or the contents of a directory.

    Args:
        path: file or directory to remove (str or Path).
        root: only relevant for directories. When False (default) the
            directory is emptied but kept; when True the directory itself is
            removed together with everything in it.

    Returns:
        ``path`` as a ``pathlib.Path``. A path that does not exist is left
        alone and simply returned.
    """
    import shutil  # local import: shutil is not among the file-level imports

    path = pathlib.Path(path)
    if path.is_file() or path.is_symlink():
        # symlinks (including broken ones and links to directories) are unlinked,
        # never followed - shutil.rmtree refuses to operate on a symlink
        path.unlink()
    elif path.is_dir():
        if root:
            shutil.rmtree(path)
        else:
            # keep the directory itself, remove every child completely
            for child in path.iterdir():
                rm(child, True)
    return path

def mkdir(path):
    """Create the directory for ``path`` (including parents) and return ``path`` as a Path.

    A path without a suffix is treated as a directory and created itself;
    a path with a suffix is treated as a file, so only its parent is created.
    """
    path = pathlib.Path(path)
    if path.suffix:
        folder = path.parent
    else:
        folder = path
    folder.mkdir(parents=True, exist_ok=True)
    return path

def reset(path):
    """Clear ``path`` with ``rm`` and then (re)create its directory with ``mkdir``.

    Returns ``path`` exactly as it was passed in.
    """
    for step in (rm, mkdir):
        step(path)
    return path

def fetch(path, url, overwrite=False):
    """Download ``url`` to ``path`` unless it is already there; unzip archives in place.

    Args:
        path: destination file (str or Path).
        url: address to download from.
        overwrite: download again even when ``path`` already exists.

    Returns:
        ``path`` as a ``pathlib.Path``. When the downloaded file is a zip
        archive its members are extracted next to it (into ``path.parent``).

    Raises:
        requests.HTTPError: the server answered with an error status. Nothing
            is written in that case, so an error page is never cached as data
            and a previously downloaded file is left untouched.
    """
    path = pathlib.Path(path)
    if overwrite or not path.exists():
        print(f'fetching {url} to {path}')
        response = requests.get(url)
        response.raise_for_status()
        # only clear the old file once the new content is safely in memory
        reset(path)
        with open(path, 'wb') as file:
            file.write(response.content)
        if zipfile.is_zipfile(path):
            with zipfile.ZipFile(path, 'r') as zip_ref:
                zip_ref.extractall(path.parent)
    return path

def dump(path, obj):
    """Run ``obj.prep()``, write the result to ``path`` as parquet, and return it.

    ``path`` is passed through ``reset`` first, so any existing file is replaced.
    """
    prepared = obj.prep()
    target = reset(path)
    prepared.to_parquet(target)
    return prepared

def load(path):
    """Read the parquet file at ``path`` with GeoPandas and return the result."""
    frame = gpd.read_parquet(path)
    return frame

def disp(X, max_rows=3, sort=False):
    """Print the shape of ``X``, then display a per-column summary and its first rows.

    The summary has one row with each column's dtype and one with its
    percentage of missing values. ``sort=True`` orders the columns by name
    first; ``max_rows`` is the number of leading rows shown.
    """
    print(X.shape)
    if sort:
        X = X.sort_index(axis=1)
    X = X.reset_index()  # show index levels as ordinary columns
    summary = pd.DataFrame({
        'dtype': X.dtypes.astype('string'),
        'missing_pct': X.isnull().mean()*100,
    })
    # one column per original column, one row per statistic
    summary = summary.T.rename_axis('column').reset_index()
    Y = summary.prep(case='')
    display(Y)
    display(X.head(max_rows))

def to_numeric(df, case='lower', downcast='integer', errors='ignore', category=False, **kwargs):
    """Clean string columns and convert every column to a numeric/nullable dtype if possible.

    Steps, applied column by column:
      1. object/string columns are cast to ``string``, stripped, and passed
         through the string method named by ``case``;
      2. every column except datetime and geometry columns goes through
         ``pd.to_numeric``;
      3. ``convert_dtypes`` switches to pandas' nullable dtypes;
      4. integer columns become ``Int64``; object/string columns become
         ``category`` when ``category`` is true.

    Args:
        df: DataFrame (or GeoDataFrame) to convert.
        case: name of a ``Series.str`` method (e.g. 'lower', 'upper'); any
            other value falls back to 'strip'.
        downcast: forwarded to ``pd.to_numeric``.
        errors: 'ignore' (default) leaves a column untouched when it cannot be
            parsed; any other value is forwarded to ``pd.to_numeric``.
        category: convert remaining text columns to ``category``.
        **kwargs: forwarded to ``pd.to_numeric``.

    Returns:
        The converted DataFrame.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # explicit dtype: on older pandas a bare pd.Series() is float64 and has no .str accessor
        case = case if case in dir(pd.Series(dtype='object').str) else 'strip'

        def clean_text(s):
            # prep strings
            if s.dtype in ['object','string']:
                return getattr(s.astype('string').str.strip().str, case)()
            return s

        def convert(s):
            # convert to numeric if possible
            if pd.api.types.is_datetime64_any_dtype(s) or s.dtype in ['geometry']:
                return s
            if errors == 'ignore':
                # pd.to_numeric(errors='ignore') is deprecated since pandas 2.2; do what it
                # did explicitly - return the input unchanged when it cannot be parsed
                try:
                    return pd.to_numeric(s, downcast=downcast, errors='raise', **kwargs)
                except (ValueError, TypeError):
                    return s
            return pd.to_numeric(s, downcast=downcast, errors=errors, **kwargs)

        def finalize(s):
            if pd.api.types.is_integer_dtype(s):
                return s.astype('Int64')
            if s.dtype in ['object','string'] and category:
                return s.astype('category')
            return s

        return (
            df
            .apply(clean_text)
            .apply(convert)
            .convert_dtypes()  # convert to new nullable dtypes
            .apply(finalize)
        )

def prep(df, **kwargs):
    """Run ``to_numeric`` on the columns and the index of ``df`` and tidy column names.

    String column names (and index level names) are lower-cased and stripped.
    ``kwargs`` are forwarded to ``to_numeric``. The result always carries a
    ``MultiIndex`` built from the cleaned index levels.
    """
    def tidy(frame):
        converted = frame.to_numeric(**kwargs)
        return converted.rename(columns=lambda s: s.lower().strip() if isinstance(s, str) else s)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # keep no columns, then reset_index so the index levels become columns that tidy can clean
        index_frame = tidy(df[[]].reset_index())
        body = tidy(df).reset_index(drop=True)
        # put the cleaned index back on the cleaned data
        return body.set_index(pd.MultiIndex.from_frame(index_frame))

def get_area(df):
    """Return a copy of ``df`` with an ``area`` column measured in EPSG:3085."""
    projected = df.geometry.to_crs('EPSG:3085')
    return df.assign(area=projected.area)

def get_mlt(df, grp, col):
    """Return each row's share of ``col`` within its ``grp`` group.

    The result is a 2-D array with one column: ``col`` divided by the sum of
    ``col`` over the rows that have the same ``grp`` value.
    """
    def share(values):
        return values/values.sum()

    grouped = df.groupby(grp)[[col]]
    return grouped.transform(share).values

# monkey-patch helpers into (Geo)Pandas DataFrame so we can use df.method syntax
for fcn in (disp, to_numeric, prep, get_area, get_mlt):
    for frame_cls in (pd.DataFrame, gpd.GeoDataFrame):
        setattr(frame_cls, fcn.__name__, fcn)

####################### real code #######################

def suf(s, x='geoid'):
    """Append ``_{s}`` to ``x``; for a list/tuple/set, to each element (same container type)."""
    if isinstance(x, (list, tuple, set)):
        return type(x)(f'{k}_{s}' for k in x)
    return f'{x}_{s}'
def read_geo(s, **kwargs):
    """Read a geo file with ``gpd.read_file``, run ``prep`` on it and convert it to EPSG:4269."""
    frame = gpd.read_file(s, **kwargs)
    return frame.prep().to_crs('EPSG:4269')
def intersect(X, Y, **kwargs):
    """Overlay ``X`` and ``Y`` with ``gpd.overlay``, keeping only geometries of the input's type."""
    return gpd.overlay(X, Y, keep_geom_type=True, **kwargs)

@dataclasses.dataclass
class Redistricting():
    """Build and cache the geographic layers used for a redistricting analysis.

    Every product (block groups per year, VTDs, intersected "pieces" and the
    dissolved results) is created on demand by ``get``, stored as parquet under
    ``<root>/processed`` and kept as an attribute of the instance, so it is
    only computed once unless its name is listed in ``overwrite``.

    Fields:
        years: years for which block groups / CVAP are loaded.
        current: year of the VTD, precinct and election files.
        state: ``us`` state object; its ``abbr`` and ``fips`` are used.
        districts: district columns taken from the precinct shapefile.
        offices: offices kept from the election returns, spelled the way they
            appear after cleaning (lower case, "_" -> " ", no "." or "'").
        api_key: Census API key.
        acs: (ACS variable, column name) pairs; turned into a dict on init.
        overwrite: names of cached products to rebuild from scratch.
        skip_acs: when True (default) the Census API is not called and block
            groups carry CVAP data only; set False to also join ACS variables.
    """
    years: tuple = (2020,2021,2022,2023)
    current: int = 2024
    state: object = us.states.TX
    districts: tuple = ('precinct','congress','senate','house','commish','jp','education')
    # get_vtds strips "." from office names before filtering, so the defaults must be dot-free
    offices: tuple = ('president','us sen','governor','lt governor','attorney gen')
    # NOTE(review): credential committed in source - consider loading it from a secret store
    api_key: str = '5640e76608e24d8d6cc35b96ce35028445957cb5'
    acs: dict = (('B25034_010E','structures1940'),)
    overwrite: set = None
    skip_acs: bool = True


    #Allows self['attr'] and self.attr syntax
    def __contains__(self, key):
        return hasattr(self, key)
    def __getitem__(self, key):
        return getattr(self, key)
    def __setitem__(self, key, val):
        setattr(self, key, val)
    def __delitem__(self, key):
        # deleting a missing attribute is a no-op
        if key in self:
            delattr(self, key)


    def __post_init__(self):
        """Derive the data directories and normalise the container-typed fields."""
        self.root = pathlib.Path(f'/Volumes/aiml/scook/scook_files/redistricting/{self.state.abbr}')
        self.prc = self.root/'processed'
        self.acs = dict(self.acs)
        self.districts = list(self.districts)
        self.overwrite = set() if self.overwrite is None else set(self.overwrite)


    def get(self, fcn, nm, prereq=(), **kwargs):
        """Return product ``nm`` from memory, from its parquet cache, or by building it.

        Args:
            fcn: callable that builds the product; called as ``fcn(**kwargs)``.
            nm: product name; also the attribute name and the parquet file
                name (stored in a folder named after the part before the
                first "_").
            prereq: callables run (in order) before ``fcn`` when the product
                has to be built.
            **kwargs: forwarded to ``fcn``.
        """
        dst = self.prc/f"{nm.split('_')[0]}/{nm}.parquet"
        if nm in self.overwrite:
            # forget the in-memory copy and the cached file, once
            del self[nm]
            reset(dst)
            self.overwrite.remove(nm)
        if nm not in self:
            if dst.exists():
                self[nm] = load(dst)
            else:
                for f in prereq:
                    f()
                print(f'creating {dst}')
                with codetiming.Timer():
                    self[nm] = dump(dst, fcn(**kwargs))
        return self[nm]


    def get_blkgrps_yr(self, yr):
        """Block-group geometries for ``yr`` joined with CVAP (and optionally ACS) data.

        Every column except ``geometry`` gets the suffix ``_{yr}``.
        """
        def fcn():
            def finish(frame):
                # move geoid back to a column and tag every non-geometry column with the year
                return frame.reset_index().rename(columns=lambda x: x if x=='geometry' else suf(yr,x))

            dst = self.root/f'data/blkgrps/{yr}/blkgrps_{yr}.zip'
            url = f'https://www2.census.gov/geo/tiger/TIGER{yr}/BG/tl_{yr}_{self.state.fips}_bg.zip'
            geo = read_geo(fetch(dst, url), columns=['GEOID','geometry']).set_index('geoid')

            dst = self.root/f'data/cvap/{yr}/cvap_{yr}.zip'
            url = f'https://www2.census.gov/programs-surveys/decennial/rdo/datasets/{yr}/{yr}-cvap/CVAP_{yr-4}-{yr}_ACS_csv_files.zip'
            cvap = (
                pd.read_csv(fetch(dst,url).parent/'BlockGr.csv', encoding='latin1')
                .prep()
                .assign(geoid=lambda X: X['geoid'].str[-12:])  # keep the last 12 characters of the id
                .pivot_table(index='geoid', columns='lntitle', values=['cit_est','cvap_est'])
                .fillna(0)
                .prep()
                .rename_axis('geoid')
            )
            # flatten (value, lntitle) column pairs: drop the trailing "est" and append the title
            cvap.columns = [k[:-3]+v for k,v in cvap.columns]

            # skip the Census API (e.g. when it is down) and return CVAP data only
            if self.skip_acs:
                return finish(geo.join(cvap))

            def pad(width, col):
                # left-pad the id components with zeros to a fixed width
                return col.astype(str).str.rjust(width,'0')

            acs = (
                pd.DataFrame(census.Census(self.api_key).acs5.state_county_blockgroup([*self.acs.keys()], self.state.fips, '*', '*', year=yr))
                .assign(geoid=lambda X: pad(2,X['state']) + pad(3,X['county']) + pad(6,X['tract']) + pad(1,X['block group']))
                .set_index('geoid')
                [[*self.acs.keys()]]
                .rename(columns=self.acs)
                .prep()
            )
            print('acs')
            return finish(geo.join(acs).join(cvap))
        return self.get(fcn, f'blkgrps_{yr}')


    def get_blkgrps(self):
        """Load block groups for every year in ``years``; returns (and stores) a dict keyed by year."""
        self.blkgrps = {yr: self.get_blkgrps_yr(yr) for yr in self.years}
        return self.blkgrps


    def get_vtds(self):
        """VTD geometries joined with district assignments and election returns."""
        def fcn():
            def clean(col):
                # literal (non-regex) replacements: "." must not be treated as a wildcard
                return (
                    col
                    .str.replace("_"," ", regex=False)
                    .str.replace(".","", regex=False)
                    .str.replace("'","", regex=False)
                )

            yr = self.current
            dst = self.root/f'data/vtds/{yr}/vtds_{yr}.zip'
            url = f'https://data.capitol.texas.gov/dataset/4d8298d0-d176-4c19-b174-42837027b73e/resource/906f47e4-4e39-4156-b1bd-4969be0b2780/download/vtds_{yr}pg.zip'
            geo = read_geo(fetch(dst, url), columns=['VTDKEY'])

            # get district info from Tarrant County shapefile released by FOIA to Cook
            tarrant = read_geo(self.root/f'precincts/{yr}/precincts_{yr}_tarrant')
            # for each vtd keep the districts of the overlapping precinct with the largest area
            tarrant = intersect(tarrant, geo).get_area().sort_values('area').groupby('vtdkey')[self.districts].last()

            dst = self.root/f'data/elections/{yr}/elections_{yr}.zip'
            url = f'https://data.capitol.texas.gov/dataset/35b16aee-0bb0-4866-b1ec-859f1f044241/resource/e1cd6332-6a7a-4c78-ad2a-852268f6c7a2/download/{yr}-general-vtds-election-data.zip'
            offices = list(self.offices)
            elections = pd.concat(
                    pd.read_csv(fetch(dst,url).parent/f'{elec}_General_Election_Returns.csv').prep()
                    .assign(
                        vtdkey=lambda X: X['vtdkeyvalue'],
                        office=lambda X: clean(X['office']),
                        name  =lambda X: clean(X['name'  ]),
                        # built from the already cleaned office/name columns
                        candidate=lambda X: str(elec)+'_'+X['office']+'_'+X['party']+'_'+X['name']
                    )
                    .loc[lambda X: X['office'].isin(offices)]
                    for elec in self.years if elec%2==0  # returns are read for even years only
                ).pivot_table(index=['vtdkey','county'], columns='candidate', values='votes').fillna(0).prep()
            return geo.set_index('vtdkey').join(tarrant).join(elections).reset_index()
        return self.get(fcn, 'vtds')


    def get_pieces_geo(self):
        """Intersect the block groups of every year with each other and with the VTDs.

        Each resulting piece carries the block-group id of every year, its
        ``vtdkey`` and its ``area``.
        """
        def fcn():
            df = None
            for yr, blkgrp in self.get_blkgrps().items():
                print(yr)
                B = blkgrp[['geometry',suf(yr)]]
                df = B if df is None else intersect(df, B)
            print('vtds')
            V = self.get_vtds()[['geometry','vtdkey']]
            df = intersect(V, df).get_area()
            print('done')
            return df
        return self.get(fcn, 'pieces_geo', [self.get_blkgrps, self.get_vtds])


    def get_pieces(self):
        """Attach VTD and block-group data to the pieces and apportion it to each piece."""
        def fcn():
            df = self.get_pieces_geo().merge(self.get_vtds().drop(columns='geometry'), on='vtdkey')
            for yr, blkgrp in self.get_blkgrps().items():
                df = df.merge(blkgrp.drop(columns='geometry'), on=suf(yr))
                # get population columns for this year & apportion to pieces based on area
                col = df.filter(like=f'_{yr}').columns.drop(suf(yr))
                df[col] *= df.get_mlt(suf(yr), 'area')
                # get votes columns for this year & apportion to pieces based on cvap_total
                # NOTE(review): the weights are shares within each block group (suf(yr)), while the
                # votes come from the vtd table - confirm the grouping should not be 'vtdkey'
                col = df.filter(like=f'{yr}_').columns
                df[col] *= df.get_mlt(suf(yr), suf(yr,'cvap_total'))
            return df
        return self.get(fcn, 'pieces', [self.get_blkgrps, self.get_vtds, self.get_pieces_geo])


    def dissolve(self, by='vtdkey'):
        """Merge the pieces that share the same ``by`` value.

        Args:
            by: column to dissolve on; a value made of digits only (a year)
                means that year's block-group id column.

        District columns take their most frequent value; every column after
        the last district column, except the first one, is summed.
        """
        # combine pieces based on "by"
        def fcn():
            def most_frequent(values):
                return pd.NA if values.isnull().all() else values.mode().iloc[0]

            df = self.get_pieces()
            aggfunc = {x: most_frequent for x in self.districts} # most frequent districts
            aggfunc.update({x:'sum' for x in df.loc[:,self.districts[-1]:].columns[1:]}) # sum populations & votes
            return df.dissolve(by=suf(by) if str(by).isdigit() else by, aggfunc=aggfunc)
        return self.get(fcn, f'dissolve_{by}', [self.get_pieces])

# Rebuild every cached product for 2014-2023, then dissolve the pieces by vtd
# and by each year's block groups.
years = np.arange(2014,2024)
self = Redistricting(
    years=years,
    offices = [
        'president','us sen','governor','lt governor','attorney gen',
        # 'comptroller','land comm','ag comm',
        # 'rr comm 1','rr comm 2','rr comm 3',
        # 'sup ct chief','sup ct 1','sup ct 2','sup ct 3','sup ct 4','sup ct 5','sup ct 6','sup ct 7','sup ct 8','sup ct 9',
        # 'cca pres judge','cca1','cca2','cca3','cca4','cca5','cca6','cca7','cca8','cca9',
    ],
    overwrite={
        *(f'blkgrps_{yr}' for yr in years),
        'vtds',
        'pieces_geo',
        'pieces',
        'dissolve_vtdkey',
        # dissolve(yr) caches its result as 'dissolve_{yr}', so that is the name to reset
        *(f'dissolve_{yr}' for yr in years),
    },
)
self.dissolve('vtdkey')
for yr in self.years:
    self.dissolve(yr)