- Tarrant precincts: https://www.tarrantcountytx.gov/en/elections/interactive-maps/commissioner-precinct-maps.html
- Texas VTDs (voting tabulation districts): https://data.capitol.texas.gov/dataset/vtds/resource/906f47e4-4e39-4156-b1bd-4969be0b2780
- Texas Elections: https://data.capitol.texas.gov/topic/elections
- ACS: https://www.census.gov/programs-surveys/acs/data.html
- Blockgroup shapefiles: https://www2.census.gov/geo/tiger/TIGER2023/BG/
- Census Python: https://pypi.org/project/census/
- CRS: https://epsg.io/3085 (used for area calculations) & https://epsg.io/4269 (used for stored geometries)


In [0]:
from IPython.display import clear_output, display
# Bootstrap: try to (re)load the autotime extension. If that fails for any reason,
# install/upgrade the notebook's dependencies, restart Python and stop this run so
# the notebook can be rerun with the new packages.
try:
    %reload_ext autotime
except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit - consider narrowing
    %pip install -U ipython-autotime ipywidgets codetiming numpy pandas geopandas matplotlib Census us
    # presumably the Databricks `dbutils` helper (it is not defined in this file) - TODO confirm
    dbutils.library.restartPython()
    clear_output()  # drop the install output from the cell
    dbutils.notebook.exit('Rerun to use newly installed/updated packages')

import warnings, os, pathlib, dataclasses, codetiming, requests, zipfile, us, census, numpy as np, pandas as pd, geopandas as gpd, pyarrow.parquet as pq, matplotlib.pyplot as plt
# Never truncate columns when displaying frames, and silence FutureWarning noise.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', category=FutureWarning)
################################################################
####################### helper functions #######################
################################################################

####################### dataframe extensions #######################
def disp(X, max_rows=3, sort=False):
    """Print the type and shape of ``X`` and display a short summary of it.

    The index is moved into the columns first, so index levels are summarised
    like ordinary columns. Two things are displayed: a one-row-per-statistic
    table (dtype and percentage of missing values for every column) and the
    first ``max_rows`` rows.

    Args:
        X: frame to inspect.
        max_rows: number of leading rows to display.
        sort: if True, columns are sorted by name before displaying.
    """
    print(type(X), X.shape)
    if sort:
        X = X.sort_index(axis=1)
    X = X.reset_index()

    summary = pd.DataFrame({
        'dtype': X.dtypes.astype('string'),
        'missing_pct': X.isnull().mean() * 100,
    })
    # transpose so each original column becomes a column of the summary
    summary = summary.T.rename_axis('column').reset_index().prep(case='')

    print(X.shape)
    display(summary)
    display(X.head(max_rows))

def to_numeric(df, case='lower', downcast='integer', errors='ignore', category=False, **kwargs):
    """Normalise string columns and convert every column to a numeric dtype where possible.

    Args:
        df: frame to convert; a new frame is returned.
        case: name of a ``Series.str`` method applied to string columns after
            whitespace is stripped (e.g. 'lower', 'upper'). Anything that is not
            a ``Series.str`` attribute falls back to 'strip'.
        downcast: forwarded to ``pd.to_numeric``.
        errors: 'ignore' leaves a column untouched when it cannot be parsed;
            any other value is forwarded to ``pd.to_numeric`` unchanged.
        category: if True, columns that are still strings at the end are
            converted to 'category'.
        **kwargs: forwarded to ``pd.to_numeric``.

    Returns:
        Frame with nullable dtypes; integer columns are widened to 'Int64'.
    """
    # Look the method up on the accessor class rather than on a throwaway empty Series.
    case = case if case in dir(pd.Series.str) else 'strip'

    def prep_strings(s):
        if s.dtype in ['object', 'string']:
            return getattr(s.astype('string').str.strip().str, case)()
        return s

    def coerce(s):
        # datetimes, categoricals and geometries are never parsed as numbers
        if pd.api.types.is_datetime64_any_dtype(s) or s.dtype in ['category', 'geometry']:
            return s
        if errors != 'ignore':
            return pd.to_numeric(s, downcast=downcast, errors=errors, **kwargs)
        # errors='ignore' is deprecated in pd.to_numeric; emulate it explicitly:
        # return the input unchanged when parsing fails.
        try:
            return pd.to_numeric(s, downcast=downcast, errors='raise', **kwargs)
        except (ValueError, TypeError):
            return s

    def finalize(s):
        if pd.api.types.is_integer_dtype(s):
            return s.astype('Int64')
        if s.dtype in ['object', 'string'] and category:
            return s.astype('category')
        return s

    return (
        df
        .apply(prep_strings)
        .apply(coerce)
        .convert_dtypes()  # switch to the nullable extension dtypes
        .apply(finalize)
    )

def prep(df, **kwargs):
    """Normalise a frame's values, column labels and index.

    Values go through ``to_numeric(**kwargs)``; string column labels are
    lower-cased and stripped. The index is cleaned the same way by moving it
    into columns, cleaning it, and re-attaching it as a ``MultiIndex``.
    GeoDataFrames are additionally reprojected to EPSG:4269.
    """
    def clean_label(label):
        return label.lower().strip() if isinstance(label, str) else label

    def normalize(frame):
        return frame.to_numeric(**kwargs).rename(columns=clean_label)

    # df[[]] keeps only the index; reset_index turns its levels into columns
    index_frame = normalize(df[[]].reset_index())
    cleaned = normalize(df).reset_index(drop=True)
    cleaned = cleaned.set_index(pd.MultiIndex.from_frame(index_frame))

    if isinstance(cleaned, gpd.GeoDataFrame):
        return cleaned.to_crs('EPSG:4269')
    return cleaned

def buffer(df, distance=0, **kwargs):
    """Return a copy of ``df`` whose geometry is buffered by ``distance``.

    Extra keyword arguments are forwarded to the geometry's ``buffer`` method.
    """
    buffered = df.geometry.buffer(distance, **kwargs)
    return df.assign(geometry=buffered)

def simplify(df, tolerance=0, **kwargs):
    """Return a copy of ``df`` whose geometry is simplified with ``tolerance``.

    Extra keyword arguments are forwarded to the geometry's ``simplify`` method.
    """
    simplified = df.geometry.simplify(tolerance, **kwargs)
    return df.assign(geometry=simplified)

def overlay(df, other, **kwargs):
    """Overlay ``df`` with ``other`` after reprojecting ``other`` to ``df``'s CRS.

    Only geometries of the same type as the input are kept
    (``keep_geom_type=True``); ``kwargs`` (e.g. ``how``) go to ``gpd.overlay``.
    """
    aligned = other.to_crs(df.crs)
    return gpd.overlay(df, aligned, keep_geom_type=True, **kwargs)

def refine(df, other, **kwargs):
    """Concatenate the 'intersection' and 'difference' overlays of ``df`` with ``other``."""
    parts = [df.overlay(other, how=mode, **kwargs) for mode in ('intersection', 'difference')]
    return pd.concat(parts)

def get_area(df):
    """Return a copy of ``df`` with an ``area`` column computed in EPSG:3085."""
    projected = df.geometry.to_crs('EPSG:3085')
    return df.assign(area=projected.area)

def get_proportion(df, grp, col):
    """Return each row's share of ``col`` within its ``grp`` group.

    The result is a 2-D array with one column, row-aligned with ``df``.
    """
    def share(values):
        return values / values.sum()

    grouped = df.groupby(grp)[[col]]
    return grouped.transform(share).values

# Monkey-patch the helpers into (Geo)Pandas DataFrame so we can use df.method syntax.
# GeoDataFrame is patched explicitly as well as pd.DataFrame.
for fcn in (disp, to_numeric, prep, simplify, buffer, overlay, refine, get_area, get_proportion):
    for frame_cls in (pd.DataFrame, gpd.GeoDataFrame):
        setattr(frame_cls, fcn.__name__, fcn)

####################### file i/o #######################

def get_size(path):
    """Print the disk usage of ``path`` as reported by ``du -h``.

    The path is passed as a separate argument instead of being interpolated
    into a shell string, so paths containing spaces or shell metacharacters
    are handled correctly. Returns None; a failing ``du`` is not an error.
    """
    import subprocess  # local import: not part of the module-level import line

    subprocess.run(['du', '-h', str(path)], check=False)

def rm(path, root=False):
    """Delete a file, or empty (and optionally remove) a directory.

    Args:
        path: file or directory to remove; missing paths are ignored.
        root: for a directory, True removes the directory itself; False (the
            default) removes only its contents and keeps the directory.

    Returns:
        ``path`` as a ``pathlib.Path``.
    """
    import shutil  # local import: shutil is not in the module-level import line

    path = pathlib.Path(path)
    if path.is_file():
        path.unlink()
    elif path.is_dir():
        if root:
            shutil.rmtree(path)
        else:
            # keep the directory itself, remove every entry inside it
            for p in path.iterdir():
                rm(p, True)
    return path

def mkdir(path):
    """Create the directory for ``path`` (with parents) and return ``path`` as a Path.

    A path with a suffix is treated as a file, so its parent directory is
    created; a path without a suffix is created as a directory itself.
    """
    path = pathlib.Path(path)
    target = path.parent if path.suffix else path
    target.mkdir(parents=True, exist_ok=True)
    return path

def reset(path):
    """Clear whatever is at ``path`` and make sure its directory exists.

    Returns ``path`` exactly as it was passed in.
    """
    rm(path)     # delete the file, or empty the directory
    mkdir(path)  # (re)create the directory that holds / is the path
    return path

def dump(path, obj, **kwargs):
    """Write ``obj`` to ``path`` as parquet, replacing anything already there.

    ``kwargs`` are forwarded to ``obj.to_parquet``.
    """
    target = reset(path)
    obj.to_parquet(target, **kwargs)

def load(path, coerce=False, **kwargs):
    """Read ``path`` into a (Geo)DataFrame and return it after ``prep()``.

    The reader is chosen by file suffix: '.csv' uses pandas, '.parquet' uses
    geopandas when the file has a 'geometry' column and pandas otherwise, and
    every other suffix goes to ``gpd.read_file``. ``kwargs`` are forwarded to
    the reader. ``coerce`` is accepted but not used by this function.
    """
    path = pathlib.Path(path)
    ext = path.suffix
    if ext == '.csv':
        obj = pd.read_csv(path, **kwargs)
    elif ext == '.parquet':
        # inspect the schema only, to decide which library should read the file
        has_geometry = 'geometry' in pq.ParquetFile(path).schema.names
        reader = gpd.read_parquet if has_geometry else pd.read_parquet
        obj = reader(path, **kwargs)
    else:
        obj = gpd.read_file(path, **kwargs)
    return obj.prep()

#########################################################
####################### real code #######################
#########################################################

def suf(s, x='blkgrp'):
    """Append ``_{s}`` to ``x``.

    A list, tuple or set ``x`` yields a list with the suffix applied to each
    element; any other ``x`` yields a single string.
    """
    if not isinstance(x, (list, tuple, set)):
        return f'{x}_{s}'
    return [f'{item}_{s}' for item in x]

@dataclasses.dataclass
class Data():
    """Download, cache and combine the raw geographic and election inputs for one state.

    Every dataset is identified by a name ``nm``. It is cached twice: as an
    attribute on the instance (``self[nm]``) and as a parquet file below
    ``self.root`` (see ``dst``). Names listed in ``overwrite`` are rebuilt once
    instead of being read from either cache.
    """
    # years for which per-year datasets (e.g. block groups) are built
    years: tuple = (2020,2021,2022,2023)
    # year interpolated into the election and VTD download urls
    current: int = 2024
    # state object; its ``abbr`` and ``fips`` attributes are read
    state: object = us.states.TX
    # NOTE(review): API key committed in source - consider loading it from a secret store
    api_key: str = '5640e76608e24d8d6cc35b96ce35028445957cb5'
    # dataset names whose caches must be rebuilt (None means none)
    overwrite: set = None

    #Allows self['attr'] and self.attr syntax
    def __contains__(self, key):
        return hasattr(self, key)
    def __getitem__(self, key):
        return getattr(self, key)
    def __setitem__(self, key, val):
        setattr(self, key, val)
    def __delitem__(self, key):
        # deleting a missing key is a no-op
        if key in self:
            delattr(self, key)

    def __post_init__(self):
        """Set the storage root and expand per-year names in ``overwrite``."""
        self.root = pathlib.Path(f'/Volumes/aiml/scook/scook_files/redistricting/{self.state.abbr}')
        self.overwrite = set() if self.overwrite is None else set(self.overwrite)
        # a bare per-year name stands for one dataset per configured year
        for nm in self.overwrite.intersection({'blkgrps', 'acs', 'cvap', 'dissolved_blkgrp'}):
            # use this instance's years (not a module-level ``years`` variable)
            self.overwrite |= {f'{nm}_{yr}' for yr in self.years}
            self.overwrite -= {nm}

    def dst(self, nm, suf):
        """Return the storage path for dataset ``nm`` with file suffix ``suf``.

        Underscores in ``nm`` become directory separators, e.g. 'acs_2020' with
        suffix 'parquet' maps to ``<root>/data/acs/2020/acs_2020.parquet``.
        """
        return self.root/f"data/{nm.replace('_','/')}/{nm}.{suf.strip('.')}"

    def fetch(self, nm, url=None, unzip=True, **kwargs):
        """Download ``url`` to the zip path of ``nm`` unless that file already exists.

        Args:
            nm: dataset name.
            url: source url; when falsy nothing is downloaded.
            unzip: extract the archive next to it when it is a valid zip file.
            **kwargs: accepted and ignored.

        Returns:
            The zip path (which may not exist when no url was given).

        Raises:
            requests.HTTPError: if the server answers with an error status. Nothing
                is written in that case, so a later call will try again.
        """
        dst = self.dst(nm, 'zip')
        if url and not dst.exists():
            print(f'fetching {url} to {dst}')
            response = requests.get(url)
            # do not cache an error page as if it were the archive
            response.raise_for_status()
            reset(dst)
            with open(dst, 'wb') as file:
                file.write(response.content)
            if unzip and zipfile.is_zipfile(dst):
                with zipfile.ZipFile(dst, 'r') as file:
                    file.extractall(dst.parent)
        return dst

    def get(self, nm, fcn=None, prereq=(), coerce=False, **kwargs):
        """Return dataset ``nm``, building and caching it when necessary.

        Lookup order: instance attribute, parquet cache, then creation. A name in
        ``self.overwrite`` has both caches cleared first (once).

        Args:
            nm: dataset name.
            fcn: builder called as ``fcn(**kwargs)``; its result is ``prep()``-ed.
                Without a builder the dataset is loaded from the file returned by
                ``fetch(nm, unzip=False, **kwargs)``.
            prereq: callables run (in order) before the dataset is created.
            coerce: when no builder and no source file exist, store an empty
                GeoDataFrame instead of failing.
            **kwargs: forwarded to ``fcn`` or ``fetch``.

        Raises:
            FileNotFoundError: no builder, no source file and ``coerce`` is False.
        """
        dst = self.dst(nm, 'parquet')
        if nm in self.overwrite:
            del self[nm]
            reset(dst)
            self.overwrite -= {nm}
        if nm not in self:
            if dst.exists():
                self[nm] = load(dst)
            else:
                for f in prereq:
                    f()
                with codetiming.Timer():
                    print(f'creating {dst}')
                    if fcn:
                        self[nm] = fcn(**kwargs).prep()
                    else:
                        src = self.fetch(nm, unzip=False, **kwargs)
                        if src.exists():
                            self[nm] = load(src)
                        elif coerce:
                            print(f'cannot create {nm} - returning empty GeoDataFrame')
                            self[nm] = gpd.GeoDataFrame(columns=['geometry'], crs='EPSG:4269')
                        else:
                            raise FileNotFoundError(f'cannot create {nm}: {src} does not exist')
                    dump(dst, self[nm])
        return self[nm]

    def run_years(self, nm, fcn):
        """Build ``{nm}_{yr}`` with ``fcn(yr=yr)`` for every year; return ``[[yr, dataset], ...]``."""
        return [[yr, self.get(f'{nm}_{yr}', fcn, yr=yr)] for yr in self.years]

    def get_elections(self):
        """Return the election returns, one row per record of the downloaded files."""
        def fcn():
            src = self.fetch('elections', url=f'https://data.capitol.texas.gov/dataset/35b16aee-0bb0-4866-b1ec-859f1f044241/resource/e1cd6332-6a7a-4c78-ad2a-852268f6c7a2/download/{self.current}-general-vtds-election-data.zip')
            # the first four characters of each file name are used as the year
            df = pd.concat(load(fn).assign(year=fn.stem[:4]) for fn in src.parent.iterdir() if 'General_Election_Returns' in fn.stem and 'City' not in fn.stem)
            for k in ['office','name']:
                # literal replacement: '.' must not be treated as a regex wildcard
                df[k] = df[k].str.replace('_',' ', regex=False).str.replace('.', '', regex=False)
            return df.rename(columns={'vtdkeyvalue':'vtdkey'})
        return self.get('elections', fcn)

    def get_blkgrps(self):
        """Return ``[[yr, block-group frame], ...]`` from the TIGER shapefiles of every year."""
        return [[yr, self.get(f'blkgrps_{yr}', url=f'https://www2.census.gov/geo/tiger/TIGER{yr}/BG/tl_{yr}_{self.state.fips}_bg.zip')] for yr in self.years]

    def get_vtds(self):
        """Return the VTD geometries for ``self.current``."""
        return self.get('vtds', url=f'https://data.capitol.texas.gov/dataset/4d8298d0-d176-4c19-b174-42837027b73e/resource/906f47e4-4e39-4156-b1bd-4969be0b2780/download/vtds_{self.current}pg.zip')

    def get_tarrant(self):
        """Return the 'tarrant' dataset, or an empty GeoDataFrame when no source file exists.

        No url is given, so the source zip is expected to be in place already.
        """
        return self.get('tarrant', coerce=True)

    def get_pieces(self):
        """Return the overlay of all block-group years, the VTDs and the tarrant precincts.

        Each piece carries one ``blkgrp_{yr}`` id per year, ``vtdkey``, ``precinct``
        where available, and its ``area``; pieces with ``area`` of 1 or less are dropped.
        """
        def fcn():
            L = [[suf(yr), B.filter(['geoid','geometry']).rename(columns={'geoid':suf(yr)})] for yr, B in self.get_blkgrps()]
            # start from the last year and intersect with the remaining ones
            nm, df = L.pop()
            print(nm)
            while L:
                nm, B = L.pop()
                print(nm)
                df = df.overlay(B)
            print('vtds')
            df = df.overlay(self.get_vtds().filter(['vtdkey','geometry']))
            print('tarrant')
            # refine keeps the parts outside the precincts as well
            df = df.refine(self.get_tarrant().filter(['precinct','geometry']))
            return df.get_area().query('area>1').simplify().buffer()
        return self.get('pieces', fcn, prereq=[self.get_blkgrps, self.get_vtds, self.get_tarrant])


@dataclasses.dataclass
class Redistricting(Data):
    """Combine geometry pieces with votes, ACS and CVAP data and aggregate them."""
    # ACS variable code -> column name; total population is always added
    acs_dict: dict = tuple()
    # extra district columns; block-group ids, 'vtdkey' and 'county' are always added
    districts: tuple = tuple()
    # offices whose results are kept by get_votes
    offices: tuple = ('president','u.s. sen','governor','lt. governor','attorney gen')

    def __post_init__(self):
        """Run the base setup, then complete ``acs_dict`` and ``districts``."""
        super().__post_init__()
        self.acs_dict = {'B01003_001E':'pop_total'} | dict(self.acs_dict)
        self.districts = [suf(yr) for yr in self.years] + ['vtdkey','county',*self.districts]

    def get_votes(self):
        """Return votes per (vtdkey, county), one column per candidate.

        Only the configured offices and years and parties 'd'/'r' are kept. Columns
        are named ``office_name_party_year``.
        """
        def fcn():
            # get_elections() removes '.' and turns '_' into ' ' in office names, so the
            # requested names are normalised the same way; otherwise 'u.s. sen' or
            # 'lt. governor' could never match.
            offices = [str(o).replace('_', ' ').replace('.', '').strip().lower() for o in self.offices]
            years = list(self.years)
            return (
                self.get_elections()
                .query(f'office in {offices} and year in {years} and party in ["d","r"]')
                .assign(candidate=lambda X: X['office']+'_'+X['name']+'_'+X['party']+'_'+X['year'].astype('string'))
                .pivot_table(index=['vtdkey','county'], columns='candidate', values='votes', fill_value=0)
                .rename_axis(columns=None).reset_index()
                )
        return self.get('votes', fcn, prereq=[self.get_elections])

    def get_cvap(self):
        """Return ``[[yr, frame], ...]`` of block-group CVAP estimates per year.

        Columns are ``cit_<lntitle>_<yr>`` and ``cvap_<lntitle>_<yr>`` plus the
        ``blkgrp_<yr>`` id (last 12 characters of ``geoid``).
        """
        def fcn(yr):
            src = self.fetch(f'cvap_{yr}', url=f'https://www2.census.gov/programs-surveys/decennial/rdo/datasets/{yr}/{yr}-cvap/CVAP_{yr-4}-{yr}_ACS_csv_files.zip')
            df = load(src.parent/'BlockGr.csv', encoding='latin1')
            df[suf(yr)] = df['geoid'].str[-12:]
            df = df.pivot_table(index=suf(yr), columns='lntitle', values=['cit_est','cvap_est'], fill_value=0)
            # ('cit_est', 'total') -> 'cit_total_<yr>'
            df.columns = [suf(yr, k[:-3]+v) for k,v in df.columns]
            return df.reset_index()
        return self.run_years('cvap', fcn)

    def get_acs(self):
        """Return ``[[yr, frame], ...]`` of the ACS 5-year variables in ``acs_dict`` per block group."""
        def fcn(yr):
            df = pd.DataFrame(census.Census(self.api_key).acs5.state_county_blockgroup([*self.acs_dict.keys()], self.state.fips, '*', '*', year=yr))
            # zero-pad each component to its fixed width before concatenating the id
            g = lambda s, x: x.astype(str).str.rjust(s,'0')
            df[suf(yr)] = g(2,df['state']) + g(3,df['county']) + g(6,df['tract']) + g(1,df['block group'])
            return df[[suf(yr), *self.acs_dict.keys()]].rename(columns={k:suf(yr,v) for k,v in self.acs_dict.items()})
        return self.run_years('acs', fcn)

    def merge(self, other):
        """Left-merge ``other`` into ``self.merged`` (or start ``self.merged`` with it).

        Best effort: if the merge raises (e.g. no common columns), the error is
        printed and ``self.merged`` is left unchanged.
        """
        if 'merged' not in self:
            self.merged = other
        else:
            try:
                self.merged = self.merged.merge(other, how='left')
            except Exception as e:
                print(e)

    def get_merged(self):
        """Return the pieces with votes, ACS and CVAP values apportioned to each piece."""
        def fcn():
            self.merge(self.get_pieces())
            self.merge(self.get_tarrant().filter({*self.districts}.difference({'county'})))
            votes = self.get_votes()
            self.merge(votes)
            ACS = self.get_acs()
            CVAP = self.get_cvap()
            # both lists are ordered by self.years, so popping keeps them in step
            while ACS:
                yr, A = ACS.pop()
                yr, C = CVAP.pop()
                self.merge(A)
                self.merge(C)
                pop  = suf(yr, 'pop_total')
                cvap = suf(yr, 'cvap_total')
                # {multiplier: {geoid: columns to scale by multiplier value in piece / multiplier value in geoid (aka: sum multiplier value over all pieces in geoid)}}
                # area must be first to scale pop & cvap BEFORE using them to scale others
                dct = {
                    'area': {suf(yr): [pop, cvap ]},
                    pop   : {suf(yr): [*A.columns[1:]]},
                    cvap  : {suf(yr): [*C.columns[1:]] , 'vtdkey': [*votes.filter(like=str(yr)).columns]},
                }
                for mlt, d in dct.items():
                    for geo, cols in d.items():
                        if not cols:
                            # nothing to scale, e.g. a year without vote columns
                            continue
                        self.merged[cols] *= self.merged.get_proportion(geo, mlt)
            return self.merged
        return self.get('merged', fcn, prereq=[self.get_pieces, self.get_tarrant, self.get_votes, self.get_acs, self.get_cvap])

    def get_dissolved(self, by='vtdkey'):
        """Return the merged pieces dissolved (geometry unioned, values summed) by ``by``.

        Every other district column is set to the district holding the largest
        share of the unit's population (using the highest-sorting 'pop_total' column).
        """
        # combine pieces based on "by"
        def fcn():
            M = self.get_merged()
            p = M.filter(like='pop_total').columns.max()
            df = M[[by,'geometry']].dissolve(by).get_area()
            for k in self.districts:
                if k in M and k != by:
                    # after sorting by population, the last row per unit is its dominant district
                    df = df.join(M.groupby([by,k])[p].sum().sort_values().reset_index().groupby(by).last()[k])
            df = df.join(M[M.columns.difference(df.columns)].groupby(by).sum())
            return df
        return self.get(f'dissolved_{by}', fcn, prereq=[self.get_merged])


# Configure the run: most recent year first, 2023 down to 2014.
years = list(range(2023, 2013, -1))
self = Redistricting(
    years=years,
    districts=['precinct', 'congress', 'senate', 'house', 'commish', 'jp', 'education'],
    acs_dict={'B01003_001E': 'pop_total'},
    offices=[
        'president', 'us sen', 'governor', 'lt governor', 'attorney gen',
        # 'comptroller','land comm','ag comm',
        # 'rr comm 1','rr comm 2','rr comm 3',
        # 'sup ct chief','sup ct 1','sup ct 2','sup ct 3','sup ct 4','sup ct 5','sup ct 6','sup ct 7','sup ct 8','sup ct 9',
        # 'cca pres judge','cca1','cca2','cca3','cca4','cca5','cca6','cca7','cca8','cca9',
    ],
    # datasets to rebuild on this run; uncomment a name to force its refresh
    overwrite={
        # 'elections'
        # 'vtds',
        # 'tarrant',

        'blkgrps',
        'pieces',
        'votes',
        'cvap',
        'acs',
        'merged',
        'dissolved_vtdkey',
        'dissolved_blkgrp',
    },
)

# self.get_blkgrps()
# self.get_acs()
# self.get_cvap()
# self.get_pieces()
self.get_dissolved('vtdkey')
# self.get_dissolved('blkgrp_2022')

In [0]:
# Mean of the 'area' column of the pieces (get_area computes area in EPSG:3085).
self.get_pieces()['area'].mean()