In [1]:
# parameters
# state = Connecticut
# state_ab is the lowercase state abbreviation; it is interpolated into every
# input path and output path built later in this notebook.
state_ab = "ct"

# Data
1. Download all the data into the directory "ct_data" (the code below builds the folder name as `state_ab + "_data/"`)
2. Extract them all

In [2]:
# Every input shapefile lives under "<state_ab>_data/". All templates are filled
# positionally from the same tuple; str.format ignores positional arguments a
# template does not consume, so the two-slot template can share it as well.
data_folder = state_ab + "_data/"
path_args = (data_folder, state_ab, state_ab)

# block-level 2020 population shapefile
population_data = "./{}{}_pl2020_b/{}_pl2020_b.shp".format(*path_args)

# VEST precinct results for 2020, 2018 and 2016
vest20_data = "./{}{}_vest_20/{}_vest_20.shp".format(*path_args)
vest18_data = "./{}{}_vest_18/{}_vest_18.shp".format(*path_args)
vest16_data = "./{}{}_vest_16/{}_vest_16.shp".format(*path_args)

# district layers: congressional plan, then the sldu / sldl layers
# (stored later as the CD, SEND and HDIST columns)
cd_data = "./{}{}_cong_adopted_2022/Districts_1 2022-02-14.shp".format(*path_args)
send_data = "./{}{}_sldu_2021/{}_sldu_2021.shp".format(*path_args)
hdist_data = "./{}{}_sldl_2021/{}_sldl_2021.shp".format(*path_args)

In [3]:
import pandas as pd
import geopandas as gpd
import maup
import time
from maup import smart_repair
from gerrychain import Graph
import os

# Enable maup's progress bars (they show up in the cell outputs further down).
maup.progress.enabled = True

In [4]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter hides all warning categories, including any
# raised by pandas / geopandas / maup — consider narrowing it to specific ones.
warnings.filterwarnings("ignore")

In [5]:
def do_smart_repair(df):
    """Run maup's ``smart_repair`` on ``df`` and return the result in EPSG:4269.

    The frame is reprojected to its estimated UTM CRS before the repair and
    validated with ``maup.doctor`` afterwards.

    Raises:
        Exception: 'maup.doctor failed' when the repaired frame still does not
            pass ``maup.doctor``.
    """
    # smart_repair is run in the estimated UTM projection of the data
    projected = df.to_crs(df.estimate_utm_crs())
    repaired = smart_repair(projected)

    # guard clause: stop when the repaired geometries still fail the check
    if maup.doctor(repaired) != True:
        raise Exception('maup.doctor failed')

    # hand the frame back in the CRS used throughout the notebook
    return repaired.to_crs('EPSG:4269')

In [6]:
def add_district(dist_df, dist_name, election_df, col_name):
    """Label every precinct with the identifier of the district it is assigned to.

    Args:
        dist_df: GeoDataFrame of district polygons.
        dist_name: name of the column created on the precinct frame (e.g. "CD").
        election_df: GeoDataFrame of precincts.
        col_name: column of ``dist_df`` whose value is copied onto each precinct.

    Returns:
        A reprojected (EPSG:4269) copy of ``election_df`` with the new
        ``dist_name`` column.

    Raises:
        KeyError: if ``maup.assign`` leaves any precinct without a district.
        Exception: propagated from ``do_smart_repair`` when the repair fails.
    """
    election_df = election_df.to_crs('EPSG:4269')
    dist_df = dist_df.to_crs('EPSG:4269')
    # repair the district geometries only when maup.doctor reports problems
    if maup.doctor(dist_df) != True:
        dist_df = do_smart_repair(dist_df)

    election_df = gpd.GeoDataFrame(election_df, crs="EPSG:4269")

    # assign the precincts: the result holds, per precinct, the index label of a row in dist_df
    precincts_to_district_assignment = maup.assign(election_df.geometry, dist_df.geometry)

    # Fail loudly (same exception type the old per-row lookup produced) instead of
    # writing precincts without a district.
    unassigned = precincts_to_district_assignment.isna()
    if unassigned.any():
        raise KeyError(
            "{} precinct(s) could not be assigned to a {} district".format(
                int(unassigned.sum()), dist_name
            )
        )

    # Translate dist_df index labels into the requested identifier column in one
    # vectorised step; unlike a positional .at loop this does not require
    # election_df to carry a 0..n-1 index.
    election_df[dist_name] = precincts_to_district_assignment.map(dist_df[col_name])

    return election_df

In [7]:
def rename(original, year):
    """Shorten a VEST vote column name to ``<office><year><party>``.

    Characters 3-5 of ``original`` are kept as the office code, ``year`` is
    appended, and character 6 is kept when it is 'R' or 'D' and replaced by
    'O' otherwise, e.g. ``rename('G20PREDBID', '20')`` gives ``'PRE20D'``.
    """
    office = original[3:6]
    party_letter = original[6]
    # every party other than R / D is folded into the "O" bucket
    suffix = party_letter if party_letter in ('R', 'D') else 'O'
    return office + year + suffix

In [8]:
def check_population(population, df, pop_cols=None):
    """Check that the column totals of ``df`` match those of ``population``.

    Args:
        population: DataFrame holding the reference population columns.
        df: DataFrame whose column sums are compared against ``population``.
        pop_cols: names of the columns to compare. When omitted, the
            notebook-level ``pop_col`` list is used, so existing two-argument
            calls keep working.

    Raises:
        Exception: "population doesn't agree" when at least one column total
            differs; the comparison table is printed before raising.
    """
    # Prefer the explicit argument; fall back to the global only for old callers.
    columns = list(pop_col if pop_cols is None else pop_cols)

    reference_totals = population[columns].sum()
    candidate_totals = df[columns].sum()
    pop_check = pd.DataFrame({
        'pop_col': columns,
        'population_df': reference_totals,
        'vest_base': candidate_totals,
        'equal': [x == y for x, y in zip(reference_totals, candidate_totals)]
    })

    if not pop_check['equal'].all():
        print(pop_check)
        raise Exception("population doesn't agree")

    print("population agrees")

In [9]:
def add_vest(vest, df, year, population, start_col):
    """Transfer one year of VEST vote totals onto the base precinct frame.

    Votes are first spread from the ``vest`` precincts down to the blocks of
    ``population`` in proportion to each block's share of its precinct's VAP,
    then summed back up to the precincts of ``df``.

    Args:
        vest: GeoDataFrame of precinct results; vote columns are taken from
            position ``start_col`` up to (not including) the last column.
        df: base precinct GeoDataFrame that receives the new vote columns.
        year: year string inserted into the renamed vote columns.
        population: block-level GeoDataFrame with "VAP" and "geometry" columns.
            NOTE(review): it is not reprojected here, so it is assumed to be in
            EPSG:4269 already — confirm.
        start_col: positional index of the first vote column in ``vest``.

    Returns:
        ``df`` in EPSG:4269 with the renamed vote columns added.

    Raises:
        Exception: from ``do_smart_repair`` when the repair fails, or from
            ``check_population`` when the population totals no longer agree.
    """
    df = df.to_crs('EPSG:4269')
    vest = vest.to_crs('EPSG:4269')
    # repair the precinct geometries only when maup.doctor reports problems
    if maup.doctor(vest) != True:
        vest = do_smart_repair(vest)

    # rename the vote columns; the slice stops before the last column
    # (the geometry column in the VEST files shown in this notebook)
    original_col = vest.columns[start_col:-1]
    new_col = [rename(i, year) for i in original_col]
    rename_dict = dict(zip(original_col, new_col))
    vest = vest.rename(columns=rename_dict)
    vest = vest.groupby(level=0, axis=1).sum() # combine all the other party's vote into columns with suffix "O"
    col_name = sorted(set(new_col))

    # make the blocks from precincts by weight
    vest = gpd.GeoDataFrame(vest, crs="EPSG:4269")
    # .copy(): new columns are written below, so work on an independent frame
    # instead of assigning into a slice of the caller's population frame
    election_in_block = population[["VAP", 'geometry']].copy() # population_df is in block scale
    blocks_to_precincts_assignment = maup.assign(election_in_block.geometry, vest.geometry)
    # weight = block VAP / total VAP of the precinct the block was assigned to
    precinct_vap = election_in_block["VAP"].groupby(blocks_to_precincts_assignment).sum()
    weights = election_in_block["VAP"] / blocks_to_precincts_assignment.map(precinct_vap)
    # undefined weights (e.g. a zero-VAP precinct gives 0/0) contribute nothing
    weights = weights.fillna(0)
    prorated = maup.prorate(blocks_to_precincts_assignment, vest[col_name], weights)
    election_in_block[col_name] = prorated

    # assign blocks to the base precincts and sum the block-level votes per precinct
    election_in_block = gpd.GeoDataFrame(election_in_block, crs="EPSG:4269")
    df = gpd.GeoDataFrame(df, crs="EPSG:4269")
    block_to_precinct_assignment = maup.assign(election_in_block.geometry, df.geometry)
    df[col_name] = election_in_block[col_name].groupby(block_to_precinct_assignment).sum()
    # NOTE(review): sums columns sharing a label; presumably a guard against
    # duplicated column names — confirm it is still needed.
    df = df.groupby(level=0, axis=1).sum()
    df = gpd.GeoDataFrame(df, crs = "EPSG:4269")
    # check if population agrees
    check_population(population, df)

    return df

## Read the census data

In [10]:
# Census field code -> readable column name used in the rest of the notebook.
rename_dict = {
    # P002* fields: total population
    'P0020001': 'TOTPOP',
    'P0020002': 'HISP',
    'P0020005': 'NH_WHITE',
    'P0020006': 'NH_BLACK',
    'P0020007': 'NH_AMIN',
    'P0020008': 'NH_ASIAN',
    'P0020009': 'NH_NHPI',
    'P0020010': 'NH_OTHER',
    'P0020011': 'NH_2MORE',
    # P004* fields: voting-age population
    'P0040001': 'VAP',
    'P0040002': 'HVAP',
    'P0040005': 'WVAP',
    'P0040006': 'BVAP',
    'P0040007': 'AMINVAP',
    'P0040008': 'ASIANVAP',
    'P0040009': 'NHPIVAP',
    'P0040010': 'OTHERVAP',
    'P0040011': '2MOREVAP',
}

In [11]:
# Load the block-level population shapefile.
population_df = gpd.read_file(population_data)

In [12]:
# Replace the census field codes with the readable names from rename_dict.
population_df = population_df.rename(columns=rename_dict)

In [13]:
# Each H_* column is the matching P001000x column minus the corresponding NH_* column.
# Plain column arithmetic gives the same values as the previous row-wise
# apply(lambda ...) but runs once per column instead of once per row.
hispanic_by_race = {
    'H_WHITE': ('P0010003', 'NH_WHITE'),
    'H_BLACK': ('P0010004', 'NH_BLACK'),
    'H_AMIN': ('P0010005', 'NH_AMIN'),
    'H_ASIAN': ('P0010006', 'NH_ASIAN'),
    'H_NHPI': ('P0010007', 'NH_NHPI'),
    'H_OTHER': ('P0010008', 'NH_OTHER'),
    'H_2MORE': ('P0010009', 'NH_2MORE'),
}
for hisp_col, (race_total_col, non_hisp_col) in hispanic_by_race.items():
    population_df[hisp_col] = population_df[race_total_col] - population_df[non_hisp_col]

# Read the vest 20 data
VEST 2020 is used here as the "base precinct" layer; VEST 2018 or VEST 2016 could be used instead if the 2020 file does not work.

In [14]:
def add_vest_base(vest, start_col, year):
    """Prepare the base precinct frame from a VEST file.

    The frame is reprojected to EPSG:4269, its vote columns (position
    ``start_col`` up to, but not including, the last column) are renamed with
    ``rename``, and columns that end up with the same name are summed.

    Returns:
        GeoDataFrame in EPSG:4269 with the renamed, combined vote columns.
    """
    reprojected = vest.to_crs('EPSG:4269')

    # old name -> short name for every vote column
    vote_columns = reprojected.columns[start_col:-1]
    short_names = {old: rename(old, year) for old in vote_columns}

    # renaming can produce repeated labels; summing per label merges them
    combined = reprojected.rename(columns=short_names).groupby(level=0, axis=1).sum()

    return gpd.GeoDataFrame(combined, crs="EPSG:4269")

In [15]:
# Load the VEST 2020 precinct shapefile.
vest20 = gpd.read_file(vest20_data)

In [16]:
# Repair the precinct geometries only when maup.doctor reports problems
# (do_smart_repair also returns the frame in EPSG:4269).
if maup.doctor(vest20) != True:
    vest20 = do_smart_repair(vest20)

100%|███████████████████████████████████████| 741/741 [00:00<00:00, 1247.17it/s]


There are 9 overlaps.
Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|███████████████████████████████████████| 805/805 [00:00<00:00, 4210.08it/s]

Resolving overlaps...





Assigning order 2 pieces...
Assigning order 3 pieces...
Filling gaps...


Gaps to simplify: 100%|███████████████████████████| 5/5 [00:00<00:00, 31.48it/s]
Gaps to fill: 100%|███████████████████████████████| 2/2 [00:00<00:00, 38.08it/s]
100%|███████████████████████████████████████| 741/741 [00:00<00:00, 1260.70it/s]


In [17]:
# Inspect the columns to find the position of the first vote column (used as start_col).
vest20.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G20PREDBID', 'G20PRERTRU',
       'G20PRELJOR', 'G20PREGHAW', 'G20PREOWRI', 'geometry'],
      dtype='object')

In [18]:
# Position of the first vote column in the VEST frames
# (index 3 is 'G20PREDBID' in the vest20 column listing).
start_col = 3
# VEST frame used as the base precinct layer, and the year string put into its renamed columns.
vest_base_data = vest20
year = '20'

In [19]:
# Build the base precinct frame: EPSG:4269, vote columns renamed and combined.
vest_base = add_vest_base(vest_base_data, start_col, year)

In [20]:
# Assign every census block in population_df to a vest20 precinct.
# NOTE(review): this uses vest20.geometry rather than vest_base.geometry; if
# vest20 was not repaired it keeps the CRS it was read with, so it must match
# population_df's CRS — confirm.
blocks_to_precincts_assignment = maup.assign(population_df.geometry, vest20.geometry)

100%|████████████████████████████████████████| 741/741 [00:01<00:00, 695.98it/s]
100%|████████████████████████████████████████| 741/741 [00:03<00:00, 232.76it/s]


In [21]:
# Population columns aggregated from blocks to precincts and verified by check_population.
pop_col = [
    # total population
    'TOTPOP', 'HISP',
    'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE',
    'H_WHITE', 'H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE',
    # voting-age population
    'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP', '2MOREVAP',
]

In [22]:
# Sum the block-level population columns per assigned precinct and attach the totals to vest_base.
vest_base[pop_col] = population_df[pop_col].groupby(blocks_to_precincts_assignment).sum()

In [23]:
# election_df is the working precinct frame that the following cells extend.
election_df = gpd.GeoDataFrame(vest_base, crs="EPSG:4269")

### Check if the population agrees

In [24]:
# Raises if any pop_col total in vest_base differs from the block-level total.
check_population(population_df, vest_base)

population agrees


# Add more vest data

In [25]:
# Load the VEST 2018 precinct shapefile and list its columns.
vest18 = gpd.read_file(vest18_data)
vest18.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G18USSDMUR', 'G18USSRCOR',
       'G18USSLLIO', 'G18USSGRUS', 'G18USSOWRI', 'G18GOVDLAM', 'G18GOVRSTE',
       'G18GOVLHAN', 'G18GOVOGRI', 'G18GOVOGRE', 'G18GOVOWRI', 'G18ATGDTON',
       'G18ATGRHAT', 'G18ATGGGOS', 'G18SOSDMER', 'G18SOSRCHA', 'G18SOSLGWY',
       'G18SOSGDER', 'G18TREDWOO', 'G18TRERGRA', 'G18TRELBRO', 'G18TREOWRI',
       'G18COMDLEM', 'G18COMRMIL', 'G18COMLPAS', 'G18COMGHEF', 'geometry'],
      dtype='object')

In [26]:
# Add the 2018 results to the base precincts (reuses start_col from the 2020 setup).
election_df = add_vest(vest18, election_df, '18', population_df, start_col)

100%|███████████████████████████████████████| 742/742 [00:00<00:00, 1250.38it/s]


There are 9 overlaps.
Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 4182.86it/s]

Resolving overlaps...





Assigning order 2 pieces...
Assigning order 3 pieces...
Filling gaps...


Gaps to simplify: 100%|███████████████████████████| 5/5 [00:00<00:00, 30.31it/s]
Gaps to fill: 100%|███████████████████████████████| 2/2 [00:00<00:00, 37.00it/s]
100%|███████████████████████████████████████| 742/742 [00:00<00:00, 1247.70it/s]
100%|████████████████████████████████████████| 742/742 [00:01<00:00, 691.59it/s]
100%|████████████████████████████████████████| 742/742 [00:03<00:00, 230.89it/s]
100%|████████████████████████████████████████| 741/741 [00:01<00:00, 709.37it/s]
100%|████████████████████████████████████████| 741/741 [00:03<00:00, 230.43it/s]


population agrees


In [27]:
# Load the VEST 2016 precinct shapefile and list its columns.
vest16 = gpd.read_file(vest16_data)
vest16.columns

Index(['STATEFP20', 'COUNTYFP20', 'NAME20', 'G16PREDCLI', 'G16PRERTRU',
       'G16PRELJOH', 'G16PREGSTE', 'G16PREOWRI', 'G16USSDBLU', 'G16USSRCAR',
       'G16USSLLIO', 'G16USSGRUS', 'G16USSOWRI', 'geometry'],
      dtype='object')

In [28]:
# Add the 2016 results to the base precincts (reuses start_col from the 2020 setup).
election_df = add_vest(vest16, election_df, '16', population_df, start_col)

100%|███████████████████████████████████████| 743/743 [00:00<00:00, 1147.91it/s]


There are 9 overlaps.
Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|███████████████████████████████████████| 807/807 [00:00<00:00, 4233.69it/s]

Resolving overlaps...





Assigning order 2 pieces...
Assigning order 3 pieces...
Filling gaps...


Gaps to simplify: 100%|███████████████████████████| 5/5 [00:00<00:00, 24.46it/s]
Gaps to fill: 100%|███████████████████████████████| 2/2 [00:00<00:00, 36.87it/s]
100%|███████████████████████████████████████| 743/743 [00:00<00:00, 1256.99it/s]
100%|████████████████████████████████████████| 743/743 [00:01<00:00, 668.16it/s]
100%|████████████████████████████████████████| 743/743 [00:03<00:00, 232.99it/s]
100%|████████████████████████████████████████| 741/741 [00:01<00:00, 714.97it/s]
100%|████████████████████████████████████████| 741/741 [00:03<00:00, 235.05it/s]


population agrees


## Add the district data

In [29]:
# Load the three district layers and reproject each to EPSG:4269.
cong_df = gpd.read_file(cd_data).to_crs('EPSG:4269')
send = gpd.read_file(send_data).to_crs('EPSG:4269')
hdist = gpd.read_file(hdist_data).to_crs('EPSG:4269')

In [30]:
# Preview the congressional layer to see which columns identify a district.
cong_df.head()

Unnamed: 0,ID,AREA,DISTRICT,MEMBERS,LOCKED,NAME,POPULATION,NH_WHT,DEVIATION,F_DEVIATIO,IDEAL_VALU,ALTERNATE_,POLSBY_POP,ALTERNATE1,POLSBY_PO1,PERIMETER,REOCK,F_NH_WHT,DISTRICT_L,geometry
0,1,526.080688,4,1.0,,,721189,410531,0.0,0.0,721189.0,1.73468,0.332324,1.73468,0.332324,141.042622,0.321915,0.569242,4|0%,"POLYGON ((-73.54362 41.37509, -73.54347 41.376..."
1,2,501.35791,3,1.0,,,721189,437845,0.0,0.0,721189.0,2.044058,0.239339,2.044058,0.239339,162.245252,0.363883,0.607115,3|0%,"POLYGON ((-72.63558 41.31288, -72.63540 41.312..."
2,3,1278.44628,5,1.0,,,721189,456795,0.0,0.0,721189.0,2.084848,0.230065,2.084848,0.230065,264.253409,0.510607,0.633392,5|0%,"POLYGON ((-72.99955 41.52593, -72.99961 41.525..."
3,5,675.17157,1,1.0,,,721189,411361,0.0,0.0,721189.0,2.435537,0.168582,2.435537,0.168582,224.339972,0.432572,0.570393,1|0%,"POLYGON ((-73.12723 42.04213, -73.11993 42.041..."
4,6,2133.33715,2,1.0,,,721188,562700,-1.0,-1e-06,721189.0,1.550315,0.416064,1.550315,0.416064,253.836811,0.572632,0.78024,2|-0%,"MULTIPOLYGON (((-72.51333 41.94540, -72.51357 ..."


In [31]:
# Preview the sldu layer to see which columns identify a district.
send.head()

Unnamed: 0,ID,DISTRICT,DISTRICTN,geometry
0,1,1,1,"POLYGON ((-72.67674 41.67505, -72.67669 41.675..."
1,2,2,2,"POLYGON ((-72.71538 41.76049, -72.71540 41.760..."
2,3,3,3,"POLYGON ((-72.66507 41.76651, -72.66495 41.768..."
3,4,4,4,"POLYGON ((-72.64827 41.72317, -72.64782 41.723..."
4,5,5,5,"POLYGON ((-72.81544 41.68782, -72.81770 41.688..."


In [32]:
# Preview the sldl layer to see which columns identify a district.
hdist.head()

Unnamed: 0,ID,DISTRICT,DISTRICTN,geometry
0,1,1,1,"POLYGON ((-72.71568 41.76251, -72.71570 41.762..."
1,2,2,2,"POLYGON ((-73.47010 41.32642, -73.47036 41.326..."
2,3,3,3,"POLYGON ((-72.71513 41.74221, -72.71514 41.742..."
3,4,4,4,"POLYGON ((-72.67543 41.73794, -72.67559 41.738..."
4,5,5,5,"POLYGON ((-72.66504 41.76953, -72.66568 41.769..."


In [33]:
# Add the congressional district of each precinct as column "CD".
# NOTE(review): col_name is "ID", but the cong_df.head() output shows ID and
# DISTRICT holding different values (e.g. ID 1 has DISTRICT 4) — confirm that
# ID, not DISTRICT, is the intended label.
election_df = add_district(cong_df, "CD", election_df, "ID")

100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 214.09it/s]
100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 303.81it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 38.65it/s]


In [34]:
# Add the sldu district of each precinct as column "SEND" (taken from DISTRICTN).
election_df = add_district(send, "SEND", election_df, "DISTRICTN")

100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 353.66it/s]


There are 3 overlaps.
Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|█████████████████████████████████████████| 39/39 [00:00<00:00, 2463.19it/s]


Resolving overlaps...
Assigning order 2 pieces...
Filling gaps...


Gaps to simplify: 0it [00:00, ?it/s]
Gaps to fill: 0it [00:00, ?it/s]
100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 385.89it/s]
100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 176.92it/s]
100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 415.29it/s]


In [35]:
# Add the sldl district of each precinct as column "HDIST" (taken from DISTRICTN).
election_df = add_district(hdist, "HDIST", election_df, "DISTRICTN")

100%|████████████████████████████████████████| 151/151 [00:00<00:00, 718.04it/s]


There are 8 overlaps.
There are 143 holes.
Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|███████████████████████████████████████| 156/156 [00:00<00:00, 3618.66it/s]


Resolving overlaps...
Assigning order 2 pieces...
Filling gaps...


Gaps to simplify: 0it [00:00, ?it/s]
Gaps to fill: 0it [00:00, ?it/s]
100%|████████████████████████████████████████| 151/151 [00:00<00:00, 727.00it/s]
100%|████████████████████████████████████████| 151/151 [00:00<00:00, 688.01it/s]
100%|████████████████████████████████████████| 151/151 [00:00<00:00, 497.89it/s]


In [36]:
# List the columns before reordering them.
election_df.columns

Index(['2MOREVAP', 'AMINVAP', 'ASIANVAP', 'ATG18D', 'ATG18O', 'ATG18R', 'BVAP',
       'COM18D', 'COM18O', 'COM18R', 'COUNTYFP20', 'GOV18D', 'GOV18O',
       'GOV18R', 'HISP', 'HVAP', 'H_2MORE', 'H_AMIN', 'H_ASIAN', 'H_BLACK',
       'H_NHPI', 'H_OTHER', 'H_WHITE', 'NAME20', 'NHPIVAP', 'NH_2MORE',
       'NH_AMIN', 'NH_ASIAN', 'NH_BLACK', 'NH_NHPI', 'NH_OTHER', 'NH_WHITE',
       'OTHERVAP', 'PRE16D', 'PRE16O', 'PRE16R', 'PRE20D', 'PRE20O', 'PRE20R',
       'SOS18D', 'SOS18O', 'SOS18R', 'STATEFP20', 'TOTPOP', 'TRE18D', 'TRE18O',
       'TRE18R', 'USS16D', 'USS16O', 'USS16R', 'USS18D', 'USS18O', 'USS18R',
       'VAP', 'WVAP', 'geometry', 'CD', 'SEND', 'HDIST'],
      dtype='object')

In [37]:
# reorder the columns: the fixed columns come first in the order listed here,
# every remaining column (vote totals, geometry) follows in its current order

fixed_columns = [
    # precinct identifiers and district assignments
    'STATEFP20', 'COUNTYFP20', 'NAME20', 'CD', 'SEND', 'HDIST',
    # total population
    'TOTPOP',
    'NH_2MORE', 'NH_AMIN', 'NH_ASIAN', 'NH_BLACK', 'NH_NHPI', 'NH_OTHER', 'NH_WHITE',
    'HISP',
    'H_AMIN', 'H_ASIAN', 'H_BLACK', 'H_NHPI', 'H_OTHER', 'H_WHITE', 'H_2MORE',
    # voting-age population
    'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP', '2MOREVAP',
]

fixed_lookup = set(fixed_columns)
election_columns = [col for col in election_df.columns if col not in fixed_lookup]
final_col = fixed_columns + election_columns
election_df = election_df[final_col]

In [38]:
# Show the final column order.
list(election_df.columns)

['STATEFP20',
 'COUNTYFP20',
 'NAME20',
 'CD',
 'SEND',
 'HDIST',
 'TOTPOP',
 'NH_2MORE',
 'NH_AMIN',
 'NH_ASIAN',
 'NH_BLACK',
 'NH_NHPI',
 'NH_OTHER',
 'NH_WHITE',
 'HISP',
 'H_AMIN',
 'H_ASIAN',
 'H_BLACK',
 'H_NHPI',
 'H_OTHER',
 'H_WHITE',
 'H_2MORE',
 'VAP',
 'HVAP',
 'WVAP',
 'BVAP',
 'AMINVAP',
 'ASIANVAP',
 'NHPIVAP',
 'OTHERVAP',
 '2MOREVAP',
 'ATG18D',
 'ATG18O',
 'ATG18R',
 'COM18D',
 'COM18O',
 'COM18R',
 'GOV18D',
 'GOV18O',
 'GOV18R',
 'PRE16D',
 'PRE16O',
 'PRE16R',
 'PRE20D',
 'PRE20O',
 'PRE20R',
 'SOS18D',
 'SOS18O',
 'SOS18R',
 'TRE18D',
 'TRE18O',
 'TRE18R',
 'USS16D',
 'USS16O',
 'USS16R',
 'USS18D',
 'USS18O',
 'USS18R',
 'geometry']

In [39]:
# Store the result in a directory named after state_ab (here "ct").
# exist_ok=True lets this cell be re-run without failing on the directory it
# created the first time.
out_dir = "./{}".format(state_ab)
os.makedirs(out_dir, exist_ok=True)
shp_path = "{}/{}.shp".format(out_dir, state_ab)
election_df.to_file(shp_path)
election_df.to_file('{}/{}.geojson'.format(out_dir, state_ab), driver='GeoJSON')

# Only do once to build json and read from file when generating ensembles
graph = Graph.from_file(shp_path, ignore_errors=True)
graph.to_json("{}/{}.json".format(out_dir, state_ab))