# All the parameters that need to be changed

In [2]:
# Delaware
# Lower-case state abbreviation; it is interpolated into every input path
# below and into the output directory and file names at the end of the notebook.
state_ab = "de"

## Data
1. Download all the data into the directory "<state_ab>_data" (here "de_data")
2. Extract them all

In [4]:
# Every input lives under "<state_ab>_data/"; all paths are relative to the
# notebook's working directory.
data_folder = state_ab + "_data/"
# 2020 PL 94-171 block shapefiles: tables P1 and P2 (population) and P4 (voting-age population).
population1_data = "./{}{}_pl2020_b/{}_pl2020_p1_b.shp".format(data_folder, state_ab, state_ab)
population2_data = "./{}{}_pl2020_b/{}_pl2020_p2_b.shp".format(data_folder, state_ab, state_ab)
vap_data =  "./{}{}_pl2020_b/{}_pl2020_p4_b.shp".format(data_folder, state_ab, state_ab)
# VEST precinct-level election shapefiles for 2020, 2018 and 2016.
vest20_data = "./{}{}_vest_20/{}_vest_20.shp".format(data_folder, state_ab, state_ab)
vest18_data = "./{}{}_vest_18/{}_vest_18.shp".format(data_folder, state_ab, state_ab)
vest16_data = "./{}{}_vest_16/{}_vest_16.shp".format(data_folder, state_ab, state_ab)
# Adopted 2022 state senate (sldu) and state house (sldl) district shapefiles.
# The file names are state-specific and must be edited by hand for another state.
send_data = "./{}{}_sldu_adopted_2022/DESenate22.shp".format(data_folder, state_ab)
hdist_data = "./{}{}_sldl_adopted_2022/March 2022 Redistricting Clean Up File.shp".format(data_folder, state_ab)

## Parameters that need to be manually checked

### base vest data
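These are set in the "Parameters that need to be checked" cell further down, after the VEST files have been inspected. The values used in this notebook are:

start_col = 1\
vest_base_data = vest16\
year = '16'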

### district data
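Check the name of the district column in each district file (`send`, `hdist`, and `cong_df` if the state has one); it is passed as `col_name` when calling `add_district()`.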

# Program starts

In [7]:
import pandas as pd
import geopandas as gpd
import maup
import time
from maup import smart_repair
from gerrychain import Graph
import os

# Turn on maup's progress reporting (presumably the source of the progress
# bars captured in the cell outputs below).
maup.progress.enabled = True

In [8]:
import warnings
# NOTE(review): this silences every warning from every library for the rest of
# the session, including deprecation and dtype warnings raised by the
# pandas/geopandas calls below. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [9]:
def do_smart_repair(df):
    """Run ``smart_repair`` on ``df`` and return the result in EPSG:4269.

    The frame is projected to its estimated UTM CRS before the repair,
    projected back to EPSG:4269 afterwards, and then re-checked with
    ``maup.doctor``.

    Raises:
        Exception: if ``maup.doctor`` does not return True for the repaired frame.
    """
    # smart_repair is run in the UTM projection estimated for this frame
    utm_crs = df.estimate_utm_crs()
    repaired = smart_repair(df.to_crs(utm_crs))

    # back to the CRS shared by every layer in this notebook
    repaired = repaired.to_crs('EPSG:4269')

    # confirm the repair actually worked before handing the frame back
    if maup.doctor(repaired) == True:
        return repaired
    raise Exception('maup.doctor failed')

In [10]:
def add_district(dist_df, dist_name, election_df, col_name):
    """Label every precinct in ``election_df`` with the district it is assigned to.

    Args:
        dist_df: GeoDataFrame of district polygons.
        dist_name: name of the column to create in ``election_df`` (e.g. "SEND").
        election_df: precinct-level frame with a geometry column.
        col_name: column of ``dist_df`` whose value is used as the district label.

    Returns:
        ``election_df`` as a GeoDataFrame in EPSG:4269 with the new
        ``dist_name`` column. A precinct that ``maup.assign`` leaves
        unassigned gets a missing value instead of raising.

    Raises:
        Exception: propagated from ``do_smart_repair`` when the district
            geometries still fail ``maup.doctor`` after repair.
    """
    # Consistent crs
    dist_df = dist_df.to_crs('EPSG:4269')

    # repair the district geometries only when maup.doctor reports a problem
    if maup.doctor(dist_df) != True:
        dist_df = do_smart_repair(dist_df)

    election_df = gpd.GeoDataFrame(election_df, crs="EPSG:4269")
    dist_df = gpd.GeoDataFrame(dist_df, crs="EPSG:4269")

    # assign the precincts: the result holds, per precinct, the index label of a row in dist_df
    precincts_to_district_assignment = maup.assign(election_df.geometry, dist_df.geometry)

    # Translate the dist_df row labels into the requested label column in one
    # vectorised lookup. Unlike a positional `.at[...]` loop, this does not
    # require election_df to have a 0..n-1 index, tolerates unassigned (NaN)
    # precincts, and does not write label values into an integer column.
    election_df[dist_name] = precincts_to_district_assignment.map(dist_df[col_name])

    return election_df

In [11]:
def rename(original, year):
    """Build the short column name for one VEST election column.

    The result is characters 3-5 of ``original``, followed by ``year``,
    followed by the party letter at index 6 when that letter is 'R' or 'D'
    and by 'O' for anything else.

    Example: ``rename('G16PREDCLI', '16')`` returns ``'PRE16D'``.
    """
    office = original[3:6]
    party_letter = original[6]
    suffix = party_letter if party_letter in ('R', 'D') else 'O'
    return office + year + suffix

In [12]:
def check_population(population, df, pop_columns=None):
    """Verify that ``population`` and ``df`` have identical totals per population column.

    Args:
        population: block-level frame holding the population columns.
        df: aggregated (precinct-level) frame holding the same columns.
        pop_columns: column names to compare. When omitted, the notebook-level
            ``pop_col`` list is used, which keeps existing two-argument calls
            working but requires ``pop_col`` to be defined before the call.

    Raises:
        Exception: "population doesn't agree" when any column total differs;
            the per-column comparison table is printed first.
    """
    if pop_columns is None:
        pop_columns = pop_col
    columns = list(pop_columns)

    block_totals = population[columns].sum()
    aggregated_totals = df[columns].sum()

    # Build the table positionally so every row is guaranteed to line up with
    # its column name, independent of how pandas aligns mixed list/Series input.
    pop_check = pd.DataFrame({
        'pop_col': columns,
        'population_df': block_totals.to_numpy(),
        'vest_base': aggregated_totals.to_numpy(),
    })
    pop_check['equal'] = pop_check['population_df'] == pop_check['vest_base']

    if not pop_check['equal'].all():
        print(pop_check)
        raise Exception("population doesn't agree")

    print("population agrees")

In [13]:
def add_vest(vest, df, year, population, start_col):    
    """Add one year of VEST election results to the precinct frame ``df``.

    The election columns of ``vest`` are renamed with ``rename``, spread over
    the census blocks in ``population`` in proportion to each block's share of
    its precinct's "VAP", and summed back onto the precincts of ``df``.

    Args:
        vest: VEST precinct GeoDataFrame for one election year.
        df: precinct frame to extend.
        year: two-character year string passed to ``rename``.
        population: block-level GeoDataFrame; must contain "VAP" and "geometry".
        start_col: position of the first election column in ``vest``; columns
            from there up to, but excluding, the last column are treated as
            election results (assumes the last column is the geometry — confirm).

    Returns:
        ``df`` with the new election columns, after the final column-wise
        ``groupby(level=0, axis=1).sum()``.

    Raises:
        Exception: from ``do_smart_repair`` or ``check_population``.
    """
    # repair the VEST geometries only when maup.doctor reports a problem
    if maup.doctor(vest) != True:
        vest = do_smart_repair(vest)

    # Consistent crs
    vest = vest.to_crs('EPSG:4269')
    population = population.to_crs('EPSG:4269')
    
    # rename the election columns to the short <office><year><party> form
    original_col = vest.columns[start_col:-1]
    new_col = [rename(i, year) for i in original_col]
    rename_dict = dict(zip(original_col, new_col))
    vest = vest.rename(columns=rename_dict)
    vest = vest.groupby(level=0, axis=1).sum() # sum columns that now share a name, e.g. all other-party votes under the "O" suffix
    # unique renamed column names, sorted for a deterministic order
    col_name = list(set(new_col))
    col_name.sort()
    
    # disaggregate precinct votes to blocks, weighted by VAP
    vest = gpd.GeoDataFrame(vest, crs="EPSG:4269")
    election_in_block = population[["VAP", 'geometry']] # population is at block scale
    blocks_to_precincts_assignment = maup.assign(election_in_block.geometry, vest.geometry)
    # weight = block VAP divided by the total VAP of the VEST precinct the block is assigned to
    weights = election_in_block["VAP"] / blocks_to_precincts_assignment.map(election_in_block["VAP"].groupby(blocks_to_precincts_assignment).sum())
    # missing weights (e.g. 0/0 in a precinct with no VAP) become 0
    weights = weights.fillna(0)
    prorated = maup.prorate(blocks_to_precincts_assignment, vest[col_name], weights)
    election_in_block[col_name] = prorated
    
    # aggregate the block-level votes up to the precincts of df
    election_in_block = gpd.GeoDataFrame(election_in_block, crs="EPSG:4269")
    df = gpd.GeoDataFrame(df, crs="EPSG:4269")
    block_to_pricinct_assginment = maup.assign(election_in_block.geometry, df.geometry)
    df[col_name] = election_in_block[col_name].groupby(block_to_pricinct_assginment).sum()
    # sum any columns that share a name
    df = df.groupby(level=0, axis=1).sum()
    
    # the population totals in df must still match the block totals
    check_population(population, df)
    
    return df

## Read the census data

In [15]:
# Load the three block-level census shapefiles (P1, P2, P4) in one pass.
population1_df, population2_df, vap_df = (
    gpd.read_file(census_path)
    for census_path in (population1_data, population2_data, vap_data)
)

In [16]:
# P2 and P4 repeat these identifier/geometry columns from P1; drop them so the
# merges on GEOID20 do not produce duplicated columns.
population2_df, vap_df = (
    census_table.drop(columns=['SUMLEV', 'LOGRECNO', 'GEOID', 'COUNTY', 'geometry'])
    for census_table in (population2_df, vap_df)
)

In [17]:
# Join the P1, P2 and P4 block tables on the block identifier.
# validate='one_to_one' makes pandas raise if GEOID20 is duplicated in any
# table, which would otherwise silently multiply rows and inflate every total.
population_df = pd.merge(population1_df, population2_df, on='GEOID20', validate='one_to_one')
population_df = pd.merge(population_df, vap_df, on='GEOID20', validate='one_to_one')

# The default inner join drops blocks missing from any table; fail loudly instead.
assert len(population_df) == len(population1_df), \
    "census tables do not contain the same set of GEOID20 blocks"

In [18]:
# Sanity check: the merged frame still carries the block geometries.
print(population_df['geometry'])

0        POLYGON ((-75.65542 39.54930, -75.65459 39.549...
1        POLYGON ((-75.59892 39.77281, -75.59887 39.773...
2        POLYGON ((-75.75740 39.43498, -75.75734 39.435...
3        POLYGON ((-75.78475 39.68048, -75.78470 39.680...
4        POLYGON ((-75.47102 39.81160, -75.47086 39.811...
                               ...                        
20193    POLYGON ((-75.60796 38.64493, -75.60686 38.644...
20194    POLYGON ((-75.62227 38.56966, -75.62193 38.570...
20195    POLYGON ((-75.69660 38.58534, -75.69651 38.585...
20196    POLYGON ((-75.15564 38.54452, -75.15544 38.544...
20197    POLYGON ((-75.10601 38.62154, -75.10592 38.621...
Name: geometry, Length: 20198, dtype: geometry


In [19]:
# Census field code -> readable column name used in the rest of the notebook.
rename_dict = {
    # P2 fields: total population, Hispanic, and the non-Hispanic (NH_) groups
    'P0020001': 'TOTPOP',
    'P0020002': 'HISP',
    'P0020005': 'NH_WHITE',
    'P0020006': 'NH_BLACK',
    'P0020007': 'NH_AMIN',
    'P0020008': 'NH_ASIAN',
    'P0020009': 'NH_NHPI',
    'P0020010': 'NH_OTHER',
    'P0020011': 'NH_2MORE',
    # P4 fields: the voting-age (VAP) counterparts
    'P0040001': 'VAP',
    'P0040002': 'HVAP',
    'P0040005': 'WVAP',
    'P0040006': 'BVAP',
    'P0040007': 'AMINVAP',
    'P0040008': 'ASIANVAP',
    'P0040009': 'NHPIVAP',
    'P0040010': 'OTHERVAP',
    'P0040011': '2MOREVAP',
}

In [20]:
# Replace the census field codes with the readable names from rename_dict.
# This mutates population_df in place.
population_df.rename(columns=rename_dict, inplace = True)

In [21]:
# Hispanic count per race group = P1 count for the group minus the
# non-Hispanic (NH_) count for the same group.
# Plain column arithmetic gives the same values as a row-wise
# `apply(..., axis=1)` but runs once per column instead of once per block.
population_df['H_WHITE'] = population_df['P0010003'] - population_df['NH_WHITE']
population_df['H_BLACK'] = population_df['P0010004'] - population_df['NH_BLACK']
population_df['H_AMIN'] = population_df['P0010005'] - population_df['NH_AMIN']
population_df['H_ASIAN'] = population_df['P0010006'] - population_df['NH_ASIAN']
population_df['H_NHPI'] = population_df['P0010007'] - population_df['NH_NHPI']
population_df['H_OTHER'] = population_df['P0010008'] - population_df['NH_OTHER']
population_df['H_2MORE'] = population_df['P0010009'] - population_df['NH_2MORE']

# Read the base vest data
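The base VEST file supplies the "base precincts" onto which everything else is aggregated. VEST 2020 is the usual choice, but VEST 2018 or VEST 2016 can be used instead if VEST 2020 does not work.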

In [23]:
def add_vest_base(vest, start_col, year):
    """Prepare the base VEST frame: reproject it and shorten its election column names.

    Columns from position ``start_col`` up to, but excluding, the last column
    are renamed with ``rename(column, year)``; columns that end up sharing a
    name are summed together.

    Returns:
        The prepared frame as a GeoDataFrame in EPSG:4269.
    """
    base = vest.to_crs('EPSG:4269')

    # old election column name -> short name
    election_cols = base.columns[start_col:-1]
    short_names = {old_name: rename(old_name, year) for old_name in election_cols}
    base = base.rename(columns=short_names)

    # merge columns that now share a name
    base = base.groupby(level=0, axis=1).sum()

    return gpd.GeoDataFrame(base, crs="EPSG:4269")

# Check if vest 16 can be used as base

In [25]:
# Load the 2016 VEST precinct shapefile.
vest16 = gpd.read_file(vest16_data)

In [26]:
# List the columns of the merged census frame (pandas abbreviates the middle of the list).
print(population_df.columns)

Index(['GEOID20', 'SUMLEV', 'LOGRECNO', 'GEOID', 'COUNTY', 'P0010001',
       'P0010002', 'P0010003', 'P0010004', 'P0010005',
       ...
       'P0040071', 'P0040072', 'P0040073', 'H_WHITE', 'H_BLACK', 'H_AMIN',
       'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE'],
      dtype='object', length=230)


In [27]:
# Keep vest16 untouched when maup.doctor passes; otherwise replace it with the repaired frame.
vest16 = vest16 if maup.doctor(vest16) == True else do_smart_repair(vest16)

100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1545.75it/s]


### If it is true for maup doctor, we will use it as the base vest data.
Check the position at which the election columns start; it should be the same for all VEST files of a given state.

In [29]:
# Inspect the columns to find the position of the first election column (used as start_col).
vest16.columns

Index(['PRECINCT', 'G16PREDCLI', 'G16PRERTRU', 'G16PREGSTE', 'G16PRELJOH',
       'G16HALDROC', 'G16HALRREI', 'G16HALGPER', 'G16HALLGES', 'G16GOVDCAR',
       'G16GOVRBON', 'G16GOVGGRO', 'G16GOVLGOW', 'G16LTGDHAL', 'G16LTGRGUN',
       'G16INSDNAV', 'G16INSRCRA', 'geometry'],
      dtype='object')

## Parameters that need to be checked

In [31]:
# Position of the first election column in the VEST frame: in vest16.columns
# above, 'PRECINCT' is at 0 and 'G16PREDCLI' at 1.
start_col = 1
# VEST frame whose precincts serve as the base geography.
vest_base_data = vest16
# Two-character year inserted into the renamed election columns and, later,
# appended to the precinct identifier column name.
year = '16'

In [32]:
# Reproject the base VEST frame to EPSG:4269 and shorten its election column names.
vest_base = add_vest_base(vest_base_data, start_col, year)

In [33]:
# vap and population have the same GEOID20
# Assign every census block to a base precinct; the result holds, per block,
# the index label of the precinct in vest_base.
blocks_to_precincts_assignment = maup.assign(population_df.geometry, vest_base.geometry)

100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1293.72it/s]
100%|████████████████████████████████████████| 430/430 [00:01<00:00, 330.51it/s]


In [34]:
# Number of blocks left without a precinct, then the total number of blocks.
print(
    blocks_to_precincts_assignment.isna().sum(),
    len(blocks_to_precincts_assignment),
    sep="\n",
)

0
20198


In [35]:
# Re-check the 2016 VEST geometries; True is the passing value tested elsewhere in this notebook.
maup.doctor(vest16)

100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1589.11it/s]


True

In [36]:
# Load the 2018 VEST precinct shapefile.
vest18 = gpd.read_file(vest18_data)

In [37]:
# Check the 2018 VEST geometries with maup.doctor.
maup.doctor(vest18)

100%|███████████████████████████████████████| 434/434 [00:00<00:00, 1599.03it/s]


True

In [38]:
# Load the 2020 VEST precinct shapefile.
vest20 = gpd.read_file(vest20_data)

In [39]:
# Check the 2020 VEST geometries with maup.doctor.
maup.doctor(vest20)

100%|███████████████████████████████████████| 434/434 [00:00<00:00, 1463.06it/s]


True

In [40]:
# Check the census block geometries with maup.doctor.
maup.doctor(population_df)

100%|███████████████████████████████████| 20198/20198 [00:06<00:00, 3210.80it/s]


True

In [41]:
# Population columns aggregated from blocks to precincts and verified by check_population.
pop_col = [
    # total and Hispanic population
    'TOTPOP', 'HISP',
    # non-Hispanic groups
    'NH_WHITE', 'NH_BLACK', 'NH_AMIN', 'NH_ASIAN', 'NH_NHPI', 'NH_OTHER', 'NH_2MORE',
    # Hispanic groups
    'H_WHITE', 'H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE',
    # voting-age population
    'VAP', 'HVAP', 'WVAP', 'BVAP', 'AMINVAP', 'ASIANVAP', 'NHPIVAP', 'OTHERVAP', '2MOREVAP',
]

In [42]:
# Sum the block-level population columns within each assigned precinct and
# store the totals on the precinct frame (aligned on the precinct index).
vest_base[pop_col] = population_df[pop_col].groupby(blocks_to_precincts_assignment).sum()

In [43]:
# Display the block -> precinct assignment (block index on the left, precinct index on the right).
blocks_to_precincts_assignment

0        204
1        249
2        203
3         79
4        165
        ... 
20193    405
20194    363
20195    408
20196    368
20197    394
Length: 20198, dtype: int64

In [44]:
# election_df is the precinct frame that the later cells extend with more
# election years and district labels.
election_df = gpd.GeoDataFrame(vest_base, crs="EPSG:4269")

### Check if the population agrees

In [46]:
# The per-column totals of the block table and of the precinct table must be
# identical; check_population raises otherwise.
check_population(population_df, vest_base)

population agrees


# Add more vest data

In [48]:
# check the result here
# Prorate the 2020 VEST results onto the base precincts. This reuses start_col
# from the base file, so vest20 must have its first election column at the
# same position — verify against vest20.columns.
election_df = add_vest(vest20, election_df, '20', population_df, start_col)

100%|███████████████████████████████████████| 434/434 [00:00<00:00, 1590.08it/s]
100%|███████████████████████████████████████| 434/434 [00:00<00:00, 1311.26it/s]
100%|████████████████████████████████████████| 434/434 [00:01<00:00, 331.66it/s]
100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1323.90it/s]
100%|████████████████████████████████████████| 430/430 [00:01<00:00, 322.26it/s]


population agrees


In [49]:
# Re-read the 2016 VEST shapefile from disk, replacing the frame loaded (and
# possibly repaired) earlier, and show its columns.
vest16 = gpd.read_file(vest16_data)
vest16.columns

Index(['PRECINCT', 'G16PREDCLI', 'G16PRERTRU', 'G16PREGSTE', 'G16PRELJOH',
       'G16HALDROC', 'G16HALRREI', 'G16HALGPER', 'G16HALLGES', 'G16GOVDCAR',
       'G16GOVRBON', 'G16GOVGGRO', 'G16GOVLGOW', 'G16LTGDHAL', 'G16LTGRGUN',
       'G16INSDNAV', 'G16INSRCRA', 'geometry'],
      dtype='object')

In [50]:
# NOTE(review): vest16 is already the base layer, so election_df holds the
# '16' columns before this call; the call overwrites them with values prorated
# through the census blocks. vest18 is loaded and validated above but never
# added — confirm whether this call was meant to be
# add_vest(vest18, election_df, '18', population_df, start_col).
election_df = add_vest(vest16, election_df, '16', population_df, start_col)

100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1453.95it/s]
100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1180.96it/s]
100%|████████████████████████████████████████| 430/430 [00:01<00:00, 327.91it/s]
100%|███████████████████████████████████████| 430/430 [00:00<00:00, 1333.31it/s]
100%|████████████████████████████████████████| 430/430 [00:01<00:00, 322.29it/s]

population agrees





## Add the district data

In [52]:
# Load the state senate and state house district shapefiles, both reprojected to EPSG:4269.
send, hdist = (
    gpd.read_file(district_path).to_crs('EPSG:4269')
    for district_path in (send_data, hdist_data)
)

In [53]:
# Preview the senate districts to pick the district-number column for add_district.
send.head()

Unnamed: 0,ID,AREA,DISTRICT,MEMBERS,LOCKED,NAME,ADJ_POPULA,ADJ_WHITE,ADJ_BLACK,ADJ_ASIAN,...,IDEAL_VALU,DEVIATION,F_DEVIATIO,F_ADJ_WHIT,F_ADJ_BLAC,F_ADJ_ASIA,F_ADJ_AMIN,F_ADJ_OTHE,DISTRICT_L,geometry
0,1,15.944303,1,1.0,,,44856,29399,10231,1640,...,47124.0,-2268.0,-0.048128,0.655408,0.228085,0.036561,0.001783,0.017657,1|-4.81%,"POLYGON ((-75.47421 39.78841, -75.47409 39.788..."
1,2,60.647408,10,1.0,,,47281,27975,10990,3724,...,47124.0,157.0,0.003332,0.591675,0.23244,0.078763,0.001819,0.027009,10|0.33%,"POLYGON ((-75.69906 39.40833, -75.69908 39.408..."
2,3,20.52145,11,1.0,,,48203,21537,14022,5115,...,47124.0,1079.0,0.022897,0.446798,0.290895,0.106114,0.004336,0.068149,11|2.29%,"POLYGON ((-75.71540 39.60807, -75.71697 39.607..."
3,4,107.66404,12,1.0,,,47368,26073,13463,2908,...,47124.0,244.0,0.005178,0.550435,0.284221,0.061392,0.002977,0.028247,12|0.52%,"POLYGON ((-75.55876 39.65424, -75.55724 39.652..."
4,5,15.994961,13,1.0,,,48294,18048,19838,2801,...,47124.0,1170.0,0.024828,0.373711,0.410776,0.057999,0.00441,0.072618,13|2.48%,"POLYGON ((-75.58859 39.69282, -75.58874 39.692..."


In [54]:
# Preview the house districts to pick the district-number column for add_district.
hdist.head()

Unnamed: 0,ID,AREA,DISTRICT,MEMBERS,LOCKED,NAME,ADJ_POPULA,ADJ_WHITE,ADJ_BLACK,ADJ_ASIAN,...,TOTAL_REGI,DEM_,REP_,OTHER_,F_ADJ_18_W,F_ADJ_18_B,F_ADJ_18_A,DISTRICT_N,DISTRICT_L,geometry
0,1,2.455995,1,1.0,,,23267,6608,14503,313,...,19374,0.754878,0.085114,0.160008,0.317962,0.598196,0.014487,1,01|-3.6%,"POLYGON ((-75.55350 39.76181, -75.55343 39.761..."
1,2,10.331964,2,1.0,,,22961,5099,13969,351,...,17218,0.719422,0.087118,0.19346,0.258801,0.586316,0.018915,2,02|-4.87%,"POLYGON ((-75.55103 39.73524, -75.55105 39.735..."
2,3,1.584466,3,1.0,,,23692,6461,11869,200,...,16771,0.721543,0.093614,0.184843,0.308913,0.484878,0.010081,3,03|-1.84%,"POLYGON ((-75.55230 39.74464, -75.55222 39.744..."
3,4,52.090076,4,1.0,,,23653,19435,1249,300,...,19540,0.356704,0.399846,0.243449,0.853769,0.047228,0.011489,4,04|-2.01%,"POLYGON ((-75.12478 38.68044, -75.12423 38.679..."
4,5,5.388244,5,1.0,,,23485,6842,11952,1312,...,16989,0.677262,0.122609,0.200129,0.321194,0.493084,0.058338,5,05|-2.7%,"POLYGON ((-75.60451 39.66538, -75.60653 39.663..."


In [55]:
# Label each precinct with its state senate district number.
# `DISTRICT` holds the district number; `ID` is only a record id. In
# send.head() the row with ID 2 is district 10 and ID 3 is district 11, so
# labelling by `ID` would write wrong district numbers.
election_df = add_district(send, "SEND", election_df, "DISTRICT")

100%|██████████████████████████████████████████| 21/21 [00:00<00:00, 599.85it/s]
100%|█████████████████████████████████████████| 21/21 [00:00<00:00, 1559.69it/s]
100%|██████████████████████████████████████████| 21/21 [00:00<00:00, 170.06it/s]


In [56]:
# Label each precinct with its state house district number.
# Use the `DISTRICT` column, which holds the district number, rather than the
# record id `ID`. The two coincide in the rows shown by hdist.head(), but only
# `DISTRICT` is the district number by definition.
election_df = add_district(hdist, "HDIST", election_df, "DISTRICT")

100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 676.66it/s]
100%|█████████████████████████████████████████| 41/41 [00:00<00:00, 2241.74it/s]
100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 223.46it/s]


### Put the base precinct year after the precinct information column

In [58]:
# Suffix the precinct identifier columns with the base year, unless that was
# already done ('COUNTY_FIP' + year present). Names missing from the frame are
# ignored by rename.
base_columns = (
    {}
    if 'COUNTY_FIP' + year in election_df.columns
    else {
        identifier: identifier + year
        for identifier in ('COUNTY_FIP', 'COUNTY_NAM', 'PRECINCT')
    }
)
election_df.rename(columns=base_columns, inplace = True)

In [59]:
# reorder the columns: precinct identifier, district labels and population
# columns first, then every remaining column (election results, geometry)
fixed_columns = [
    #'COUNTY_FIP'+year,
    #'COUNTY_NAM'+year,
    'PRECINCT'+year,
    #'CD',
    'SEND',
    # HDIST is added by add_district just like SEND; list it here so both
    # district labels sit together instead of HDIST landing among the
    # election columns.
    'HDIST',
    'TOTPOP',
    'NH_2MORE',
    'NH_AMIN',
    'NH_ASIAN',
    'NH_BLACK',
    'NH_NHPI',
    'NH_OTHER',
    'NH_WHITE',
    'HISP',
    'H_AMIN',
    'H_ASIAN',
    'H_BLACK',
    'H_NHPI',
    'H_OTHER',
    'H_WHITE',
    'H_2MORE',
    'VAP',
    'HVAP',
    'WVAP',
    'BVAP',
    'AMINVAP',
    'ASIANVAP',
    'NHPIVAP',
    'OTHERVAP',
    '2MOREVAP']

# everything not listed above keeps its current relative order
election_columns = [col for col in election_df.columns if col not in fixed_columns]
final_col = fixed_columns + election_columns
election_df = election_df[final_col]

In [60]:
# store the result in the directory named after the state abbreviation ("./<state_ab>/")
# exist_ok=True keeps the cell re-runnable: without it, every run after the
# first fails with FileExistsError before anything is written.
os.makedirs("./{}".format(state_ab), exist_ok=True)
election_df.to_file("./{}/{}.shp".format(state_ab, state_ab))
election_df.to_file('./{}/{}.geojson'.format(state_ab, state_ab), driver='GeoJSON')

# Only do once to build json and read from file when generating ensembles
graph = Graph.from_file("./{}/{}.shp".format(state_ab, state_ab), ignore_errors=True)
graph.to_json("./{}/{}.json".format(state_ab, state_ab))