## Intro to MAUP

@author: eveomett AI for Redistricting, USF All data retrieved 04/30/24:
https://redistrictingdatahub.org/dataset/virginia-block-pl-94171-2020-by-table/
https://redistrictingdatahub.org/dataset/vest-2020-virginia-precinct-boundaries-and-election-results-shapefile/
https://redistrictingdatahub.org/dataset/2021-virginia-congressional-districts-approved-plan/

https://redistrictingdatahub.org/dataset/vest-2018-virginia-precinct-and-election-results/
https://redistrictingdatahub.org/dataset/vest-2016-virginia-precinct-and-election-results/
https://redistrictingdatahub.org/dataset/vest-2021-virginia-precinct-boundaries-and-election-results-shapefile/
https://redistrictingdatahub.org/dataset/vest-2017-virginia-precinct-boundaries-and-election-results-shapefile/

https://redistrictingdatahub.org/dataset/2021-senate-of-virginia-districts-approved-plan/
https://redistrictingdatahub.org/dataset/2021-virginia-house-of-delegates-districts-approved-plan/

In [4]:
import pandas as pd
import geopandas as gpd
import maup
import time
from maup import smart_repair
from gerrychain import Graph

maup.progress.enabled = True

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
# state = Virginia
state_ab = "va"

## Data
1. Download all the data in directory "il_data"
2. Extract them all

In [7]:
data_folder = state_ab + "_data/"
population1_data = "./{}{}_pl2020_b-2/{}_pl2020_p1_b.shp".format(data_folder, state_ab, state_ab)
population2_data = "./{}{}_pl2020_b-2/{}_pl2020_p2_b.shp".format(data_folder, state_ab, state_ab)
vap_data =  "./{}{}_pl2020_b-2/{}_pl2020_p4_b.shp".format(data_folder, state_ab, state_ab)
vest21_data = "./{}{}_vest_21/{}_vest_21.shp".format(data_folder, state_ab, state_ab)
vest20_data = "./{}{}_vest_20/{}_vest_20.shp".format(data_folder, state_ab, state_ab)
vest18_data = "./{}{}_vest_18/{}_vest_18.shp".format(data_folder, state_ab, state_ab)
vest17_data = "./{}{}_vest_17/{}_vest_17.shp".format(data_folder, state_ab, state_ab)
vest16_data = "./{}{}_vest_16/{}_vest_16.shp".format(data_folder, state_ab, state_ab)
cd_data = "./{}{}_cong_adopted_2021/SCV FINAL CD.shp".format(data_folder, state_ab)
send_data = "./{}{}_sldu_2021/va_sldu_2021.shp".format(data_folder, state_ab)
hdist_data = "./{}{}_sldl_adopted_2021/va_sldl_adopted_2021.shp".format(data_folder, state_ab)

In [8]:
def do_smart_repair(df):
    # change it to the UTM it needs for smart_repair
    df = df.to_crs(df.estimate_utm_crs())
    df = smart_repair(df)
    if maup.doctor(df):
        print('smart_repair successful')
            
        # change it back to this UTM for this data
        df = df.to_crs('EPSG:4269')
    else:
        print('smart_repair failed')
    return df

In [9]:
def add_district(dist_df, dist_name, election_df, col_name):
    # check if it needs to be smart_repair
    if maup.doctor(dist_df) != True:
        dist_df = do_smart_repair(dist_df)
    
    election_df = gpd.GeoDataFrame(election_df, crs="EPSG:4269")
    
    # assigne the pricincts
    precincts_to_district_assignment = maup.assign(election_df.geometry, dist_df.geometry)
    election_df[dist_name] = precincts_to_district_assignment
    for precinct_index in range(len(election_df)):
        election_df.at[precinct_index, dist_name] = dist_df.at[election_df.at[precinct_index, dist_name], col_name]
    
    return election_df

In [10]:
def rename(original, year):
    party = original[6]
    if party == 'R' or party == 'D':
        return original[3:6] + year + original[6]
    else:
        return original[3:6] + year + 'O'

In [11]:
def add_vest_data(vest_data, df, year, block_df):
    vest = gpd.read_file(vest_data)
    
     # check if it needs to be smart_repair
    if maup.doctor(vest) != True:
        vest = do_smart_repair(vest)
    
    # rename the columns
    original_col = vest.columns[5:-1]
    new_col = [rename(i, year) for i in original_col]
    rename_dict = dict(zip(original_col, new_col))
    vest = vest.rename(columns=rename_dict)
    vest = vest.groupby(level=0, axis=1).sum() # combine all the other party's vote into columns with sufix "O"
    col_name = list(set(new_col))
    col_name.sort()
    
    # assign pricinct to block
    vest = gpd.GeoDataFrame(vest, crs="EPSG:4269")
    vest_to_block_assginment = maup.assign(vest.geometry, block_df.geometry)
    block = block_df[['geometry']]
    block[col_name] = vest[col_name].groupby(vest_to_block_assginment).sum()
    
    # assign block to vest
    block = gpd.GeoDataFrame(block, crs="EPSG:4269")
    df = gpd.GeoDataFrame(df, crs="EPSG:4269")
    block_to_pricinct_assginment = maup.assign(block.geometry, df.geometry)
    df[col_name] = block[col_name].groupby(block_to_pricinct_assginment).sum()
    df = df.groupby(level=0, axis=1).sum()
    
    return df

### Read the census data

In [13]:
population1_df = gpd.read_file(population1_data)
population2_df = gpd.read_file(population2_data)
vap_df = gpd.read_file(vap_data)

In [14]:
population2_df = population2_df.drop(columns=['SUMLEV', 'LOGRECNO', 'GEOID', 'COUNTY', 'geometry'])
population_df = pd.merge(population1_df, population2_df, on='GEOID20')

In [15]:
maup.doctor(population_df)

100%|██████████████████████████████████| 163491/163491 [04:50<00:00, 563.68it/s]


True

In [17]:
population_df['H_WHITE'] = population_df.apply(lambda t: t['P0010003'] - t['P0020005'], 1)
population_df['H_BLACK'] = population_df.apply(lambda t: t['P0010004'] - t['P0020006'], 1)
population_df['H_AMIN'] = population_df.apply(lambda t: t['P0010005'] - t['P0020007'], 1)
population_df['H_ASIAN'] = population_df.apply(lambda t: t['P0010006'] - t['P0020008'], 1)
population_df['H_NHPI'] = population_df.apply(lambda t: t['P0010007'] - t['P0020009'], 1)
population_df['H_OTHER'] = population_df.apply(lambda t: t['P0010008'] - t['P0020010'], 1)
population_df['H_2MORE'] = population_df.apply(lambda t: t['P0010009'] - t['P0020011'], 1)

## Read the vest 20 data

Now using it as a "base pricinct", but it could be vest 18 or vest 16 if vest 20 is not working

In [18]:
vest20 = gpd.read_file(vest20_data)

In [19]:
maup.doctor(vest20)

100%|██████████████████████████████████████| 2477/2477 [00:11<00:00, 208.03it/s]


There are 58 overlaps.
There are 587 holes.


False

In [20]:
do_smart_repair(vest20)

Snapping all geometries to a grid with precision 10^( -5 ) to avoid GEOS errors.
Identifying overlaps...


100%|██████████████████████████████████████| 2623/2623 [00:03<00:00, 773.34it/s]


Resolving overlaps...
Assigning order 2 pieces...
Assigning order 3 pieces...
Filling gaps...


Gaps to simplify: 100%|█████████████████████████| 64/64 [00:35<00:00,  1.80it/s]
Gaps to fill: 100%|███████████████████████████████| 7/7 [00:04<00:00,  1.56it/s]
100%|██████████████████████████████████████| 2477/2477 [00:11<00:00, 223.49it/s]


smart_repair successful


Unnamed: 0,COUNTYFP,LOCALITY,VTDST,PRECINCT,G20PREDBID,G20PRERTRU,G20PRELJOR,G20PREOWRI,G20USSDWAR,G20USSRGAD,G20USSOWRI,geometry
0,001,Accomack County,000101,Chincoteague,837,1618,29,2,915,1563,3,"POLYGON ((-75.42507 37.89957, -75.42499 37.899..."
1,001,Accomack County,000201,Atlantic,321,657,11,2,357,644,0,"POLYGON ((-75.42499 37.89985, -75.42507 37.899..."
2,001,Accomack County,000202,Greenbackville,516,1091,18,0,539,1054,0,"POLYGON ((-75.41691 37.93523, -75.41651 37.934..."
3,001,Accomack County,000301,New Church,1013,667,14,2,1003,638,2,"POLYGON ((-75.58612 37.89203, -75.58639 37.892..."
4,001,Accomack County,000401,Bloxom,307,462,8,0,306,447,0,"POLYGON ((-75.58754 37.86652, -75.58770 37.866..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2472,800,Suffolk City,000302,John F Kennedy (CD 4),1111,54,13,5,1082,58,5,"POLYGON ((-76.56694 36.72110, -76.56715 36.721..."
2473,800,Suffolk City,000504,Lake Cohoon (CD 3),143,325,2,2,149,317,0,"POLYGON ((-76.65292 36.82553, -76.64793 36.829..."
2474,800,Suffolk City,000302,John F Kennedy (CD 3),930,300,12,7,952,285,0,"POLYGON ((-76.46673 36.74654, -76.46676 36.746..."
2475,059,Fairfax County,000712,Shreve,674,312,24,11,661,357,0,"POLYGON ((-77.19151 38.89153, -77.19137 38.891..."


In [21]:
original_col = vest20.columns[5:-1]
new_col = [rename(i, '20') for i in original_col]
rename_dict = dict(zip(original_col, new_col))
vest20 = vest20.rename(columns=rename_dict)
vest20 = vest20.groupby(level=0, axis=1).sum()
vest20 = gpd.GeoDataFrame(vest20, crs="EPSG:4269")

In [22]:
# vap and population have the same GEOID20
blocks_to_precincts_assignment = maup.assign(population_df.geometry, vest20.geometry)

100%|███████████████████████████████████████| 2477/2477 [00:39<00:00, 62.78it/s]
100%|██████████████████████████████████████| 2477/2477 [00:04<00:00, 556.99it/s]


In [24]:
pop_column_names = ['P0020001', 'P0020002', 'P0020005', 'P0020006', 'P0020007', 'P0020008', 'P0020009', 'P0020010', 'P0020011', 
                    'H_WHITE', 'H_BLACK', 'H_AMIN', 'H_ASIAN', 'H_NHPI', 'H_OTHER', 'H_2MORE']
vap_column_names = ['P0040001', 'P0040002', 'P0040005', 'P0040006', 'P0040007', 'P0040008', 'P0040009', 'P0040010', 'P0040011']

vest20[pop_column_names] = population_df[pop_column_names].groupby(blocks_to_precincts_assignment).sum()
vest20[vap_column_names] = vap_df[vap_column_names].groupby(blocks_to_precincts_assignment).sum()

In [25]:
print(population_df['P0020001'].sum())
print(vest20['P0020001'].sum())
print(vap_df['P0040001'].sum())
print(vest20['P0040001'].sum())

8631393
8631389.0
6745054
6745051.0


In [None]:
cong_df = gpd.read_file(cd_data)
send = gpd.read_file(send_data)
hdist = gpd.read_file(hdist_data