# Eco Prep
Use this notebook to prepare raw data for ecological inference. Once you have exported your .csv file, you can input it into `Eco Inference.ipynb`. What you'll need is a block-level TIGER/Line shapefile for the target state and also block-level P1 racial census data for the state in addition to the large 2020 results aggregated to blocks csv and native lands csv mentioned in Appendix A of the writeup.

In [1]:
#Imports
import geopandas as gpd
import pandas as pd
import shapely
import numpy as np
import maup

#Import native lands data
AIANHH_data = pd.read_csv("national_block_assignment_aianhh/national_block_assignment_aianhh.csv")
#Import block level TIGER/Line
state_block_gdf = gpd.read_file("South Dakota/tl_2024_46_tabblock20/tl_2024_46_tabblock20.shp")
#Import repaired precinct-level result file
national_2020_results_state = gpd.read_file("South Dakota/SD_repaired/SD_repaired.shp")
#Import census demographic data
state_racial_data = pd.read_csv("South Dakota/SD_P1_Data/DECENNIALPL2020.P1-Data.csv")

  AIANHH_data = pd.read_csv("national_block_assignment_aianhh/national_block_assignment_aianhh.csv")
  state_racial_data = pd.read_csv("South Dakota/SD_P1_Data/DECENNIALPL2020.P1-Data.csv")


#### Prepare for merging

In [2]:
#Drop index
state_racial_data.drop(0, inplace=True)
#Reformat geoid for compatibility
state_racial_data['GEO_ID'] = state_racial_data['GEO_ID'].str[9:26]


In [3]:
#Ensure everything is an int for merging
state_racial_data['GEOID20'] = state_racial_data['GEO_ID'].astype(int)
state_racial_data['P1_005N']=state_racial_data['P1_005N'].astype(int)
state_racial_data['P1_003N']=state_racial_data['P1_003N'].astype(int)
state_racial_data['P1_001N']=state_racial_data['P1_001N'].astype(int)
state_racial_data['P1_012N']=state_racial_data['P1_012N'].astype(int)


In [4]:
state_block_gdf['GEOID20']=state_block_gdf['GEOID20'].astype(int)

In [5]:
state_block_gdf = state_block_gdf.merge(AIANHH_data, on = ['GEOID20'], how = 'left')

In [6]:
state_native_areas_added = state_block_gdf.merge(AIANHH_data, on = ['GEOID20'], how = 'left')

In [7]:
state_block_gdf = state_block_gdf.merge(state_racial_data, on = ['GEOID20'], how = 'left')

In [8]:
state_block_gdf.crs

<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. Puerto Rico. United States (USA) - Alabama; Alaska; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Hawaii; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming. US Virgin Islands. British Virgin Islands

In [9]:
#Modify coordinate system for maup.assign
national_2020_results_state=national_2020_results_state.to_crs(4269)

#### Begin merges

In [10]:
sum_variables = ["P1_001N", "P1_005N", "P1_012N","P1_003N"]

#Assign blocks to precincts
blocks_to_precincts_assignment = maup.assign(state_block_gdf.geometry, national_2020_results_state.geometry)
#Group columns of interest at the block level and sum them by precinct
national_2020_results_state[sum_variables] = state_block_gdf[sum_variables].groupby(blocks_to_precincts_assignment).sum()

print(national_2020_results_state['P1_001N'].sum()) #Sanity check


  df = df[df.area > area_cutoff].reset_index(drop=True)

  geometries = geometries[geometries.area > area_cutoff]

  return assign_to_max(intersections(sources, targets, area_cutoff=0).area)


886667


In [11]:
#Bespoke division function to account for division by 0 error
def divide(a, b):
    try:
        return a / b
    except ZeroDivisionError:
        return 0

#Create group percentages for eco inference
national_2020_results_state['native_american_perc'] = divide(national_2020_results_state['P1_005N'],national_2020_results_state['P1_001N'])
national_2020_results_state['mixed_native_american_perc'] = divide(national_2020_results_state['P1_012N'],national_2020_results_state['P1_001N'])
national_2020_results_state['white_alone_perc'] = divide(national_2020_results_state['P1_003N'],national_2020_results_state['P1_001N'])
national_2020_results_state['other'] = divide((national_2020_results_state['P1_001N']-(national_2020_results_state['P1_005N']+national_2020_results_state['P1_012N']+national_2020_results_state['P1_003N'])), national_2020_results_state['P1_001N'])

#Drop columns with Nan entires in key columns as that implies they are regions without population
national_2020_results_state_corrected = national_2020_results_state.dropna(subset=['native_american_perc', 'mixed_native_american_perc','other', 'white_alone_perc'])


In [12]:
#Create candidate support percentages for eco inference
national_2020_results_state_corrected['perc_trump'] = divide(national_2020_results_state_corrected['G20PRERTRU'],(national_2020_results_state_corrected['G20PRERTRU'] + national_2020_results_state_corrected['G20PREDBID']))
national_2020_results_state_corrected['perc_biden'] = divide(national_2020_results_state_corrected['G20PREDBID'],(national_2020_results_state_corrected['G20PRERTRU'] + national_2020_results_state_corrected['G20PREDBID']))
#Note that total votes does not include third party candidates
national_2020_results_state_corrected['total_votes'] = (national_2020_results_state_corrected['G20PRERTRU'] + national_2020_results_state_corrected['G20PREDBID'])
#Drop columns with Nan entires in key columns as that implies they are regions without population
national_2020_results_state_corrected=national_2020_results_state_corrected.dropna(subset=['perc_trump', 'perc_biden'])

In [13]:
national_2020_results_state_corrected

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAME,G20PRERTRU,G20PREDBID,G20PRELJOR,G20USSRROU,G20USSDAHL,G20HALRJOH,...,P1_005N,P1_012N,P1_003N,native_american_perc,mixed_native_american_perc,white_alone_perc,other,perc_trump,perc_biden,total_votes
0,46,103,VTD3-4,Precinct RC 3-4,1738,1298,88,1917,1201,2305,...,202,112,4569,0.038337,0.021256,0.867147,0.073259,0.572464,0.427536,3036
1,46,103,VTD1-3,Precinct RC 1-3,1716,1003,78,1866,919,2203,...,258,120,4133,0.053273,0.024778,0.853397,0.068553,0.631114,0.368886,2719
2,46,103,VTD1-2,Precinct RC 1-2,1275,795,89,1380,758,1577,...,491,195,3593,0.105728,0.041990,0.773686,0.078596,0.615942,0.384058,2070
3,46,103,VTD2-4,Precinct RC 2-4,1236,1131,129,1371,1093,1578,...,1124,318,4425,0.173618,0.049120,0.683503,0.093760,0.522180,0.477820,2367
4,46,103,VTD2-1,Precinct RC 2-1,60,66,8,72,61,76,...,121,24,218,0.295844,0.058680,0.533007,0.112469,0.476190,0.523810,126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,46,033,VTD-SD,Precinct 05 Sentinel Hill,523,212,17,537,208,586,...,39,30,1044,0.034002,0.026155,0.910201,0.029643,0.711565,0.288435,735
733,46,103,VTDHC1,Precinct HC,943,418,39,982,409,1075,...,76,59,1840,0.033898,0.026316,0.820696,0.119090,0.692873,0.307127,1361
734,46,033,VTD-PR,Precinct 04 Pringle,341,129,2,338,124,360,...,9,10,623,0.013554,0.015060,0.938253,0.033133,0.725532,0.274468,470
735,46,033,VTD-BR,Precinct 09 Bear Rock,963,404,35,1011,387,1120,...,17,49,1782,0.008901,0.025654,0.932984,0.032461,0.704462,0.295538,1367


In [14]:
#Replace whatever precinct naming scheme column with just 'NAME' for ease of use and reusability
national_2020_results_state_corrected['NAME'] = national_2020_results_state_corrected['NAME'].astype(str)

#### Export data

In [15]:
#Filter to just columns of interest
exportable = national_2020_results_state_corrected[["native_american_perc", "perc_trump", "G20PRERTRU", "NAME", "G20PREDBID", "perc_biden", 'total_votes', 'mixed_native_american_perc', 'white_alone_perc', 'other']]
#Export data to csv
exportable.to_csv('Prepared_4_Eco_Inference/SouthDakota.csv', index = True)
