In [1]:
from pprint import pprint
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import folium
import os
import maup
import multiprocessing as mp
from pyproj import CRS
import re

In [2]:
# Switch the working directory to the `Data` folder located two levels above
# the current working directory, so the relative paths used by the read/write
# cells below resolve against it.
# NOTE: the target is computed from the *current* directory, so run this cell
# only once per kernel session -- a second run would climb two more levels.
current_directory = os.getcwd()
move_up = os.path.dirname(os.path.dirname(current_directory))
data_dir = os.path.join(move_up, 'Data')
# os.chdir() returns None, so its result is intentionally not stored.
os.chdir(data_dir)
print(os.getcwd())

# Data

---

- [ ] Find Precinct Data (Nevada + Mississippi)
- [ ] Find State Assembly Shapefile
- [ ] Demographic Population

# Pre-processing

---

- [ ] Read ShapeFile
- [ ] Convert ShapeFile -> GeoJSON
- [ ] Multi-threaded and Parallelize Code
- [ ] 

### Mississippi State Assembly Data Processing

In [345]:
# Load the 2022 Mississippi district boundaries and draw them as read from disk.
ms_districts_df = gpd.read_file('ms_State_Assembly_2022.geojson')
ms_districts_df.plot()

# Reproject to EPSG:4326, then show the resulting CRS and the column names.
ms_districts_df = ms_districts_df.to_crs(epsg=4326)
display(ms_districts_df.crs, ms_districts_df.columns)

# One-off export of the reprojected layer, kept for reference:
# ms_districts_df.to_file('ms_State_Assembly_2022.geojson', driver='GeoJSON')


Data gathered from the [Redistricting Data Hub](https://redistrictingdatahub.org/dataset/2022-mississippi-senate-districts-approved-plan/)

### Demographic Data on Block Level

In [344]:
# Load the 2020 block-level demographic layer.
ms_block_race_df = gpd.read_file('ms-block-demo-2020.geojson')

# Disabled interactive exploration of the blocks (by area / by state FIPS):
# ms_block_race_df['area'] = ms_block_race_df['geometry'].area
# ms_block_race_df.explore('area', legend=False)
# ms_block_race_df.explore('STATEFP20', legend=False)

# Static plot of the block geometries, then a preview of rows and column names.
ms_block_race_df.plot()
display(ms_block_race_df.head())
display(ms_block_race_df.columns)

# Disabled comparison against an alternative block-group source:
# https://redistrictingdatahub.org/dataset/mississippi-2022-select-race-data-from-the-american-community-survey-20182022-at-the-block-group-level/
# diff_block_df = gpd.read_file('MS/ms_race_2022_bg/ms_race_2022_bg.shp')
# display(diff_block_df.isna().sum())
# display(diff_block_df.columns)


# Disabled side-by-side plot of the two sources:
# fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 16))
# ax1 = diff_block_df.plot(ax=ax1)
# ax2 = ms_block_race_df.plot(ax=ax2)

Data from [Districtr](https://districtr.org/plan)

### General Elections Precinct Data + Geometry

In [350]:
# Load the 2022 general-election precinct shapefile (results + boundaries).
ms_prec_bound_2022_df = gpd.read_file('ms_gen_22_prec/ms_gen_22_prec.shp')
# Disabled one-off GeoJSON export and interactive map:
# ms_prec_bound_2022_df.to_file('msGenPrecBound2022', driver='GeoJSON')
# ms_prec_bound_2022_df.explore()


# Last expression of the cell: renders the full frame as the cell output.
ms_prec_bound_2022_df

# Disabled side-by-side plot of precincts vs. blocks:
# fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 16))
# ax1 = ms_prec_bound_2022_df.plot(ax=ax1)
# ax2 = ms_block_race_df.plot(ax=ax2)

In [None]:
# List the column names of the raw precinct layer.
ms_prec_bound_2022_df.columns

Data from [Redistricting Hub](https://redistrictingdatahub.org/dataset/mississippi-2022-general-election-precinct-level-results-and-boundaries/)

In [None]:
# Build working copies of the precinct and block layers in one shared CRS.
# `to_crs` without `inplace=True` returns a new frame, so the raw
# `ms_prec_bound_2022_df` / `ms_block_race_df` loaded above are left untouched
# (a plain assignment would only alias them) and this cell can be re-run safely.
# NOTE(review): EPSG:32030 appears to be the NAD27 / Rhode Island state-plane
# CRS (US survey feet) rather than a Mississippi projection -- confirm the code.
ms_prec_data = ms_prec_bound_2022_df.to_crs(32030)
display(ms_prec_data.columns)

ms_block_data = ms_block_race_df.to_crs(32030)
display(ms_block_data.columns)

# Confirm the CRS the working precinct layer now carries.
display(ms_prec_data.crs)


In [None]:
# Run maup's geometry diagnostics on the precinct layer only; the block-layer
# check and the per-row validity dump are disabled.
# maup.doctor(ms_block_data)
maup.doctor(ms_prec_data)
# print(ms_prec_data.geometry.is_valid)

In [None]:
# Show the UTM CRS geopandas estimates for this layer (display only -- the
# estimate is not used below).
display(ms_prec_data.estimate_utm_crs())
# Project the precinct layer to EPSG:32030 in place.
# NOTE(review): EPSG:32030 appears to be NAD27 / Rhode Island, which differs
# from the UTM estimate shown above -- confirm which CRS is intended.
ms_prec_data.to_crs(epsg=32030, inplace=True)
# Rebind the name to maup's repaired version of the layer. Re-running this
# cell repairs the already-repaired frame again.
ms_prec_data = maup.smart_repair(ms_prec_data)

In [None]:
# Map every block to a precinct with maup; the result is used further down as
# the group key when block-level counts are summed per precinct.
block_prec_aggr = maup.assign(ms_block_data, ms_prec_data)
# Column names the two layers have in common.
display(ms_prec_data.columns.intersection(ms_block_data.columns))


In [None]:
# Show the precinct layer, then persist it as GeoJSON in the working directory.
display(ms_prec_data)
ms_prec_data.to_file("fixed_ms_prec_geom.geojson", driver="GeoJSON")

In [None]:
# Column names of the precinct layer and of the block layer, for reference.
display(ms_prec_data.columns)
display(ms_block_data.columns)

### Redistricting Hub Precinct-Level Data + Districtr Block Demographic Data (provided better geometry)

In [None]:
# Block-level count columns to roll up to precincts.
variables = ['TOTPOP', 'WHITE', 'BLACK', 'ASIAN', 'HISP']

# Sum the block counts grouped by the block -> precinct assignment and store
# the totals as new columns on the precinct layer (aligned on the index).
ms_prec_data[variables] = ms_block_data[variables].groupby(block_prec_aggr).sum()


# display(ms_prec_data[variables].head())
# display(ms_prec_data.isna().sum())
# display(ms_prec_data)
# ms_prec_data.plot()

In [None]:
# Candidate vote columns grouped by the party label they are summed into.
# NOTE(review): the "L" in GCON04LJOH presumably marks a Libertarian
# candidate, yet the total is stored as '2022_LIBERAL' -- confirm the naming.
dem_vote_cols = ['GCON01DBLA', 'GCON02DTHO', 'GCON03DYOU', 'GCON04DDUP']
rep_vote_cols = ['GCON01RKEL', 'GCON02RFLO', 'GCON03RGUE', 'GCON04REZE']
lib_vote_cols = ['GCON04LJOH']

# Row-wise party totals per precinct.
for party_label, vote_cols in (
    ('2022_DEMOCRATIC', dem_vote_cols),
    ('2022_REPUBLICAN', rep_vote_cols),
    ('2022_LIBERAL', lib_vote_cols),
):
    ms_prec_data[party_label] = ms_prec_data[vote_cols].sum(axis=1)

# Buffered copy of every precinct geometry; the distance of 200 is expressed
# in the units of the layer's CRS.
ms_prec_data['border'] = ms_prec_data.buffer(200)

# One single-index sub-frame per index value of the precinct layer.
rows = [index_group for _, index_group in ms_prec_data.groupby(ms_prec_data.index)]

# Working frame without the per-candidate columns and two unused attributes.
new_prec_data = ms_prec_data.drop(
    columns=dem_vote_cols + rep_vote_cols + lib_vote_cols + ['POLL_LOC', 'CNTY_CODE']
)

new_prec_data

In [None]:
# Choropleth of the precincts coloured by the 'WHITE' column.
new_prec_data.plot(column='WHITE', cmap='OrRd', legend=True)

# 2023 General Election Precinct-Level Results

In [19]:
# Read the 2023 precinct-level results CSV through geopandas.
# NOTE(review): the source is a plain CSV; the values presumably load as
# strings (they are cast to int in a later cell) -- confirm, and consider
# pd.read_csv if no geometry is needed.
ms_gen_23 = gpd.read_file('MS Data/ms_gen_2023_prec/ms_gen_2023_prec.csv')
# Keep the first four columns plus the columns at positions 10-12. The slice is
# positional, so it silently depends on the column order of the file.
ms_gov_unfiltered = ms_gen_23.iloc[:, :4].join(ms_gen_23.iloc[:, 10:13])

In [20]:
# Inspect the column subset selected above.
ms_gov_unfiltered

In [None]:

# Write the unfiltered subset to an Excel workbook in the working directory.
ms_gov_unfiltered.to_excel('ms_atg_gov_unfiltered.xlsx')

We remove the columns we don't need, such as results for the Commissioner of Agriculture & Commerce race \
and for other parties.

In [21]:
# Columns to discard from the 2023 subset (currently a single column).
ms_unused_columns = ['G23GOVIGRA']
# New frame without those columns; `ms_gov_unfiltered` itself is unchanged.
ms_gov_23 = ms_gov_unfiltered.drop(ms_unused_columns, axis=1)

ATG and GOV data from [Redistricting Hub](https://redistrictingdatahub.org/dataset/mississippi-2023-general-election-precinct-level-results/)

In [39]:
# Load the VEST 2020 precinct shapefile and list its columns.
# (A stray, undefined `to_m` token used to end this cell and raised NameError.)
ms_vest_20 = gpd.read_file('MS Data/ms_vest_20/ms_vest_20.shp')
ms_vest_20.columns

In [40]:
# NOTE(review): `ms_vest_repaired` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line presumably raises NameError -- confirm
# where it is defined or remove the cell.
ms_vest_repaired

VEST Presidential Election Data from [Redistricting Hub](https://redistrictingdatahub.org/dataset/vest-2020-mississippi-precinct-and-election-results/) \
Precinct-Level Presidential Data + Geometry

In [None]:
# Write the full VEST 2020 table to an Excel workbook in the working directory.
ms_vest_20.to_excel('ms_vest_20.xlsx')

In [24]:
# Keep the first seven and the last four columns of the VEST table. The slices
# are positional, so they depend on the column order of the shapefile.
ms_pres_prec_20 = ms_vest_20.iloc[:, :7].join(ms_vest_20.iloc[:, -4:])
ms_pres_prec_20

In [25]:
# Columns to discard, and identifier columns to cast to integers.
ms_unused_prec_col = ['G20USSLEDW', 'GEOID20']
pres_str_to_int = ['STATEFP20', 'COUNTYFP20', 'VTDST20']

# Drop the unused columns, cast each identifier column to int, then order the
# rows by county code.
ms_pres_20 = ms_pres_prec_20.drop(columns=ms_unused_prec_col)
for code_col in pres_str_to_int:
    ms_pres_20[code_col] = ms_pres_20[code_col].astype(int)
ms_pres_20 = ms_pres_20.sort_values(by='COUNTYFP20')
ms_pres_20

In [26]:
# Ordered (pattern, replacement) rewrites applied by clean_location().
# Order matters: every rule runs on the output of the previous one.
_LOCATION_REWRITES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # "Dist. 3, Oak Grove Precinct" -> "Oak Grove"
        (r'Dist\.\s\d+,\s(.+)\sPrecinct', r'\1'),
        # "County Precinct 2-A" -> "District A"
        (r'County\sPrecinct\s\d+-([A-Za-z])', r'District \1'),
        # "Precinct 12" -> "12"
        (r'Precinct\s(\d+)', r'\1'),
        # "Beat 1 Ashland (ASH)" -> "Ashland"
        (r'Beat\s\d+\s([A-Za-z]+)\s\([A-Z]+\)', r'\1'),
        # "<name> North - 4n 2" -> "<name> 4n 2"
        (r'([A-Za-z]+)\sNorth\s-\s(\d+b?n)\s(\d+)', r'\1 \2 \3'),
        # "Oxford Highway 7 - 3" -> "Oxford Hwy"
        (r'([A-Za-z]+)\sHighway\s\d+\s-\s\d+', r'\1 Hwy'),
        # "05 - Pine Ridge" -> "Pine Ridge"
        (r'\d+\s-\s(.+)', r'\1'),
        # "(12) Pine Ridge" -> "Pine Ridge"
        (r'\(\d+\)\s(.+)', r'\1'),
        # Any remaining "<number> - <name>" -> "<name>"
        (r'(\d+)\s-\s([A-Za-z-]+)', r'\2'),
        # Drop a trailing "(xx-yy)" tag after a name
        (r'([A-Za-z ]+)\(\w+-\w+\)', r'\1'),
        # Normalise the whitespace in front of a number to a single space
        (r'(\w+)\s(\d+)', r'\1 \2'),
        (r'(\w+)\s(\w+)\s(\d+)', r'\1 \2 \3'),
        # "Ward 5A" -> "Ward 5 A". The suffix must start with a non-digit:
        # with a bare (\w+) the engine backtracks and splits multi-digit
        # numbers ("Ward 12" -> "Ward 1 2").
        (r'(\w+)\s(\d+)([^\W\d]\w*)', r'\1 \2 \3'),
        # Leading "Beat" label. The look-ahead keeps words that merely start
        # with those letters ("Beatrice") intact.
        (r'^Beat(?![A-Za-z])\s*', ''),
    )
)


def clean_location(location: str) -> str:
    """Normalise a raw precinct label into an upper-case matching key.

    The label is passed through the ordered rewrites in ``_LOCATION_REWRITES``
    (district/beat/precinct prefixes, numeric prefixes, parenthesised tags),
    a leading "<number> -" is removed, and any leading digits, dots, dashes
    and spaces are stripped before upper-casing.

    Args:
        location: Raw precinct name; must be a ``str``.

    Returns:
        The cleaned, upper-cased name. It can be an empty string when the
        label consists only of a prefix and digits (e.g. "Precinct 12").

    Raises:
        TypeError: If ``location`` is not a string (e.g. NaN from pandas).
    """
    for pattern, replacement in _LOCATION_REWRITES:
        location = pattern.sub(replacement, location)

    # Remove a leading "<number> -" left over once surrounding blanks are gone.
    location = re.sub(r'^\d+\s*-', '', location.strip())

    return location.lstrip('0123456789.- ').upper()

def remove_symbols(text):
    """Return ``text`` with every '.', '#' and apostrophe character deleted."""
    unwanted = re.compile(r'[.#\']')
    return unwanted.sub('', text)

def replace_with_space(text):
    """Return ``text`` with each backslash, '/' and '-' swapped for one space."""
    separators = re.compile(r'[\\/-]')
    return separators.sub(' ', text)

In [27]:
# Run the raw precinct names through the three cleaners, in order, and store
# the result as the 'Location Cleaned' column.
precinct_names = ms_gov_23['precinct']
for cleaner in (clean_location, remove_symbols, replace_with_space):
    precinct_names = precinct_names.apply(cleaner)
ms_gov_23['Location Cleaned'] = precinct_names
ms_gov_23

In [28]:
# Rows whose COUNTYFP is the literal string 'NONE' are assigned county code 33.
# NOTE(review): 33 is hard-coded -- confirm which county these rows belong to.
ms_gov_23.loc[ms_gov_23['COUNTYFP'] == 'NONE', 'COUNTYFP'] = 33
ms_gov_23.columns

In [29]:
# Cast the county code and the two governor vote-count columns to integers.
strs_to_int_col = ['COUNTYFP', 'G23GOVDPRE', 'G23GOVRREE']
ms_gov_23[strs_to_int_col] = ms_gov_23[strs_to_int_col].astype('int')
# Join key: "<cleaned precinct name> <county code>". The 2020 frame builds a
# column of the same name and the two are merged on it further down.
ms_gov_23['UNIQUE_ID'] = (ms_gov_23['Location Cleaned'] + ' ' + ms_gov_23['COUNTYFP'].astype('str')).astype('str')

In [30]:
# Inspect the 2023 frame after the casts and the UNIQUE_ID column were added.
ms_gov_23

In [373]:
# Write the cleaned 2023 frame to an Excel workbook in the working directory.
ms_gov_23.to_excel('ms_atg_gov_23.xlsx')

In [31]:
# The bare name below is not the last expression of the cell, so it displays
# nothing.
ms_pres_20
# Join key: "<NAME20 with separators/symbols normalised, upper-cased> <county code>".
# Unlike the 2023 side, NAME20 is not passed through clean_location() here.
ms_pres_20['UNIQUE_ID'] = (ms_pres_20['NAME20'].apply(replace_with_space).apply(remove_symbols).str.upper() + ' ' + ms_pres_20['COUNTYFP20'].astype('str')).astype('str')

In [38]:
# Show the CRS carried by the 2020 presidential frame.
ms_pres_20.crs

In [376]:
# Write the 2020 presidential frame to an Excel workbook in the working directory.
ms_pres_20.to_excel('ms_pres_20.xlsx')

In [35]:
# Left-join the 2023 results onto the 2020 precincts by UNIQUE_ID: every 2020
# row is kept, and the 2023 columns are NaN where no key matches.
test_merge = pd.merge(ms_pres_20, ms_gov_23, on='UNIQUE_ID', how='left')
# Per-column NaN counts -- for the 2023 columns this counts the rows left
# without 2023 values.
pprint(test_merge.isna().sum())
test_merge.shape

In [37]:
# maup.doctor(test_merge)


In [398]:
# De-duplicate on the 2020 (left-side) precinct name and county code.
# `COUNTYFP` comes from the right-hand 2023 frame and is NaN for every
# precinct the left merge could not match; drop_duplicates treats NaNs as
# equal, so keying on it collapsed same-named unmatched precincts from
# different counties into one row. `COUNTYFP20` is populated for every row.
test_merge.drop_duplicates(subset=['NAME20', 'COUNTYFP20'], keep='first', inplace=True)
test_merge.sort_values(by=['COUNTYFP20', 'UNIQUE_ID'], inplace=True)
test_merge.shape

In [399]:
# Write the merged table to Excel. NOTE: despite the file name, the merge
# that produced `test_merge` uses how='left', not an outer join.
test_merge.to_excel('merge_outer.xlsx')


In [403]:
# Read the exported workbook back in and display it.
testing = pd.read_excel('merge_outer.xlsx')
testing

# Exploratory Data Analysis

- [ ] Precinct Neighbors
- [ ] MGGG Algorithm
- [ ] Calculate Election Winners
- [ ] Identify Opportunity Districts
- [ ] **Ecological Inference**


In [15]:
# Load the combined precinct election table from Excel.
ms_election_precinct_df = pd.read_excel('MS Data/ms_election_precinct_data.xlsx')
# Parse the 'geometry' column from WKT text into geometry objects. The result
# is still a plain pandas DataFrame; the GeoDataFrame construction below is
# commented out.
ms_election_precinct_df['geometry'] = gpd.GeoSeries.from_wkt(ms_election_precinct_df['geometry'])


# ms_election_precinct_data = gpd.GeoDataFrame(ms_election_precinct_df, geometry=ms_election_precinct_df.geometry, crs='EPSG:4326')
# ms_election_precinct_data