In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
import requests
from io import BytesIO
import pygris
from pygris.utils import erase_water
import requests

Process the raw excel file

In [2]:
# Read the excel sheet and skip blank rows
raw_excel = pd.read_excel("data/DSHA LIHTC List_MAPPING.xlsx", engine='openpyxl', skiprows=[2,3], skipfooter=4, dtype=str)
# Add additional column information from the first row
raw_excel.columns = (raw_excel.columns.astype(str) + " " + raw_excel.head(1).fillna("").astype(str)).iloc[0].str.strip().values
raw_excel.rename(columns={"ALLOCATION .1 DATE": "ALLOCATION DATE", "ALLOCATION  AMOUNT": "ALLOCATION AMOUNT", "Type of Property*": "Type of Property"}, inplace=True)
raw_excel.drop(0, inplace=True)

In [3]:
# Find and label the rows about tax year
raw_excel["is tax"] = raw_excel["PROJECT NAME & ADDRESS"].str.contains("TAX CREDIT ALLOCATIONS")

# Function that maps the boolean column "is tax", which is true when a row contains tax year information, to an integer equal to the tax year
def assign_to_year(x, i):
    # if the row is a tax year, increment i
    if x:
        i[0] = i[0] + 1
    # return an integer equal to the tax year
    return i[0] + 2016

# add a column for the tax year
index = [-1]
raw_excel["Tax Allocation Year"] = raw_excel["is tax"].apply(assign_to_year, args=[index])

# drop rows of tax year information and reformat
raw_excel = raw_excel.loc[~raw_excel["is tax"]].drop(columns="is tax").reset_index(drop=True)


In [4]:
# Add a column that labels the primary line for an entry
raw_excel["primary"] = ~raw_excel["County"].isna()

# Function that maps each the boolean column "primary", which is true when a row contains the primary info from the dataset, to an integer that functions as an index for primary entries
def assign_to_year(x, i):
    # if the row is primary, increment i
    if x:
        i[0] = i[0] + 1
    # return an index for the primary entries
    return i[0]

# add an index column for the primary entries
index = [-1]
raw_excel["primary"] = raw_excel["primary"].apply(assign_to_year, args=[index])

In [5]:
# Fix 'ALLOCATION AMOUNT', 'ALLOCATION DATE' swap
flipped_years = [2018, 2019, 2020, 2021, 2022]
tmp = raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years)]['ALLOCATION AMOUNT'].copy()
tmp2 = raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years)]['ALLOCATION DATE'].copy()
raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years), 'ALLOCATION AMOUNT'] = tmp2.values
raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years), 'ALLOCATION DATE'] = tmp.values


In [7]:
# Create separate dataframes for each row in a data entry
grouped_data = raw_excel.groupby("primary")
raw_data1 = grouped_data.nth(0)
raw_data2 = grouped_data.nth(1).drop(columns="Tax Allocation Year")
raw_data3 = grouped_data.nth(2).drop(columns="Tax Allocation Year")
raw_data4 = grouped_data.nth(3).drop(columns="Tax Allocation Year")
raw_data5 = grouped_data.nth(4).drop(columns="Tax Allocation Year")

# Modify the column names for each dataframe to prepare for joining
raw_data2.columns = raw_data2.columns + " 2"
raw_data3.columns = raw_data3.columns + " 3"
raw_data4.columns = raw_data4.columns + " 4"
raw_data5.columns = raw_data5.columns + " 5"

# Join the dataframes by index and remove unused columns
flattened_data = raw_data1.join(raw_data2, how="left").join(raw_data3, how="left").join(raw_data4, how="left").join(raw_data5, how="left").dropna(axis=1, how='all').reset_index(drop=True)


In [8]:
# Convert dates back to their orginial format
flattened_data["Placed in Service Date"] = pd.to_datetime(flattened_data["Placed in Service Date"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["ALLOCATION DATE"] = pd.to_datetime(flattened_data["ALLOCATION DATE"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Tax Credit Compliance Date"] = pd.to_datetime(flattened_data["Tax Credit Compliance Date"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Extended Use Period"] = pd.to_datetime(flattened_data["Extended Use Period"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Placed in Service Date 2"] = pd.to_datetime(flattened_data["Placed in Service Date 2"], errors='coerce').dt.strftime('%m/%d/%Y')



In [None]:
flattened_data


In [9]:
# Combine address fields
address_columns = ["PROJECT NAME & ADDRESS", "PROJECT NAME & ADDRESS 2", "PROJECT NAME & ADDRESS 3", "PROJECT NAME & ADDRESS 4", "PROJECT NAME & ADDRESS 5"]

# Extracts addresses from projects
def extract_address(x):
    # Project 27 has three full addresses, so we use the last one listed
    if x.name == 27:
        addr = x[address_columns].dropna().values[-1]
        return addr
    # The last two lines of the address field contain the address split between two lines, except for project 27
    else:
        addr = x[address_columns].dropna().values[-2:]
        return addr[0] + ", " + addr[1]

# Extract an address for each project
flattened_data["address"] = flattened_data.apply(extract_address, axis=1)

In [None]:
flattened_data

In [10]:
# Print the data to a csv
flattened_data.to_csv("data/processed_data.csv", index=False)

In [11]:
# Print the addresses to a seperate list
flattened_data["address"].to_csv("data/DSHA_addresses.csv", index=False)

At this point we transfer the address csv over to the geocoder to get the latitude and longitude of each project

In [12]:
# Read the file of geolocated addresses
geolocations = pd.read_csv("data/counts_per_tract.csv").drop_duplicates("input_address")
# Join the geolocations to the flattened dataframe

#ignore case and commas
geolocations["address_lower"] = geolocations["input_address"].str.lower().str.replace(",","")
flattened_data["address_lower"] = flattened_data["address"].str.lower().str.replace(",", "")
geolocated_data = flattened_data.merge(geolocations, on="address_lower", how="left")
# Remove lat,lot from unsuccessfully (not in the u.s.) geolocated address
geolocated_data.loc[geolocated_data["census_tract"] == "Unable To Geocode The Address", "lon"] = np.nan
geolocated_data.loc[geolocated_data["census_tract"] == "Unable To Geocode The Address", "lat"] = np.nan

In [13]:
# Convert lat,lot to Shapely points
geolocated_data = gpd.GeoDataFrame(geolocated_data, geometry=gpd.points_from_xy(geolocated_data['lon'], geolocated_data['lat'], crs="EPSG:4326"))


In [None]:
# Visualize points on a map

# initialize the map and store it in a folium map object
us_map = folium.Map(location=[39.74503, -75.57203], zoom_start=14, tiles=None)

# Add background tiles
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(us_map)


# Add markers for each school
points=folium.features.GeoJson(
        geolocated_data.loc[geolocated_data["census_tract"] != "Unable To Geocode The Address"], # Full geopandas data
        control=False,
        marker = folium.CircleMarker(radius = 5, # Radius in metres
                           weight = 0, #outline weight
                           fill_color = '#d95f02', 
                           fill_opacity = 1)
        )

points.add_to(us_map)
us_map

In [None]:
# extract shape files for senate districts
senate_districts = gpd.read_file("data/2022Senate_Districts_Trimmed.geojson")

# Gets the senate district containing a point
def get_district(x):
    # Return a blank when an address could not be geolocated
    if x["census_tract"] == "Unable To Geocode The Address":
        return ""
    # Return the senate district containing the point otherwise
    else:
        return senate_districts.loc[x["geometry"].within(senate_districts["geometry"])]["SLDUST"].astype(int).astype(str).values[0]

# Add a column for senate district
geolocated_data["Senate District"] = geolocated_data.apply(get_district, axis=1)

In [16]:
# extract shape files for representative districts
rep_districts_2022 = pygris.state_legislative_districts(state="DE",house="lower", year=2022)
rep_districts_2022 = rep_districts_2022[["SLDLST","geometry"]].to_crs(4326)

rep_districts_2011 = pygris.state_legislative_districts(state="DE",house="lower", year=2011)
rep_districts_2011 = rep_districts_2011[["SLDLST","geometry"]].to_crs(4326)

Using FIPS code '10' for input 'DE'
Using FIPS code '10' for input 'DE'


In [17]:
#trim the representative district shapes to remove excess water
rep_districts_2022_trimmed = erase_water(rep_districts_2022)

rep_districts_2011_trimmed = erase_water(rep_districts_2011)

# rep_districts_2022_trimmed.explore()
# rep_districts_2011_trimmed.explore()

  return geopandas.overlay(


In [18]:
# Add a column for representative district

gjoin = gpd.sjoin(geolocated_data,rep_districts_2011_trimmed)[["PROJECT NAME & ADDRESS","ALLOCATION DATE","census_tract", "geometry","SLDLST"]]

geolocated_data = geolocated_data.merge(gjoin, on=["PROJECT NAME & ADDRESS","ALLOCATION DATE","census_tract","geometry"], how="left")


In [19]:
# rename SLDLST to Representative District

geolocated_data = geolocated_data.rename(columns={"SLDLST" : "Representative District"})

In [20]:
# Add funding source column
geolocated_data["Funding Source"] = "LIHTC"

In [21]:
geolocated_data.dtypes

PROJECT NAME & ADDRESS          object
ALLOCATION AMOUNT               object
ALLOCATION DATE                 object
Placed in Service Date          object
4% or 9% Allocation             object
Applicable Credit Rate          object
Status Active/Non               object
# of Tax Credit Units           object
Type of Property                object
County                          object
Tax Credit Compliance Date      object
Extended Use Period             object
Tax Allocation Year              int64
PROJECT NAME & ADDRESS 2        object
ALLOCATION AMOUNT 2             object
Placed in Service Date 2        object
Applicable Credit Rate 2        object
PROJECT NAME & ADDRESS 3        object
PROJECT NAME & ADDRESS 4        object
PROJECT NAME & ADDRESS 5        object
address                         object
address_lower                   object
input_address                   object
census_tract                    object
lon                             object
lat                      

In [22]:
# add Represenative names 
rep_names = pd.read_excel("data/Representative List.xlsx")
rep_names["District"] = rep_names["District"].astype("str") #convert to string type
geolocated_data["Representative District"] = geolocated_data["Representative District"].astype("str")
# remove leading 0s on district names
geolocated_data["Representative District"] = [d.lstrip("0") for d in geolocated_data["Representative District"]]
geolocated_data = geolocated_data.merge(rep_names, left_on="Representative District", right_on="District", how="left")

In [23]:
geolocated_data.columns

Index(['PROJECT NAME & ADDRESS', 'ALLOCATION AMOUNT', 'ALLOCATION DATE',
       'Placed in Service Date', '4% or 9% Allocation',
       'Applicable Credit Rate', 'Status Active/Non', '# of Tax Credit Units',
       'Type of Property', 'County', 'Tax Credit Compliance Date',
       'Extended Use Period', 'Tax Allocation Year',
       'PROJECT NAME & ADDRESS 2', 'ALLOCATION AMOUNT 2',
       'Placed in Service Date 2', 'Applicable Credit Rate 2',
       'PROJECT NAME & ADDRESS 3', 'PROJECT NAME & ADDRESS 4',
       'PROJECT NAME & ADDRESS 5', 'address', 'address_lower', 'input_address',
       'census_tract', 'lon', 'lat', 'acc', 'geometry',
       'Representative District', 'Funding Source', 'Name', 'District'],
      dtype='object')

In [25]:
# Print the dataset with senate districts attached to a csv
geolocated_data.drop(columns=["input_address", "address_lower","census_tract", "lon", "lat","District"]).to_file("data/DSHA_repdistricts2011.geojson", driver="GeoJSON")

In [26]:
geolocated_data.drop(columns=["input_address","address_lower", "census_tract", "lon", "lat","District"]).columns

Index(['PROJECT NAME & ADDRESS', 'ALLOCATION AMOUNT', 'ALLOCATION DATE',
       'Placed in Service Date', '4% or 9% Allocation',
       'Applicable Credit Rate', 'Status Active/Non', '# of Tax Credit Units',
       'Type of Property', 'County', 'Tax Credit Compliance Date',
       'Extended Use Period', 'Tax Allocation Year',
       'PROJECT NAME & ADDRESS 2', 'ALLOCATION AMOUNT 2',
       'Placed in Service Date 2', 'Applicable Credit Rate 2',
       'PROJECT NAME & ADDRESS 3', 'PROJECT NAME & ADDRESS 4',
       'PROJECT NAME & ADDRESS 5', 'address', 'acc', 'geometry',
       'Representative District', 'Funding Source', 'Name'],
      dtype='object')

In [37]:
# Count the number of Tax Credit Units in each district for each year and add them to the aggregated dataframe
geolocated_data['# of Tax Credit Units'] = geolocated_data['# of Tax Credit Units'].astype(int)
aggregated_data = geolocated_data.groupby(["Representative District", "Tax Allocation Year"]).sum()["# of Tax Credit Units"].reset_index()
aggregated_data.to_csv("data/Tax_Credit_Units_per_Representative_District.csv", index=False)





  aggregated_data = geolocated_data.groupby(["Representative District", "Tax Allocation Year"]).sum()["# of Tax Credit Units"].reset_index()


In [38]:
geolocated_data

Unnamed: 0,PROJECT NAME & ADDRESS,ALLOCATION AMOUNT,ALLOCATION DATE,Placed in Service Date,4% or 9% Allocation,Applicable Credit Rate,Status Active/Non,# of Tax Credit Units,Type of Property,County,...,input_address,census_tract,lon,lat,acc,geometry,Representative District,Funding Source,Name,District
0,PEARL CENTER,913337,12/20/2016,08/31/2018,0.09,9,Active,51,F,New Castle,...,"300 East 8th street Wilmington, DE 19801",29,-75.54647152099994,39.74226094900007,Census: Exact,POINT (-75.54647 39.74226),3.0,LIHTC,Sherry Dorsey Walker,3.0
1,JEFFERSON ESTATES II,518044,12/20/2016,01/23/2018,0.09,9,Active,32,F,Sussex,...,"825 Kings Highway, #101, Lewes, DE 19958",509.02,-75.14529768299997,38.76164546600006,Census: Exact,POINT (-75.14530 38.76165),37.0,LIHTC,Ruth Briggs King,37.0
2,VILLAGE OF IRON BRANCH (fka Halls Heritage),792137,12/20/2016,12/31/2018,0.09,9,Active,38,F,Sussex,...,"Halls Heritage Circle, Millsboro, DE 19966",506.02,-75.29852190919607,38.58629459541729,ArcGIS: 100,POINT (-75.29852 38.58629),41.0,LIHTC,Richard G. Collins,41.0
3,"THE FLATS, PHASE II MRB",503861,12/08/2016,06/28/2018,0.04,3.21,Active,72,F,New Castle,...,"Bancroft Parkway, Wilmington, DE 19805",13,-75.57186289073468,39.75607740008254,ArcGIS: 100,POINT (-75.57186 39.75608),4.0,LIHTC,Jeff Hilovsky,4.0
4,LIBERTY COURT MRB,620003,07/26/2017,10/13/2018,0.04,3.22,Active,100,F,Kent,...,"1289 Walker Road, Dover, DE 19901",407,-75.55637240499993,39.167185569000026,Census: Non_Exact,POINT (-75.55637 39.16719),31.0,LIHTC,Sean M. Lynn,31.0
5,H. FLETCHER BROWN MRB,565754,05/12/2017,12/31/2019,0.04,3.24,Active,35,S,New Caslte,...,"1010 N. Broom street Wilmington, DE 19806",15,-75.56218107799998,39.75159206100005,Census: Exact,POINT (-75.56218 39.75159),4.0,LIHTC,Jeff Hilovsky,4.0
6,CHELTEN APARTMENTS MRB,497801,08/25/2017,10/31/2018,0.04,3.22,Active,120,S,New Castle,...,"431 Old Forge Road, New Castle, DE 19720",149.08,-75.62710744399999,39.652231312000026,Census: Exact,POINT (-75.62711 39.65223),5.0,LIHTC,Kendra Johnson,5.0
7,VILLAGE OF ST. JOHN (Forward Reservation),778622,12/06/2017,08/26/2019,0.09,9,Carryover,53,S,New Castle,...,"2019 North Market street Wilmington, DE 19802",5,-75.54174052899998,39.751854105000064,Census: Exact,POINT (-75.54174 39.75185),1.0,LIHTC,Nnamdi O. Chukwuocha,1.0
8,OUR LADY OF GRACE (Forward Reservation),774946,12/11/2017,12/20/2019,0.09,9,Carryover,60,F,New Castle,...,"2000 Mary Anagela Way, Newark, DE 19712",147.05,-75.71618273833816,39.6656199289158,ArcGIS: 96.58,POINT (-75.71618 39.66562),24.0,LIHTC,Edward S. Osienski,24.0
9,BRANDYWINE STATION,1044577,12/11/2017,12/06/2018,0.09,9,Carryover,56,F,Sussex,...,"Case Lane and Edwards boulevard Millsboro, DE ...",506.02,-75.29199711143387,38.586581654826816,ArcGIS: 100,POINT (-75.29200 38.58658),41.0,LIHTC,Richard G. Collins,41.0


In [39]:
# Count the Allocation Amount in each district for each year and add them to the aggregated dataframe
data_noTBD = geolocated_data.loc[geolocated_data['ALLOCATION AMOUNT'] != "TBD"]
data_noTBD["ALLOCATION AMOUNT"] = data_noTBD['ALLOCATION AMOUNT'].astype(int)
aggregated_data = aggregated_data.merge(data_noTBD.groupby(["Representative District", "Tax Allocation Year"]).sum()["ALLOCATION AMOUNT"].reset_index(), how="outer", on=["Representative District", "Tax Allocation Year"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  aggregated_data = aggregated_data.merge(data_noTBD.groupby(["Representative District", "Tax Allocation Year"]).sum()["ALLOCATION AMOUNT"].reset_index(), how="outer", on=["Representative District", "Tax Allocation Year"])


In [40]:
aggregated_data

Unnamed: 0,Representative District,Tax Allocation Year,# of Tax Credit Units,ALLOCATION AMOUNT
0,1.0,2017,53,778622.0
1,1.0,2022,111,1000000.0
2,2.0,2019,59,1056240.0
3,2.0,2020,50,621046.0
4,2.0,2021,50,1000000.0
5,24.0,2017,60,774946.0
6,28.0,2021,54,
7,3.0,2016,51,913337.0
8,3.0,2018,77,1122778.0
9,3.0,2019,128,2112482.0


In [41]:
# get the population per district for each year 
population = pd.read_csv("data/rep_district_population.csv")
population["District"] = population["District"].astype("str")


In [42]:
aggregated_data.dtypes

Representative District     object
Tax Allocation Year          int64
# of Tax Credit Units        int64
ALLOCATION AMOUNT          float64
dtype: object

In [None]:
# Add derived statistics, population data, senator name, and funding source to the aggregated dataframe
aggregated_data = aggregated_data.merge(gpd.read_file("data/aggregated_senate_new.json")[["name", "district", "adj_popula"]], how="outer", left_on="Senate District", right_on="district").drop(columns="Senate District")
# Filter out "S." from the beginning of senator names
# aggregated_data["name"] = aggregated_data["name"].str.removeprefix("S. ")
# Add funding source
aggregated_data["Funding Source"] = "LIHTC"

# Calculate derived statistics
aggregated_data["Average Allocation per 100 Persons"] = aggregated_data["ALLOCATION AMOUNT"] * 100 / aggregated_data["adj_popula"].astype(float)
aggregated_data["Average Population per Tax Credit Unit"] = aggregated_data["adj_popula"].astype(float) / aggregated_data["# of Tax Credit Units"]
aggregated_data["Average Allocation per Tax Credit Unit"] = aggregated_data["ALLOCATION AMOUNT"].astype(float) / aggregated_data["# of Tax Credit Units"]

# Convert adjusted population to int. Assume anywhere with missing population numbers has a population of 0
aggregated_data["adj_popula"] = aggregated_data["adj_popula"].fillna(0).astype(int)

aggregated_data

In [43]:
# add population data 
aggregated_data = aggregated_data.merge(population, how="left", left_on=["Representative District","Tax Allocation Year"], right_on=["District","year"])
aggregated_data["Funding Source"] = "LIHTC"

# calculations
aggregated_data["Average Allocation per 100 Persons"] = (aggregated_data["ALLOCATION AMOUNT"] / aggregated_data["population"]) *100
aggregated_data["Average Population per Tax Credit Unit"] = aggregated_data["population"] / aggregated_data["# of Tax Credit Units"]
aggregated_data["Average Allocation per Tax Credit Unit"] = aggregated_data["ALLOCATION AMOUNT"] / aggregated_data["# of Tax Credit Units"]

# add Representative names
aggregated_data = aggregated_data.merge(rep_names, left_on = "Representative District", right_on= "District",how="left")

In [44]:
aggregated_data

Unnamed: 0,Representative District,Tax Allocation Year,# of Tax Credit Units,ALLOCATION AMOUNT,year,District_x,population,Funding Source,Average Allocation per 100 Persons,Average Population per Tax Credit Unit,Average Allocation per Tax Credit Unit,Name,District_y
0,1.0,2017,53,778622.0,2017.0,1.0,22303.0,LIHTC,3491.108819,420.811321,14690.981132,Nnamdi O. Chukwuocha,1.0
1,1.0,2022,111,1000000.0,2022.0,1.0,22262.0,LIHTC,4491.959393,200.558559,9009.009009,Nnamdi O. Chukwuocha,1.0
2,2.0,2019,59,1056240.0,2019.0,2.0,21794.0,LIHTC,4846.471506,369.389831,17902.372881,Stephanie T. Bolden,2.0
3,2.0,2020,50,621046.0,2020.0,2.0,21782.0,LIHTC,2851.189055,435.64,12420.92,Stephanie T. Bolden,2.0
4,2.0,2021,50,1000000.0,2021.0,2.0,22198.0,LIHTC,4504.910352,443.96,20000.0,Stephanie T. Bolden,2.0
5,24.0,2017,60,774946.0,2017.0,24.0,23230.0,LIHTC,3335.970728,387.166667,12915.766667,Edward S. Osienski,24.0
6,28.0,2021,54,,2021.0,28.0,23386.0,LIHTC,,433.074074,,William J. Carson,28.0
7,3.0,2016,51,913337.0,2016.0,3.0,21271.0,LIHTC,4293.813173,417.078431,17908.568627,Sherry Dorsey Walker,3.0
8,3.0,2018,77,1122778.0,2018.0,3.0,19408.0,LIHTC,5785.129843,252.051948,14581.532468,Sherry Dorsey Walker,3.0
9,3.0,2019,128,2112482.0,2019.0,3.0,19015.0,LIHTC,11109.555614,148.554688,16503.765625,Sherry Dorsey Walker,3.0


In [45]:
# Melt the wide form data into long form data, grouping by district, representative name, funding source, and year
long_data = pd.melt(aggregated_data, id_vars=["Representative District", "Name", "Funding Source", "Tax Allocation Year"], value_vars=["# of Tax Credit Units", "ALLOCATION AMOUNT", "population", "Average Allocation per 100 Persons", "Average Population per Tax Credit Unit", "Average Allocation per Tax Credit Unit"])

# Calculate the average of each variable across all districts and years
averages = long_data.loc[~long_data["Representative District"].isna()].groupby("variable").mean(numeric_only=True).reset_index()
averages["Funding Source"] = "LIHTC"
averages["Name"] = np.nan
averages["Representative District"] = "District Average"
averages["Tax Allocation Year"] = "All Time"

# Calculate the average of each variable across all districts in each year
yearly_averages = long_data.loc[~long_data["Representative District"].isna()].groupby(["variable", "Tax Allocation Year"]).mean(numeric_only=True).reset_index()
yearly_averages["Funding Source"] = "LIHTC"
yearly_averages["Name"] = np.nan
yearly_averages["Representative District"] = "District Average"

# Add the averages to the long form data
long_data = pd.concat([long_data,averages, yearly_averages])
# Fill missing values with 0 for processing
long_data.fillna(0).to_csv("data/long_tax_data_reps.csv", index=False)

In [46]:
long_data

Unnamed: 0,Representative District,Name,Funding Source,Tax Allocation Year,variable,value
0,1,Nnamdi O. Chukwuocha,LIHTC,2017,# of Tax Credit Units,53.00
1,1,Nnamdi O. Chukwuocha,LIHTC,2022,# of Tax Credit Units,111.00
2,2,Stephanie T. Bolden,LIHTC,2019,# of Tax Credit Units,59.00
3,2,Stephanie T. Bolden,LIHTC,2020,# of Tax Credit Units,50.00
4,2,Stephanie T. Bolden,LIHTC,2021,# of Tax Credit Units,50.00
...,...,...,...,...,...,...
37,District Average,,LIHTC,2018,population,21884.25
38,District Average,,LIHTC,2019,population,21664.75
39,District Average,,LIHTC,2020,population,21716.80
40,District Average,,LIHTC,2021,population,22467.20


In [None]:
# Read trimmed senate distrcit shapes
trim = gpd.read_file("data/2022Senate_Districts_Trimmed.geojson", driver="GeoJSON")
trim["district"] = trim["SLDUST"].astype(int)
trim = trim[["district", "geometry"]]

In [47]:
# trimmed representative districts 

#convert to Int64 to remove 0s and back to string
rep_districts_2011_trimmed["SLDLST"] = rep_districts_2011_trimmed["SLDLST"].astype("Int64").astype("str")
rep_districts_2022_trimmed["SLDLST"] = rep_districts_2022_trimmed["SLDLST"].astype("Int64").astype("str")


In [51]:
# Attach representative districts to wide form aggregated data with appropriate year 
aggregated_data2011 = aggregated_data[aggregated_data['Tax Allocation Year'] != 2022] 
aggregated_data2011 = rep_districts_2011_trimmed.merge(aggregated_data2011, right_on="Representative District", left_on="SLDLST", how="outer")

aggregated_data2022 = aggregated_data[aggregated_data['Tax Allocation Year'] == 2022] 
aggregated_data2022 = rep_districts_2022_trimmed.merge(aggregated_data2022, right_on="Representative District", left_on="SLDLST", how="outer")

In [58]:
aggregated_data2011 = aggregated_data2011.drop(columns=["District_x","District_y","Representative District"]).rename(columns={"SLDLST":"District"})
aggregated_data2022 = aggregated_data2022.drop(columns=["District_x","District_y","Representative District"]).rename(columns={"SLDLST":"District"})

In [61]:
# save as GeoJSON
gpd.GeoDataFrame(aggregated_data2011.loc[~aggregated_data2011["District"].isna()]).to_file("data/aggregated_with_geo2011.geojson", driver="GeoJSON")
gpd.GeoDataFrame(aggregated_data2022.loc[~aggregated_data2011["District"].isna()]).to_file("data/aggregated_with_geo2022.geojson", driver="GeoJSON")

In [None]:
# Attach senate districts to wide form aggregated data
aggregated_data["district"] = pd.to_numeric(aggregated_data["district"], errors="coerce")
aggregated_data = aggregated_data.merge(trim, on="district", how="outer")
# Print the wide form data for valid districts to a geojson
gpd.GeoDataFrame(aggregated_data.loc[~aggregated_data["district"].isna()]).to_file("data/aggregated_with_geo.geojson", driver="GeoJSON")
# Print the wide form data without geoometry to a csv
aggregated_data.drop(columns="geometry").to_csv("data/aggregated_data_with_na.csv", index=False)

Figure out how senate districts overlap with senate districts


NOTE: PyPDF2 needs to be added to the environment and imported to run this code

In [None]:
# Download census block groups
blocks = requests.get("https://www2.census.gov/geo/tiger/GENZ2022/shp/cb_2022_10_bg_500k.zip")
blocks = gpd.read_file(BytesIO(blocks.content))
blocks

In [None]:
lines = []

# creating a pdf file object
with open('data/CensusBlockBreakdownbySenateDistrict.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # printing number of pages in pdf file
    print(pdfReader.numPages)

    # Iterate over each page
    for pageNum in range(pdfReader.numPages):
        # creating a page object
        pageObj = pdfReader.getPage(pageNum)
        
        # extracting text from page
        pageText = pageObj.extractText()
        
        # extract lines from each page
        pageLines = pageText.split("\n")
        
        for line in pageLines:
            lines.append(line.split(' '))

# Pull the columns out and separate columns that were incorrectly joined
columns = lines[0]
columns[1] = columns[0][5:] + columns[1]
columns[0] = columns[0][:5]
columns.insert(2, columns[2][:6])
columns[3] = columns[3][6:]
data = [x for x in lines if x[0].isdigit()]
for row in data:
    row.insert(1, row[0][2:])
    row[0] = row[0][:2]
    row.insert(2, row[2][:5])
    row[3] = row[3][5:]
    
senate_census_map = pd.DataFrame(data, columns=columns)
senate_census_map

In [None]:
# Crosstabulate the connections between senate districts and census block groups
crosstab = pd.crosstab(senate_census_map["Proposed2022_SD"], senate_census_map["BlockGroup"])
# Get a list of census block groups that overlap with each senate district
senate_per_bg = crosstab.apply(lambda x : x[x != 0].index.values)
senate_per_bg.loc[senate_per_bg.apply(len) != 1]




In [None]:
senate_per_bg2 = crosstab.apply(lambda x : x[x != 0].values)
senate_per_bg2.loc[senate_per_bg2.apply(len) != 1]

In [None]:
scbg = pd.concat([senate_per_bg, senate_per_bg2], axis=1)
scbg.loc[(scbg.apply(lambda x : x.apply(len)) != 1).all(axis=1)].values

In [None]:
scbg.columns = ["Senate Districts", "Blocks per District"]
scbg.to_csv("data/Senate_to_Block_Groups.csv")