In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
import requests
from io import BytesIO

Process the raw excel file

In [2]:
# Read the excel sheet and skip blank rows
raw_excel = pd.read_excel("data/DSHA LIHTC List_MAPPING.xlsx", engine='openpyxl', skiprows=[2,3], skipfooter=4, dtype=str)
# Add additional column information from the first row
raw_excel.columns = (raw_excel.columns.astype(str) + " " + raw_excel.head(1).fillna("").astype(str)).iloc[0].str.strip().values
raw_excel.rename(columns={"ALLOCATION .1 DATE": "ALLOCATION DATE", "ALLOCATION  AMOUNT": "ALLOCATION AMOUNT", "Type of Property*": "Type of Property"}, inplace=True)
raw_excel.drop(0, inplace=True)

In [3]:
# Find and label the rows about tax year
raw_excel["is tax"] = raw_excel["PROJECT NAME & ADDRESS"].str.contains("TAX CREDIT ALLOCATIONS")

# Function that maps the boolean column "is tax", which is true when a row contains tax year information, to an integer equal to the tax year
def assign_to_year(x, i):
    # if the row is a tax year, increment i
    if x:
        i[0] = i[0] + 1
    # return an integer equal to the tax year
    return i[0] + 2016

# add a column for the tax year
index = [-1]
raw_excel["Tax Allocation Year"] = raw_excel["is tax"].apply(assign_to_year, args=[index])

# drop rows of tax year information and reformat
raw_excel = raw_excel.loc[~raw_excel["is tax"]].drop(columns="is tax").reset_index(drop=True)


In [4]:
# Add a column that labels the primary line for an entry
raw_excel["primary"] = ~raw_excel["County"].isna()

# Function that maps each the boolean column "primary", which is true when a row contains the primary info from the dataset, to an integer that functions as an index for primary entries
def assign_to_year(x, i):
    # if the row is primary, increment i
    if x:
        i[0] = i[0] + 1
    # return an index for the primary entries
    return i[0]

# add an index column for the primary entries
index = [-1]
raw_excel["primary"] = raw_excel["primary"].apply(assign_to_year, args=[index])

In [5]:
# Fix 'ALLOCATION AMOUNT', 'ALLOCATION DATE' swap
flipped_years = [2018, 2019, 2020, 2021, 2022]
tmp = raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years)]['ALLOCATION AMOUNT'].copy()
tmp2 = raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years)]['ALLOCATION DATE'].copy()
raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years), 'ALLOCATION AMOUNT'] = tmp2.values
raw_excel.loc[raw_excel["Tax Allocation Year"].isin(flipped_years), 'ALLOCATION DATE'] = tmp.values


In [6]:
# Create separate dataframes for each row in a data entry
grouped_data = raw_excel.groupby("primary")
raw_data1 = grouped_data.nth(0)
raw_data2 = grouped_data.nth(1).drop(columns="Tax Allocation Year")
raw_data3 = grouped_data.nth(2).drop(columns="Tax Allocation Year")
raw_data4 = grouped_data.nth(3).drop(columns="Tax Allocation Year")
raw_data5 = grouped_data.nth(4).drop(columns="Tax Allocation Year")

# Modify the column names for each dataframe to prepare for joining
raw_data2.columns = raw_data2.columns + " 2"
raw_data3.columns = raw_data3.columns + " 3"
raw_data4.columns = raw_data4.columns + " 4"
raw_data5.columns = raw_data5.columns + " 5"

# Join the dataframes by index and remove unused columns
flattened_data = raw_data1.join(raw_data2, how="left").join(raw_data3, how="left").join(raw_data4, how="left").join(raw_data5, how="left").dropna(axis=1, how='all').reset_index(drop=True)


In [7]:
# Convert dates back to their orginial format
flattened_data["Placed in Service Date"] = pd.to_datetime(flattened_data["Placed in Service Date"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["ALLOCATION DATE"] = pd.to_datetime(flattened_data["ALLOCATION DATE"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Tax Credit Compliance Date"] = pd.to_datetime(flattened_data["Tax Credit Compliance Date"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Extended Use Period"] = pd.to_datetime(flattened_data["Extended Use Period"], errors='coerce').dt.strftime('%m/%d/%Y')
flattened_data["Placed in Service Date 2"] = pd.to_datetime(flattened_data["Placed in Service Date 2"], errors='coerce').dt.strftime('%m/%d/%Y')



In [8]:
# Combine address fields
address_columns = ["PROJECT NAME & ADDRESS", "PROJECT NAME & ADDRESS 2", "PROJECT NAME & ADDRESS 3", "PROJECT NAME & ADDRESS 4", "PROJECT NAME & ADDRESS 5"]

# Extracts addresses from projects
def extract_address(x):
    # Project 27 has three full addresses, so we use the last one listed
    if x.name == 27:
        addr = x[address_columns].dropna().values[-1]
        return addr
    # The last two lines of the address field contain the address split between two lines, except for project 27
    else:
        addr = x[address_columns].dropna().values[-2:]
        return addr[0] + ", " + addr[1]

# Extract an address for each project
flattened_data["address"] = flattened_data.apply(extract_address, axis=1)

In [9]:
# Convert years to string
flattened_data["Tax Allocation Year"] = flattened_data["Tax Allocation Year"].astype(str)

# Print the data to a csv
flattened_data.to_csv("data/processed_data.csv", index=False)

In [10]:
# Print the addresses to a seperate list
flattened_data["address"].to_csv("data/DSHA_addresses.csv", index=False)

At this point we transfer the address csv over to the geocoder to get the latitude and longitude of each project

In [11]:
# Read the file of geolocated addresses
geolocations = pd.read_csv("data/counts_per_tract.csv").drop_duplicates("input addresses")
# Join the geolocations to the flattened dataframe
geolocated_data = flattened_data.merge(geolocations, left_on="address", right_on="input addresses", how="inner")
# Remove lat,lot from unsuccessfully (not in the u.s.) geolocated address
geolocated_data.loc[geolocated_data["census tract"] == "Unable To Geolocate The Address", "lot"] = np.nan
geolocated_data.loc[geolocated_data["census tract"] == "Unable To Geolocate The Address", "lat"] = np.nan

In [12]:
# Convert lat,lot to Shapely points
geolocated_data = gpd.GeoDataFrame(geolocated_data, geometry=gpd.points_from_xy(geolocated_data['lat'], geolocated_data['lot'], crs="EPSG:4326"))


In [None]:
# Visualize points on a map

# initialize the map and store it in a folium map object
us_map = folium.Map(location=[39.74503, -75.57203], zoom_start=14, tiles=None)

# Add background tiles
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(us_map)


# Add markers for each school
points=folium.features.GeoJson(
        geolocated_data.loc[geolocated_data["census tract"] != "Unable To Geolocate The Address"], # Full geopandas data
        control=False,
        marker = folium.CircleMarker(radius = 5, # Radius in metres
                           weight = 0, #outline weight
                           fill_color = '#d95f02', 
                           fill_opacity = 1)
        )

points.add_to(us_map)
us_map

In [13]:
# extract shape files for senate districts
senate_districts = gpd.read_file("data/2020Senate_Districts_Trimmed.geojson")

# Gets the senate district containing a point
def get_district(x):
    # Return a blank when an address could not be geolocated
    if x["census tract"] == "Unable To Geolocate The Address":
        return ""
    # Return the senate district containing the point otherwise
    else:
        return senate_districts.loc[x["geometry"].within(senate_districts["geometry"])]["SLDUST"].astype(int).astype(str).values[0]

# Add a column for senate district
geolocated_data["Senate District"] = geolocated_data.apply(get_district, axis=1)

In [14]:
# Add funding source column
geolocated_data["Funding Source"] = "LIHTC"

In [15]:
geolocated_data.columns

Index(['PROJECT NAME & ADDRESS', 'ALLOCATION AMOUNT', 'ALLOCATION DATE',
       'Placed in Service Date', '4% or 9% Allocation',
       'Applicable Credit Rate', 'Status Active/Non', '# of Tax Credit Units',
       'Type of Property', 'County', 'Tax Credit Compliance Date',
       'Extended Use Period', 'Tax Allocation Year',
       'PROJECT NAME & ADDRESS 2', 'ALLOCATION AMOUNT 2',
       'Placed in Service Date 2', 'Applicable Credit Rate 2',
       'PROJECT NAME & ADDRESS 3', 'PROJECT NAME & ADDRESS 4',
       'PROJECT NAME & ADDRESS 5', 'address', 'input addresses',
       'census tract', 'lot', 'lat', 'geometry', 'Senate District',
       'Funding Source'],
      dtype='object')

In [16]:
# Print the dataset with senate districts attached to a csv
geolocated_data.drop(columns=["input addresses", "census tract", "lot", "lat"]).to_file("data/DSHA_districted.geojson", driver="GeoJSON")

In [17]:
geolocated_data.drop(columns=["input addresses", "census tract", "lot", "lat"]).columns

Index(['PROJECT NAME & ADDRESS', 'ALLOCATION AMOUNT', 'ALLOCATION DATE',
       'Placed in Service Date', '4% or 9% Allocation',
       'Applicable Credit Rate', 'Status Active/Non', '# of Tax Credit Units',
       'Type of Property', 'County', 'Tax Credit Compliance Date',
       'Extended Use Period', 'Tax Allocation Year',
       'PROJECT NAME & ADDRESS 2', 'ALLOCATION AMOUNT 2',
       'Placed in Service Date 2', 'Applicable Credit Rate 2',
       'PROJECT NAME & ADDRESS 3', 'PROJECT NAME & ADDRESS 4',
       'PROJECT NAME & ADDRESS 5', 'address', 'geometry', 'Senate District',
       'Funding Source'],
      dtype='object')

In [18]:
# Count the number of Tax Credit Units in each district for each year and add them to the aggregated dataframe
geolocated_data['# of Tax Credit Units'] = geolocated_data['# of Tax Credit Units'].astype(int)
aggregated_data = geolocated_data.groupby(["Senate District", "Tax Allocation Year"]).sum()["# of Tax Credit Units"].reset_index()
aggregated_data.to_csv("data/Tax_Credit_Units_per_Senate_District.csv", index=False)





  aggregated_data = geolocated_data.groupby(["Senate District", "Tax Allocation Year"]).sum()["# of Tax Credit Units"].reset_index()


In [19]:
# Count the Allocation Amount in each district for each year and add them to the aggregated dataframe
data_noTBD = geolocated_data.loc[geolocated_data['ALLOCATION AMOUNT'] != "TBD"]
data_noTBD["ALLOCATION AMOUNT"] = data_noTBD['ALLOCATION AMOUNT'].astype(int)
aggregated_data = aggregated_data.merge(data_noTBD.groupby(["Senate District", "Tax Allocation Year"]).sum()["ALLOCATION AMOUNT"].reset_index(), how="outer", on=["Senate District", "Tax Allocation Year"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  aggregated_data = aggregated_data.merge(data_noTBD.groupby(["Senate District", "Tax Allocation Year"]).sum()["ALLOCATION AMOUNT"].reset_index(), how="outer", on=["Senate District", "Tax Allocation Year"])


In [20]:
# Add derived statistics, population data, senator name, and funding source to the aggregated dataframe
aggregated_data = aggregated_data.merge(gpd.read_file("data/aggregated_senate_new.json")[["name", "district", "adj_popula"]], how="outer", left_on="Senate District", right_on="district").drop(columns="Senate District")
# Filter out "S." from the beginning of senator names
aggregated_data["name"] = aggregated_data["name"].str.removeprefix("S. ")
# Add funding source
aggregated_data["Funding Source"] = "LIHTC"

# Calculate derived statistics
aggregated_data["Average Allocation per 100 Persons"] = aggregated_data["ALLOCATION AMOUNT"] * 100 / aggregated_data["adj_popula"].astype(float)
aggregated_data["Average Population per Tax Credit Unit"] = aggregated_data["adj_popula"].astype(float) / aggregated_data["# of Tax Credit Units"]
aggregated_data["Average Allocation per Tax Credit Unit"] = aggregated_data["ALLOCATION AMOUNT"].astype(float) / aggregated_data["# of Tax Credit Units"]

# Convert adjusted population to int. Assume anywhere with missing population numbers has a population of 0
aggregated_data["adj_popula"] = aggregated_data["adj_popula"].fillna(0).astype(int)

aggregated_data

Unnamed: 0,Tax Allocation Year,# of Tax Credit Units,ALLOCATION AMOUNT,name,district,adj_popula,Funding Source,Average Allocation per 100 Persons,Average Population per Tax Credit Unit,Average Allocation per Tax Credit Unit
0,2018.0,70.0,1247683.0,,,0,LIHTC,,,17824.042857
1,2016.0,72.0,503861.0,Sarah Mcbride,1.0,44856,LIHTC,1123.285625,623.0,6998.069444
2,2017.0,128.0,1841225.0,Sarah Mcbride,1.0,44856,LIHTC,4104.746299,350.4375,14384.570312
3,2018.0,50.0,881051.0,Sarah Mcbride,1.0,44856,LIHTC,1964.176476,897.12,17621.02
4,2019.0,51.0,8135.0,Sarah Mcbride,1.0,44856,LIHTC,18.135812,879.529412,159.509804
5,2022.0,60.0,,Sarah Mcbride,1.0,44856,LIHTC,,747.6,
6,2020.0,106.0,656670.0,Stephanie L. Hansen,10.0,47281,LIHTC,1388.866564,446.04717,6195.0
7,2017.0,60.0,774946.0,Bryan Townsend,11.0,48203,LIHTC,1607.671722,803.383333,12915.766667
8,2017.0,120.0,497801.0,Marie Pinkney,13.0,48294,LIHTC,1030.771939,402.45,4148.341667
9,2021.0,54.0,,Kyra Hoffner,14.0,49253,LIHTC,,912.092593,


In [21]:
# Melt the wide form data into long form data, grouping by district, senator name, funding source, and year
long_data = pd.melt(aggregated_data, id_vars=["district", "name", "Funding Source", "Tax Allocation Year"], value_vars=["adj_popula", "# of Tax Credit Units", "ALLOCATION AMOUNT", "Average Allocation per 100 Persons", "Average Population per Tax Credit Unit", "Average Allocation per Tax Credit Unit"])

# Remove nan district from long form data
long_data = long_data.loc[~long_data["district"].isna()]

# Calculate the averages of each variable across all districts in each year
yearly_averages = long_data.loc[~long_data["Tax Allocation Year"].isna()].groupby(["variable", "Tax Allocation Year"]).mean(numeric_only=True).reset_index()
yearly_averages["Funding Source"] = "LIHTC"
yearly_averages["name"] = np.nan
yearly_averages["district"] = "District Average"

# Add the averages to the long form data
long_data = pd.concat([long_data, yearly_averages])
# Fill missing values with 0 for processing
long_data.fillna(0).to_csv("data/long_tax_data.csv", index=False)

In [22]:
# Read trimmed senate distrcit shapes
trim = gpd.read_file("data/2020Senate_Districts_Trimmed.geojson", driver="GeoJSON")
trim["district"] = trim["SLDUST"].astype(int)
trim = trim[["district", "geometry"]]

# Add derived statistics, population data, senator name, and funding source to the trimmed shapefiles
extras = gpd.read_file("data/aggregated_senate_new.json")[["name", "district", "adj_popula"]]
extras["district"] = extras["district"].astype(int)

trim = trim.merge(extras, how="left", on="district")
# Filter out "S." from the beginning of senator names
trim["name"] = trim["name"].str.removeprefix("S. ")
# Add funding source
trim["Funding Source"] = "LIHTC"


# Duplicate each district across all years
years = pd.Series(aggregated_data["Tax Allocation Year"].unique(), name="Tax Allocation Year")
trim = trim.merge(years, how="cross")



In [23]:
# Drop district information from aggregated data
aggregated_data = aggregated_data.drop(columns=["adj_popula", "Funding Source", "name"])

# Attach senate districts to wide form aggregated data
aggregated_data["district"] = pd.to_numeric(aggregated_data["district"], errors="coerce")
aggregated_data = aggregated_data.merge(trim, on=["district", "Tax Allocation Year"], how="right")

In [None]:
# Compute the sum of all allocations in a district
sums = aggregated_data.groupby("district").sum(numeric_only=True).reset_index()
# Add name, adjusted population, funding source, and geometry back in
sums = aggregated_data[["district", "name", "adj_popula", "Funding Source", "geometry"]].drop_duplicates("district").merge(sums, on="district", how="right")
# Label tax allocation year as sum over time
sums["Tax Allocation Year"] = "All Time"
# Add sums over time to aggregated data
aggregated_data = pd.concat([aggregated_data, sums])

In [None]:
# Print the wide form data for valid districts to a geojson
gpd.GeoDataFrame(aggregated_data.loc[~aggregated_data["district"].isna()]).to_file("data/aggregated_with_geo.geojson", driver="GeoJSON")
# Print the wide form data without geoometry to a csv
aggregated_data.drop(columns="geometry").to_csv("data/aggregated_data_with_na.csv", index=False)

In [None]:
aggregated_data


Figure out how senate districts overlap with senate districts


NOTE: PyPDF2 needs to be added to the environment and imported to run this code

In [None]:
# Download census block groups
blocks = requests.get("https://www2.census.gov/geo/tiger/GENZ2022/shp/cb_2022_10_bg_500k.zip")
blocks = gpd.read_file(BytesIO(blocks.content))
blocks

In [None]:
lines = []

# creating a pdf file object
with open('data/CensusBlockBreakdownbySenateDistrict.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # printing number of pages in pdf file
    print(pdfReader.numPages)

    # Iterate over each page
    for pageNum in range(pdfReader.numPages):
        # creating a page object
        pageObj = pdfReader.getPage(pageNum)
        
        # extracting text from page
        pageText = pageObj.extractText()
        
        # extract lines from each page
        pageLines = pageText.split("\n")
        
        for line in pageLines:
            lines.append(line.split(' '))

# Pull the columns out and separate columns that were incorrectly joined
columns = lines[0]
columns[1] = columns[0][5:] + columns[1]
columns[0] = columns[0][:5]
columns.insert(2, columns[2][:6])
columns[3] = columns[3][6:]
data = [x for x in lines if x[0].isdigit()]
for row in data:
    row.insert(1, row[0][2:])
    row[0] = row[0][:2]
    row.insert(2, row[2][:5])
    row[3] = row[3][5:]
    
senate_census_map = pd.DataFrame(data, columns=columns)
senate_census_map

In [None]:
# Crosstabulate the connections between senate districts and census block groups
crosstab = pd.crosstab(senate_census_map["Proposed2022_SD"], senate_census_map["BlockGroup"])
# Get a list of census block groups that overlap with each senate district
senate_per_bg = crosstab.apply(lambda x : x[x != 0].index.values)
senate_per_bg.loc[senate_per_bg.apply(len) != 1]




In [None]:
senate_per_bg2 = crosstab.apply(lambda x : x[x != 0].values)
senate_per_bg2.loc[senate_per_bg2.apply(len) != 1]

In [None]:
scbg = pd.concat([senate_per_bg, senate_per_bg2], axis=1)
scbg.loc[(scbg.apply(lambda x : x.apply(len)) != 1).all(axis=1)].values

In [None]:
scbg.columns = ["Senate Districts", "Blocks per District"]
scbg.to_csv("data/Senate_to_Block_Groups.csv")