Skip to content
This repository has been archived by the owner on Feb 23, 2024. It is now read-only.

Commit

Permalink
Make WIC Participation Data Available to Frontend (#44)
Browse files Browse the repository at this point in the history
* Start changing the wic participation data into using multiple dataframes

* Clean up some code that isn't needed

* -Write four CSV files instead of one confusing one.
-Pass the participation data back in so it can be merged in with the census data

* Cover the new WIC code in tests

* Fix flake8 errors

* Fix missing returns

* Set Up Test for Merging WIC Participation Data into Census Data

Changed the WIC participation dataframes to actually have the fips column as the index. This changed the json format of a lot of the fixtures.

Add new test to test merging. This test currently fails because the code isn't implemented yet

* Get merging tests passing

* Change the data type of the wic participation numbers to ints

* Get the main path working

* Fix flake8 and mypy errors

I'm silencing a mypy error on appending lists of different types:
error: Unsupported operand types for + ("List[str]" and "List[int]")
It looks like the fixes to correct the overzealous type error wouldn't be worth implementing: python/mypy#720

* add main test (#45)

* added test_run

* Set fail-fast to false, which allows OS- and version-specific errors to be identified

* replaced .loc[] with reindex(), due to key error

* Fix Error in Test test_run()

ValueError: invalid literal for int() with base 10: '1,044'

* Fix flake8 error

Co-authored-by: michaelpkuhn <mickuhn95@gmail.com>
  • Loading branch information
brickman1444 and michaelpkuhn committed Apr 29, 2021
1 parent c019cb3 commit df6a25b
Show file tree
Hide file tree
Showing 17 changed files with 411 additions and 75 deletions.
1 change: 1 addition & 0 deletions .github/workflows/run_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ jobs:
run-tests:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ['3.8.5', '3.9']
os: [ubuntu-latest, windows-latest, macos-latest]
Expand Down
2 changes: 1 addition & 1 deletion src/census_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def majority(series):
# prevents conflicts with new columns
# race_values = tuple(cls.data_metrics['race'].values())
race_values = cls.get_data_values('race')
race_df = geo_df.loc[:, race_values]
race_df = geo_df.reindex(race_values, axis="columns")

# divides df by race_total column to calculate percentages
race_percent_df, pct_dict_series = nest_percentages(race_df, 'race_total') # noqa: E501
Expand Down
19 changes: 11 additions & 8 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,32 @@ def main(geo_ls=["zip", "county"], verbose: bool = False) -> None:
'''
mph.setup_memory_usage_file_if_enabled()

print("Reading WIC Data")
print("Reading Census Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
src.wic.read_wic_data()
src.census_response.download_census_data()
if (verbose):
duration = time.time() - start_time
print("Reading WIC Data took: {0:.2f} seconds".format(duration))
print("Reading Census Data took: {0:.2f} seconds".format(duration)) # noqa: E501

print("Reading Census Data")
print("Reading WIC Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
src.census_response.download_census_data()
wic_participation = src.wic.read_wic_data()
src.wic.merge_wic_data_file(wic_participation,
'final_jsons/df_merged_json.json',
'final_jsons/df_merged_with_wic.json')
if (verbose):
duration = time.time() - start_time
print("Reading Census Data took: {0:.2f} seconds".format(duration)) # noqa: E501
print("Reading WIC Data took: {0:.2f} seconds".format(duration))

print("Reading Food Insecurity Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
file_to_json('data_folder', 'final_jsons', blacklist=['Key'])
merge_ins_data('final_jsons/Countyfood_insecurity_rates_12.15.2020.json',
'final_jsons/df_merged_json.json',
'final_jsons/df_merged_with_insecurity.json')
'final_jsons/df_merged_with_wic.json',
'final_jsons/df_merged_with_wic_and_insecurity.json')
if (verbose):
duration = time.time() - start_time
print("Reading Food Insecurity Data took: {0:.2f} seconds".format(duration)) # noqa: E501
Expand Down
179 changes: 128 additions & 51 deletions src/wic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
import pdfplumber
import pandas as pd
import json
from typing import List, Dict, Any


'''
Expand All @@ -21,39 +23,102 @@ def is_up_to_date(input_file_path: str, output_file_path: str) -> bool:
return False


def read_wic_data(always_run: bool = False) -> None:
class WICParticipation:
    """Bundle of the four WIC participation DataFrames, one per category.

    Each frame is indexed by county fips code (see dataframe_from_rows).
    """

    def __init__(self,
                 women: pd.DataFrame,
                 infants: pd.DataFrame,
                 children: pd.DataFrame,
                 total: pd.DataFrame):
        # One participation frame per participant category.
        self.women = women
        self.infants = infants
        self.children = children
        # Per-county totals across all categories.
        self.total = total

input_file_path = "data_folder/illinois_wic_data_january_2021.pdf"
output_file_path = "final_jsons/wic.csv"

if always_run or not is_up_to_date(input_file_path, output_file_path):
def read_wic_data(always_run: bool = False) -> WICParticipation:
    """Load WIC participation data, parsing the source PDF only when needed.

    When the four per-category CSV caches are newer than the source PDF (and
    always_run is False), they are read back instead of re-parsing the PDF.
    Freshly parsed data is written out to the CSVs before being returned.

    :param always_run: force re-parsing the PDF even if the caches look fresh
    :return: the parsed or cached WICParticipation data
    """
    input_file_path = "data_folder/illinois_wic_data_january_2021.pdf"
    women_output_csv_path = "final_jsons/wic_participation_women.csv"
    infants_output_csv_path = "final_jsons/wic_participation_infants.csv"
    children_output_csv_path = "final_jsons/wic_participation_children.csv"
    total_output_csv_path = "final_jsons/wic_participation_total.csv"

    if always_run or \
       not is_up_to_date(input_file_path, women_output_csv_path) or \
       not is_up_to_date(input_file_path, infants_output_csv_path) or \
       not is_up_to_date(input_file_path, children_output_csv_path) or \
       not is_up_to_date(input_file_path, total_output_csv_path):
        # PDF has 97 pages. Skip page 0 because it shows Statewide totals
        # which we don't need
        participation: WICParticipation = parse_wic_pdf(
            input_file_path,
            1,
            96)
        # Keep the fips index in the CSVs: read_csv() reloads these files
        # with index_col="fips", so dropping the index (index=False) would
        # break the cached-read path below.
        participation.women.to_csv(women_output_csv_path)
        participation.children.to_csv(children_output_csv_path)
        participation.infants.to_csv(infants_output_csv_path)
        participation.total.to_csv(total_output_csv_path)
        return participation
    else:
        return WICParticipation(
            women=read_csv(women_output_csv_path),
            infants=read_csv(infants_output_csv_path),
            children=read_csv(children_output_csv_path),
            total=read_csv(total_output_csv_path))


def read_csv(path: str) -> pd.DataFrame:
    """Load a cached WIC participation CSV, indexed by county fips code."""
    frame = pd.read_csv(path, index_col="fips")
    return frame


def read_json(path: str) -> pd.DataFrame:
    """Load a fips-keyed JSON file (one object per county) into a DataFrame."""
    frame = pd.read_json(path, orient="index", dtype={"fips": str})
    return frame


def dataframe_from_rows(rows: List[List[str]]) -> pd.DataFrame:
    """Build a county-indexed participation DataFrame from parsed PDF rows.

    Each row holds, in order: fips code, county name, six race counts, the
    total participant count, and the Hispanic or Latino count. The returned
    frame is indexed by the fips code.
    """
    column_names = [
        "fips",                                    # County fips code
        "NAME",                                    # County name
        "race_amer_indian_or_alaskan_native",      # Amer. Indian or Alaskan Native # noqa: E501
        "race_asian",                              # Asian
        "race_black",                              # Black or African American
        "race_native_hawaii_or_pacific_islander",  # Native Hawaii or Other Pacific Isl. # noqa: E501
        "race_white",                              # White
        "race_multiracial",                        # Multi-Racial
        "total",                                   # Total Participants
        "hispanic_or_latino",                      # Hispanic or Latino
    ]
    frame = pd.DataFrame(data=rows, columns=column_names)
    return frame.set_index('fips')


def extract_columns_from_line(line: str) -> List[int]:
    """Parse the numeric columns out of one WIC table line.

    The first two space-separated tokens are labels (e.g. "Total" and
    "Infants"); every remaining token is a count that may contain a
    thousands separator such as "1,044".
    """
    tokens = line.split(sep=" ")
    counts = []
    for token in tokens[2:]:
        counts.append(int(token.replace(",", "")))
    return counts


def parse_wic_pdf(
source_pdf_filepath: str,
destination_csv_filepath: str,
first_page_zero_indexed: int,
last_page_zero_indexed: int) -> None:
last_page_zero_indexed: int) -> WICParticipation:

# We'll use these regular expressions to find the lines we care about.
# Find rows that start with Total (this includes Total Women, Total Infant
# and Total Children rows)
total_re = re.compile("Total")
total_women_re = re.compile("Total Women")
total_infants_re = re.compile("Total Infants")
total_children_re = re.compile("Total Children")
# It's not clear specifically what "LA Total" means, but these rows
# contains the subtotal values for the specific County
county_total_re = re.compile("LA Total")
# find rows that start with three digits (these rows contain County ID and
# name, example: 031 COOK)
county_re = re.compile(r"\d\d\d")
county_re = re.compile("[0-9][0-9][0-9]")

rows = []
women_rows = []
infants_rows = []
children_rows = []
total_rows = []

with pdfplumber.open(source_pdf_filepath) as pdf:

Expand All @@ -69,50 +134,62 @@ def parse_wic_pdf(
# greater than y_tolerance.
text = page.extract_text(x_tolerance=2, y_tolerance=0)

county_info = ["", ""] # fips, county name

# iterate thru each line on a page
for line in text.split("\n"):
if county_re.match(line):
# We have to find the County information first because we
# insert it in every row maxsplit=1 because some counties
# have spaces in their name, example: Jo Daviess
county = (line.split(sep=" ", maxsplit=1))
elif total_re.match(line):
# Split out a list like ["Total", "Women", 1, 2, 3, 4]
new_line = (line.split(sep=" "))
rows.append(county + new_line)

county_info = (line.split(sep=" ", maxsplit=1))
county_info[0] = "17" + county_info[0] # the pdf doesn't have the leading 17 indicating Illinois in the fips code # noqa: E501
elif total_women_re.match(line):
women_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif total_infants_re.match(line):
infants_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif total_children_re.match(line):
children_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif county_total_re.match(line):
# Split out a list like ["LA", "Total", 1, 2, 3, 4]
new_line = (line.split(sep=" "))
rows.append(county + new_line)

column_names = ["County_ID",
"County",
"WIC1",
"WIC2",
"Amer. Indian or Alaskan Native",
"Asian",
"Black or African American",
"Native Hawaii or Other Pacific Isl.",
"White",
"Multi-Racial",
"Total Participants",
"Hispanic or Latino"]

data = pd.DataFrame(rows, columns=column_names)

# Currently the data looks like this:
# WIC1 WIC2 etc
# Total Women
# Total Children
# LA Total

# We want to combine WIC1 and WIC2 columns into new column called WIC
# which will contain values such as "Total Women" "Total Children"
# "Total Infants" and "LA Total"
data.insert(2, "WIC", (data["WIC1"] + " " + data["WIC2"]))

# delete WIC1 and WIC 2 columns
data.drop(['WIC1', 'WIC2'], axis=1, inplace=True)

data.to_csv(destination_csv_filepath, index=False)
total_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501

return WICParticipation(
women=dataframe_from_rows(women_rows),
infants=dataframe_from_rows(infants_rows),
children=dataframe_from_rows(children_rows),
total=dataframe_from_rows(total_rows))


def merge_wic_data_file(participation: WICParticipation, merged_src: str, merged_dst: str) -> None:  # noqa: E501
    """Read the merged census JSON from merged_src, fold in the WIC
    participation data, and write the combined result to merged_dst."""
    with open(merged_src) as src_file:
        source_data = json.load(src_file)
    combined = merge_wic_data(participation, source_data)
    with open(merged_dst, "w") as dst_file:
        json.dump(combined, dst_file)


def to_dict_for_merging(df: pd.DataFrame) -> Dict:
    """Convert a participation frame to a {fips: {column: value}} mapping.

    Round-trips through to_json because calling df.to_dict() directly
    messes up all the types.
    """
    as_json = df.to_json(orient='index')
    data_dict = json.loads(as_json)
    for county_record in data_dict.values():
        # we already include the county name elsewhere in the merged data
        del county_record["NAME"]
    return data_dict


def merge_wic_data(participation: WICParticipation, merged_data: Dict[str, Any]) -> Dict[str, Any]:  # noqa: E501
    """Merge per-county WIC participation numbers into merged_data.

    For every county in merged_data['county_data'] that also appears in a
    participation frame, a wic_participation_*_data blob is added for that
    category; counties absent from a frame are left untouched.
    merged_data is modified in place and also returned.
    """
    category_lookups = [
        ('wic_participation_women_data',
         to_dict_for_merging(participation.women)),
        ('wic_participation_infants_data',
         to_dict_for_merging(participation.infants)),
        ('wic_participation_children_data',
         to_dict_for_merging(participation.children)),
        ('wic_participation_total_data',
         to_dict_for_merging(participation.total)),
    ]

    for fips, county_data in merged_data['county_data'].items():
        for key, lookup in category_lookups:
            if fips in lookup:
                county_data[key] = lookup[fips]

    return merged_data
83 changes: 83 additions & 0 deletions tests/resources/df_merged_with_wic_expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"county_data": {
"17001": {
"NAME": "Adams County, Illinois",
"poverty_data": {
"poverty_percentages": {
"poverty_population_poverty": 0.12143,
"poverty_population_poverty_child": 0.036781
},
"poverty_population_poverty": 7874,
"poverty_population_poverty_child": 2385,
"poverty_population_total": 64844
},
"race_data": {
"race_asian": 540,
"race_black": 2676,
"race_hispaniclatino_total": 1021,
"race_majority": "race_white",
"race_native": 178,
"race_other": 72,
"race_pacific": 42,
"race_percentages": {
"race_asian": 0.008129,
"race_black": 0.040285,
"race_hispaniclatino_total": 0.01537,
"race_native": 0.00268,
"race_other": 0.001084,
"race_pacific": 0.000632,
"race_twoplus_total": 0.013383,
"race_white": 0.918437
},
"race_total": 66427,
"race_twoplus_total": 889,
"race_white": 61009
},
"wic_participation_children_data": {
"hispanic_or_latino": 15,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 1,
"race_black": 7,
"race_multiracial": 0,
"race_native_hawaii_or_pacific_islander": 0,
"race_white": 85,
"total": 92
},
"wic_participation_infants_data": {
"hispanic_or_latino": 9,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 0,
"race_black": 2,
"race_multiracial": 1,
"race_native_hawaii_or_pacific_islander": 1,
"race_white": 36,
"total": 38
},
"wic_participation_total_data": {
"hispanic_or_latino": 33,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 1,
"race_black": 10,
"race_multiracial": 1,
"race_native_hawaii_or_pacific_islander": 1,
"race_white": 157,
"total": 167
},
"wic_participation_women_data": {
"hispanic_or_latino": 9,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 0,
"race_black": 1,
"race_multiracial": 0,
"race_native_hawaii_or_pacific_islander": 0,
"race_white": 36,
"total": 37
}
}
},
"meta": {
"data_bins": {},
"data_metrics": {}
},
"zip_data": {}
}

0 comments on commit df6a25b

Please sign in to comment.