Skip to content
This repository has been archived by the owner on Feb 23, 2024. It is now read-only.

Commit

Permalink
Make WIC Participation Data Available to Frontend (#44)
Browse files Browse the repository at this point in the history
* Start changing the wic participation data into using multiple dataframes

* Clean up some code that isn't needed

* -Write four CSV files instead of one confusing one.
-Pass the participation data back in so it can be merged in with the census data

* Cover the new WIC code in tests

* Fix flake8 errors

* Fix missing returns

* Set Up Test for Merging WIC Participation Data into Census Data

Changed the WIC participation dataframes to actually have the fips column as the index. This changed the json format of a lot of the fixtures.

Add new test to test merging. This test currently fails because the code isn't implemented yet

* Get merging tests passing

* Change the data type of the wic participation numbers to ints

* Get the main path working

* Fix flake8 and mypy errors

I'm silencing a mypy error on appending lists of different types:
error: Unsupported operand types for + ("List[str]" and "List[int]")
It looks like the fixes to correct the overzealous type error wouldn't be worth implementing: python/mypy#720

* add main test (#45)

* added test_run

* Set fail-fast to false, which allows OS- and version-specific errors to be identified

* replaced .loc[] with reindex(), due to key error

* Fix Error in Test test_run()

ValueError: invalid literal for int() with base 10: '1,044'

* Fix flake8 error

Co-authored-by: michaelpkuhn <mickuhn95@gmail.com>
  • Loading branch information
brickman1444 and michaelpkuhn committed Apr 29, 2021
1 parent c019cb3 commit df6a25b
Show file tree
Hide file tree
Showing 17 changed files with 411 additions and 75 deletions.
1 change: 1 addition & 0 deletions .github/workflows/run_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ jobs:
run-tests:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ['3.8.5', '3.9']
os: [ubuntu-latest, windows-latest, macos-latest]
Expand Down
2 changes: 1 addition & 1 deletion src/census_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def majority(series):
# prevents conflicts with new columns
# race_values = tuple(cls.data_metrics['race'].values())
race_values = cls.get_data_values('race')
race_df = geo_df.loc[:, race_values]
race_df = geo_df.reindex(race_values, axis="columns")

# divides df by race_total column to calculate percentages
race_percent_df, pct_dict_series = nest_percentages(race_df, 'race_total') # noqa: E501
Expand Down
19 changes: 11 additions & 8 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,32 @@ def main(geo_ls=["zip", "county"], verbose: bool = False) -> None:
'''
mph.setup_memory_usage_file_if_enabled()

print("Reading WIC Data")
print("Reading Census Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
src.wic.read_wic_data()
src.census_response.download_census_data()
if (verbose):
duration = time.time() - start_time
print("Reading WIC Data took: {0:.2f} seconds".format(duration))
print("Reading Census Data took: {0:.2f} seconds".format(duration)) # noqa: E501

print("Reading Census Data")
print("Reading WIC Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
src.census_response.download_census_data()
wic_participation = src.wic.read_wic_data()
src.wic.merge_wic_data_file(wic_participation,
'final_jsons/df_merged_json.json',
'final_jsons/df_merged_with_wic.json')
if (verbose):
duration = time.time() - start_time
print("Reading Census Data took: {0:.2f} seconds".format(duration)) # noqa: E501
print("Reading WIC Data took: {0:.2f} seconds".format(duration))

print("Reading Food Insecurity Data")
mph.record_current_memory_usage_if_enabled()
start_time = time.time()
file_to_json('data_folder', 'final_jsons', blacklist=['Key'])
merge_ins_data('final_jsons/Countyfood_insecurity_rates_12.15.2020.json',
'final_jsons/df_merged_json.json',
'final_jsons/df_merged_with_insecurity.json')
'final_jsons/df_merged_with_wic.json',
'final_jsons/df_merged_with_wic_and_insecurity.json')
if (verbose):
duration = time.time() - start_time
print("Reading Food Insecurity Data took: {0:.2f} seconds".format(duration)) # noqa: E501
Expand Down
179 changes: 128 additions & 51 deletions src/wic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
import pdfplumber
import pandas as pd
import json
from typing import List, Dict, Any


'''
Expand All @@ -21,39 +23,102 @@ def is_up_to_date(input_file_path: str, output_file_path: str) -> bool:
return False


def read_wic_data(always_run: bool = False) -> None:
class WICParticipation:
    """Bundle of the four WIC participation DataFrames, one per category.

    Each frame is indexed by county fips code (see dataframe_from_rows).
    """

    def __init__(self,
                 women: pd.DataFrame,
                 infants: pd.DataFrame,
                 children: pd.DataFrame,
                 total: pd.DataFrame):
        # One participation frame per participant category.
        self.women = women
        self.infants = infants
        self.children = children
        # Per-county totals across all categories.
        self.total = total

input_file_path = "data_folder/illinois_wic_data_january_2021.pdf"
output_file_path = "final_jsons/wic.csv"

if always_run or not is_up_to_date(input_file_path, output_file_path):
def read_wic_data(always_run: bool = False) -> WICParticipation:
    """Load WIC participation data, parsing the source PDF only when needed.

    When the four per-category CSV caches are newer than the source PDF (and
    always_run is False), they are read back instead of re-parsing the PDF.
    Freshly parsed data is written out to the CSVs before being returned.

    :param always_run: force re-parsing the PDF even if the caches look fresh
    :return: the parsed or cached WICParticipation data
    """
    input_file_path = "data_folder/illinois_wic_data_january_2021.pdf"
    women_output_csv_path = "final_jsons/wic_participation_women.csv"
    infants_output_csv_path = "final_jsons/wic_participation_infants.csv"
    children_output_csv_path = "final_jsons/wic_participation_children.csv"
    total_output_csv_path = "final_jsons/wic_participation_total.csv"

    if always_run or \
       not is_up_to_date(input_file_path, women_output_csv_path) or \
       not is_up_to_date(input_file_path, infants_output_csv_path) or \
       not is_up_to_date(input_file_path, children_output_csv_path) or \
       not is_up_to_date(input_file_path, total_output_csv_path):
        # PDF has 97 pages. Skip page 0 because it shows Statewide totals
        # which we don't need
        participation: WICParticipation = parse_wic_pdf(
            input_file_path,
            1,
            96)
        # Keep the fips index in the CSVs: read_csv() reloads these files
        # with index_col="fips", so dropping the index (index=False) would
        # break the cached-read path below.
        participation.women.to_csv(women_output_csv_path)
        participation.children.to_csv(children_output_csv_path)
        participation.infants.to_csv(infants_output_csv_path)
        participation.total.to_csv(total_output_csv_path)
        return participation
    else:
        return WICParticipation(
            women=read_csv(women_output_csv_path),
            infants=read_csv(infants_output_csv_path),
            children=read_csv(children_output_csv_path),
            total=read_csv(total_output_csv_path))


def read_csv(path: str) -> pd.DataFrame:
    """Load a cached WIC participation CSV, indexed by county fips code."""
    frame = pd.read_csv(path, index_col="fips")
    return frame


def read_json(path: str) -> pd.DataFrame:
    """Load a fips-keyed JSON file (one object per county) into a DataFrame."""
    frame = pd.read_json(path, orient="index", dtype={"fips": str})
    return frame


def dataframe_from_rows(rows: List[List[str]]) -> pd.DataFrame:
    """Build a county-indexed participation DataFrame from parsed PDF rows.

    Each row holds, in order: fips code, county name, six race counts, the
    total participant count, and the Hispanic or Latino count. The returned
    frame is indexed by the fips code.
    """
    column_names = [
        "fips",                                    # County fips code
        "NAME",                                    # County name
        "race_amer_indian_or_alaskan_native",      # Amer. Indian or Alaskan Native # noqa: E501
        "race_asian",                              # Asian
        "race_black",                              # Black or African American
        "race_native_hawaii_or_pacific_islander",  # Native Hawaii or Other Pacific Isl. # noqa: E501
        "race_white",                              # White
        "race_multiracial",                        # Multi-Racial
        "total",                                   # Total Participants
        "hispanic_or_latino",                      # Hispanic or Latino
    ]
    frame = pd.DataFrame(data=rows, columns=column_names)
    return frame.set_index('fips')


def extract_columns_from_line(line: str) -> List[int]:
    """Parse the numeric columns out of one WIC table line.

    The first two space-separated tokens are labels (e.g. "Total" and
    "Infants"); every remaining token is a count that may contain a
    thousands separator such as "1,044".
    """
    tokens = line.split(sep=" ")
    counts = []
    for token in tokens[2:]:
        counts.append(int(token.replace(",", "")))
    return counts


def parse_wic_pdf(
source_pdf_filepath: str,
destination_csv_filepath: str,
first_page_zero_indexed: int,
last_page_zero_indexed: int) -> None:
last_page_zero_indexed: int) -> WICParticipation:

# We'll use these regular expressions to find the lines we care about.
# Find rows that start with Total (this includes Total Women, Total Infant
# and Total Children rows)
total_re = re.compile("Total")
total_women_re = re.compile("Total Women")
total_infants_re = re.compile("Total Infants")
total_children_re = re.compile("Total Children")
# It's not clear specifically what "LA Total" means, but these rows
# contains the subtotal values for the specific County
county_total_re = re.compile("LA Total")
# find rows that start with three digits (these rows contain County ID and
# name, example: 031 COOK)
county_re = re.compile(r"\d\d\d")
county_re = re.compile("[0-9][0-9][0-9]")

rows = []
women_rows = []
infants_rows = []
children_rows = []
total_rows = []

with pdfplumber.open(source_pdf_filepath) as pdf:

Expand All @@ -69,50 +134,62 @@ def parse_wic_pdf(
# greater than y_tolerance.
text = page.extract_text(x_tolerance=2, y_tolerance=0)

county_info = ["", ""] # fips, county name

# iterate thru each line on a page
for line in text.split("\n"):
if county_re.match(line):
# We have to find the County information first because we
# insert it in every row maxsplit=1 because some counties
# have spaces in their name, example: Jo Daviess
county = (line.split(sep=" ", maxsplit=1))
elif total_re.match(line):
# Split out a list like ["Total", "Women", 1, 2, 3, 4]
new_line = (line.split(sep=" "))
rows.append(county + new_line)

county_info = (line.split(sep=" ", maxsplit=1))
county_info[0] = "17" + county_info[0] # the pdf doesn't have the leading 17 indicating Illinois in the fips code # noqa: E501
elif total_women_re.match(line):
women_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif total_infants_re.match(line):
infants_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif total_children_re.match(line):
children_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501
elif county_total_re.match(line):
# Split out a list like ["LA", "Total", 1, 2, 3, 4]
new_line = (line.split(sep=" "))
rows.append(county + new_line)

column_names = ["County_ID",
"County",
"WIC1",
"WIC2",
"Amer. Indian or Alaskan Native",
"Asian",
"Black or African American",
"Native Hawaii or Other Pacific Isl.",
"White",
"Multi-Racial",
"Total Participants",
"Hispanic or Latino"]

data = pd.DataFrame(rows, columns=column_names)

# Currently the data looks like this:
# WIC1 WIC2 etc
# Total Women
# Total Children
# LA Total

# We want to combine WIC1 and WIC2 columns into new column called WIC
# which will contain values such as "Total Women" "Total Children"
# "Total Infants" and "LA Total"
data.insert(2, "WIC", (data["WIC1"] + " " + data["WIC2"]))

# delete WIC1 and WIC 2 columns
data.drop(['WIC1', 'WIC2'], axis=1, inplace=True)

data.to_csv(destination_csv_filepath, index=False)
total_rows.append(county_info + extract_columns_from_line(line)) # type: ignore # noqa: E501

return WICParticipation(
women=dataframe_from_rows(women_rows),
infants=dataframe_from_rows(infants_rows),
children=dataframe_from_rows(children_rows),
total=dataframe_from_rows(total_rows))


def merge_wic_data_file(participation: WICParticipation, merged_src: str, merged_dst: str) -> None:  # noqa: E501
    """Read the merged census JSON from merged_src, fold in the WIC
    participation data, and write the combined result to merged_dst."""
    with open(merged_src) as src_file:
        source_data = json.load(src_file)
    combined = merge_wic_data(participation, source_data)
    with open(merged_dst, "w") as dst_file:
        json.dump(combined, dst_file)


def to_dict_for_merging(df: pd.DataFrame) -> Dict:
    """Convert a participation frame to a {fips: {column: value}} mapping.

    Round-trips through to_json because calling df.to_dict() directly
    messes up all the types.
    """
    as_json = df.to_json(orient='index')
    data_dict = json.loads(as_json)
    for county_record in data_dict.values():
        # we already include the county name elsewhere in the merged data
        del county_record["NAME"]
    return data_dict


def merge_wic_data(participation: WICParticipation, merged_data: Dict[str, Any]) -> Dict[str, Any]:  # noqa: E501
    """Merge per-county WIC participation numbers into merged_data.

    For every county in merged_data['county_data'] that also appears in a
    participation frame, a wic_participation_*_data blob is added for that
    category; counties absent from a frame are left untouched.
    merged_data is modified in place and also returned.
    """
    category_lookups = [
        ('wic_participation_women_data',
         to_dict_for_merging(participation.women)),
        ('wic_participation_infants_data',
         to_dict_for_merging(participation.infants)),
        ('wic_participation_children_data',
         to_dict_for_merging(participation.children)),
        ('wic_participation_total_data',
         to_dict_for_merging(participation.total)),
    ]

    for fips, county_data in merged_data['county_data'].items():
        for key, lookup in category_lookups:
            if fips in lookup:
                county_data[key] = lookup[fips]

    return merged_data
83 changes: 83 additions & 0 deletions tests/resources/df_merged_with_wic_expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"county_data": {
"17001": {
"NAME": "Adams County, Illinois",
"poverty_data": {
"poverty_percentages": {
"poverty_population_poverty": 0.12143,
"poverty_population_poverty_child": 0.036781
},
"poverty_population_poverty": 7874,
"poverty_population_poverty_child": 2385,
"poverty_population_total": 64844
},
"race_data": {
"race_asian": 540,
"race_black": 2676,
"race_hispaniclatino_total": 1021,
"race_majority": "race_white",
"race_native": 178,
"race_other": 72,
"race_pacific": 42,
"race_percentages": {
"race_asian": 0.008129,
"race_black": 0.040285,
"race_hispaniclatino_total": 0.01537,
"race_native": 0.00268,
"race_other": 0.001084,
"race_pacific": 0.000632,
"race_twoplus_total": 0.013383,
"race_white": 0.918437
},
"race_total": 66427,
"race_twoplus_total": 889,
"race_white": 61009
},
"wic_participation_children_data": {
"hispanic_or_latino": 15,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 1,
"race_black": 7,
"race_multiracial": 0,
"race_native_hawaii_or_pacific_islander": 0,
"race_white": 85,
"total": 92
},
"wic_participation_infants_data": {
"hispanic_or_latino": 9,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 0,
"race_black": 2,
"race_multiracial": 1,
"race_native_hawaii_or_pacific_islander": 1,
"race_white": 36,
"total": 38
},
"wic_participation_total_data": {
"hispanic_or_latino": 33,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 1,
"race_black": 10,
"race_multiracial": 1,
"race_native_hawaii_or_pacific_islander": 1,
"race_white": 157,
"total": 167
},
"wic_participation_women_data": {
"hispanic_or_latino": 9,
"race_amer_indian_or_alaskan_native": 0,
"race_asian": 0,
"race_black": 1,
"race_multiracial": 0,
"race_native_hawaii_or_pacific_islander": 0,
"race_white": 36,
"total": 37
}
}
},
"meta": {
"data_bins": {},
"data_metrics": {}
},
"zip_data": {}
}

0 comments on commit df6a25b

Please sign in to comment.