diff --git a/.gitignore b/.gitignore index 367d520..2c334a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -.vscode +.vscode/ +data/ # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index 9405e96..4c1639e 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. +This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. ## Initial setup -### install Python +### install Python -You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/). +You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/). -> If you are using Windows, be sure to select the "Add Python to PATH" option +> If you are using Windows, be sure to select the "Add Python to PATH" option -You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt. +You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt. -### create and activate a Python virtual environment +### create and activate a Python virtual environment This step is optional, but if you have more than one project using Python, it is recommended. @@ -18,6 +18,7 @@ A [virtual environment](https://docs.python.org/3/library/venv.html#creating-vir of each project, which is helpful when working with mulitple projects with different depenencies (or different versions of the same dependency). 
For macOS or Linux + ``` python3 -m venv .venv source .venv/bin/activate @@ -30,23 +31,37 @@ py -m venv env .\env\Scripts\activate ``` -> Note that you need to [activate the virutal environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) -> before running a script but you only need to create the virtual envrionment once. +> Note that you need to [activate the virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) +> before running a script but you only need to create the virtual environment once. ### install the dependencies In Python, dependencies are often installed using [pip](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#installing-pip) You can install all the dependencies for this project by running: + ``` pip install -r requirements.txt ``` - ## Running the scripts -TBD - but probably something like this: +The basic format looks like `python cleanup.py script_name --inputfile1 ~/path/to/input.csv` + +For example: + +``` +# the keep-calm-with-covid script only requires one input file +python cleanup.py --debug keep-calm-with-covid --input "/tmp/VL 4.29 Call Report.csv" +# the all-covid-calls script requires 2 files +python cleanup.py --debug all-covid-calls --vialink-input ~/Downloads/VL\ 4.29\ Disaster\ Call\ Report\ .csv --232-input ~/Downloads/232-HELP.csv ``` -python cleanup /path/to/file1.xlsx /path/to/file2.xlsx + +If you want to see the basic usage you can run `python cleanup.py` and for a specific command you can use the `--help` flag + ``` +python cleanup.py all-covid-calls --help +``` + +> Also, you can use the `--debug` flag to view debug logs diff --git a/cleanup.py b/cleanup.py new file mode 100644 index 0000000..14f5969 --- /dev/null +++ b/cleanup.py @@ -0,0 +1,100 @@ +import logging + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s 
def _enable_debug_logging(ctx):
    """Raise the root logger to DEBUG when the group-level ``--debug`` flag is set."""
    if ctx.obj["DEBUG"]:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.debug("Running in debug mode")


def _read_report(path, **read_csv_kwargs):
    """Read one exported call report and promote its embedded header row.

    Extra keyword arguments are forwarded to ``pandas.read_csv``.
    """
    raw = pd.read_csv(path, encoding="ISO-8859-1", **read_csv_kwargs)
    return remove_first_rows(raw)


@click.group()
@click.option("--debug/--no-debug", default=False)
@click.pass_context
def cleanup(ctx, debug):
    """Clean up call-report exports for the COVID dashboards."""
    ctx.ensure_object(dict)
    # Sub-commands read this flag to decide whether to enable debug logging.
    ctx.obj["DEBUG"] = debug


@cleanup.command()
@click.pass_context
@click.option(
    "--vialink-input",
    "vl_infile",
    required=True,
    help="Path to the VIA LINK input csv file",
)
@click.option(
    "--232-input",
    "two32_infile",
    required=True,
    help="Path to the 232 HELP input csv file",
)
@click.option(
    "--output",
    default="data/all_covid_calls_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def all_covid_calls(ctx, vl_infile, two32_infile, output):
    """Merge the VIA LINK and 232-HELP reports into one cleaned spreadsheet."""
    _enable_debug_logging(ctx)
    logging.debug(f"Reading VIALINK file from '{vl_infile}'")
    logging.debug(f"Reading 232-HELP file from '{two32_infile}'")
    dfs = {
        "VIALINK": _read_report(vl_infile),
        "TWO32": _read_report(two32_infile),
    }
    logging.info("Cleaning data for All COVID Calls Dashboard")
    df = cleanup_all_covid_calls(dfs)
    logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
    write_output_file(df, output)


@cleanup.command()
@click.pass_context
@click.option(
    "--input", "infile", required=True, help="Path to the input csv file",
)
@click.option(
    "--output",
    default="data/keep_calm_with_covid_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def keep_calm_with_covid(ctx, infile, output):
    """Clean a VIA LINK call report for the Keep Calm with COVID dashboard."""
    _enable_debug_logging(ctx)
    logging.debug(f"Reading input file '{infile}'")
    # NOTE(review): read_csv matches converters against the first CSV line,
    # while the real column names are taken from a later row by
    # remove_first_rows -- confirm the converters actually apply.
    df = _read_report(infile, converters=CONVERTERS)
    logging.info("Cleaning data for Keep Calm with COVID Dashboard")
    # The cleanup function returns a new frame; the result must be captured,
    # otherwise the raw, uncleaned data is what gets written out.
    df = cleanup_keep_calm_with_covid(df)
    logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
    write_output_file(df, output)


def remove_first_rows(df):
    """Drop the two leading rows, using the second one as the column names.

    Returns a new frame holding rows 2 onward; the input is not modified.
    """
    columns = df.iloc[1].values.tolist()
    # Copy so that renaming the columns never touches a view of the input.
    trimmed = df.iloc[2:].copy()
    trimmed.columns = columns
    return trimmed


if __name__ == "__main__":
    cleanup(obj={})
# Upper bin edges are inclusive: (0..5], (5..12], ... (59..150].
_AGE_BINS = [0, 5, 12, 17, 24, 40, 59, 150]
_AGE_LABELS = ["0-5", "6-12", "13-17", "18-24", "25-40", "41-59", "60+"]


def _age_groups(date_of_birth, today):
    """Bucket dates of birth into the dashboard's age ranges.

    Unparseable dates become missing values, as do ages outside 0-150.
    """
    dob = pd.to_datetime(date_of_birth, errors="coerce")
    # Whole years elapsed: subtract one when this year's birthday is still ahead.
    birthday_pending = (dob.dt.month > today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day > today.day)
    )
    years_old = today.year - dob.dt.year - birthday_pending.astype(int)
    return pd.cut(years_old, bins=_AGE_BINS, labels=_AGE_LABELS, include_lowest=True)


def cleanup(dfs):
    """Build the cleaned data set for the All COVID Calls dashboard.

    Args:
        dfs: mapping with the raw frames under the keys ``"VIALINK"`` and
            ``"TWO32"``.

    Returns:
        A new DataFrame with one row per call and per concern/need. The input
        frames are not modified.
    """
    cn = "Concerns/Needs - Concerns/Needs"
    basic_needs = "Needs - Basic Needs Requested"
    dob_column = "Client Information - Date of Birth"
    program = "Contact Source - Program "  # ending space is needed

    # step 1
    # select required columns from VIA LINK's Disaster Form
    VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        "Client Information - Age Group",
        "Client Information - Call Type",
        "Client Information - Identifies as",
        cn,
        program,
        basic_needs,
    ]
    # Copy so the column assignments below never write into the caller's frame.
    vialink1_df = dfs["VIALINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER].copy()

    # step 2
    # select required columns from 232-Help's Disaster Form
    TWO32_HELP_REQUIRED_COLUMNS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        dob_column,
        "Client Information - Call Type",
        "Call Outcome - What concerns/needs were identified?",
        "Client Information - Identifies as",
        basic_needs,
    ]
    two32_help_df = dfs["TWO32"][TWO32_HELP_REQUIRED_COLUMNS].copy()

    # step 3
    # create age ranges from date of birth, then drop the original column
    two32_help_df["Client Information - Age Group"] = _age_groups(
        two32_help_df[dob_column], datetime.now()
    )
    two32_help_df = two32_help_df.drop(columns=[dob_column])

    # step 4
    # add "Data From" column
    vialink1_df["Data From"] = "VIA LINK"
    two32_help_df["Data From"] = "232-HELP"

    # step 5
    # align the 232-HELP concerns column with the VIA LINK name before merging
    two32_help_df = two32_help_df.rename(
        columns={"Call Outcome - What concerns/needs were identified?": cn}
    )

    # cleanup invalid values; assign the result back rather than replacing
    # in place on a column selection, which newer pandas may silently ignore
    vialink1_df[program] = vialink1_df[program].replace(
        to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan
    )

    # then combine data
    master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)

    # step 6
    # add lat/lon columns
    master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
    master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)

    # step 7
    # fold "Needs - Basic Needs Requested" into the concerns column, then
    # split the combined text into one row per need
    needs_columns = [cn, basic_needs]
    combined = master_df[needs_columns].apply(
        lambda row: "; ".join(row[row.notnull()]), axis=1
    )
    master_df = master_df.drop(columns=needs_columns)
    master_df[cn] = combined
    master_df = explode_needs(master_df, cn)

    # step 8
    # cleanup Concerns/Needs
    master_df[cn] = master_df[cn].str.strip()
    hangups = ["Hangup / Wrong Number", "Hangup / Wrong #"]
    master_df = master_df[~master_df[cn].isin(hangups)]
    master_df = master_df.replace(replacements)

    return master_df
# The per-category needs columns of the VIA LINK call report. Several names
# end in a space; that matches the exported header and must be kept.
NEEDS_COLUMNS = [
    "Concerns/Needs - Disaster Services ",
    "Concerns/Needs - Domestic Abuse/IPV",
    "Concerns/Needs - Early Childhood Education ",
    "Concerns/Needs - Education/ Employment ",
    "Concerns/Needs - Environmental Quality & Prtcn ",
    "Concerns/Needs - Health Care ",
    "Concerns/Needs - Interpersonal",
    "Concerns/Needs - Mental Health",
    "Concerns/Needs - Mental Health Concerns",
    "Concerns/Needs - Organizational Development",
    "Concerns/Needs - Other ",
    "Concerns/Needs - Other Community Services",
    "Concerns/Needs - Protective Service/Abuse",
    "Concerns/Needs - Public Asst & Social Insurance",
    "Concerns/Needs - Relationship Concerns / Issues ",
    "Concerns/Needs - Self-Harm",
    "Concerns/Needs - Sexuality",
]

# read_csv converters: read every needs column as text.
CONVERTERS = dict.fromkeys(NEEDS_COLUMNS, str)


def cleanup(df):
    """Build the cleaned data set for the Keeping Calm with COVID dashboard.

    Args:
        df: raw VIA LINK call report containing the required columns.

    Returns:
        A new DataFrame with one row per call and per concern/need. The
        caller's frame is left untouched, so the return value must be used.
    """
    program = "Call Information - Program"
    all_needs = "Concerns/Needs - Concerns/Needs"

    # step 1
    # select only the required columns
    VIA_LINK_REQUIRED_COLUMNS_CALLS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        program,
        "Demographics - Age",
        "Demographics - Gender",
    ] + NEEDS_COLUMNS
    df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS]

    # step 2
    # remove calls not from LA Spirit line; copy so the assignments below
    # operate on an independent frame instead of a slice of the input
    df = df[df[program] == "LA Spirit Crisis Line"].copy()

    # step 3
    # combine all needs columns into 1 column, then one row per need
    # NOTE(review): presumably empty cells arrive as "" when the str
    # converters apply, which would yield blank needs here -- confirm.
    df[all_needs] = df[NEEDS_COLUMNS].apply(
        lambda row: "; ".join(row[row.notnull()]), axis=1
    )
    df = explode_needs(df, all_needs)

    # step 4
    # add "Data From" column
    df["Data From"] = "VIA LINK"

    # step 5
    # cleanup Concerns/Needs Data
    df[all_needs] = df[all_needs].str.strip()
    df = df[~df[all_needs].isin(["Wrong #", "hangup"])]
    df = df.replace(replacements)

    # step 6
    # drop all the original needs columns
    df = df.drop(columns=NEEDS_COLUMNS)

    # step 7
    # add the Lat/Lng columns
    df["Latitude"] = df["PostalCode"].apply(get_lat)
    df["Longitude"] = df["PostalCode"].apply(get_lng)

    return df
def write_output_file(df, filename, sheet_name="codefornola cleaned"):
    """Write ``df`` to an Excel workbook at ``filename``.

    Args:
        df: the DataFrame to save.
        filename: output path; its parent directory is created when missing.
        sheet_name: name of the worksheet the data is written to.
    """
    parent = os.path.dirname(filename)
    if parent and not os.path.exists(parent):
        logging.debug(f"Creating output directory '{parent}'")
        os.makedirs(parent, exist_ok=True)
    logging.debug(f"Writing sheet '{sheet_name}' into '{filename}'")
    # Pass the sheet name through; previously the parameter was accepted and
    # logged but never used.
    df.to_excel(filename, sheet_name=sheet_name)


def _lookup_zipcode(zipcode):
    """Return the search result for ``zipcode``, or None when it is unusable."""
    if pd.isnull(zipcode):
        return None
    try:
        code = int(zipcode)
    except (TypeError, ValueError):
        # One malformed postal code should not abort the whole cleanup run.
        logging.debug(f"Skipping unparseable postal code {zipcode!r}")
        return None
    return search.by_zipcode(code)


def get_lat(zipcode):
    """Return the latitude for ``zipcode``, or None when it cannot be resolved."""
    lat = getattr(_lookup_zipcode(zipcode), "lat", None)
    return lat if lat else None


def get_lng(zipcode):
    """Return the longitude for ``zipcode``, or None when it cannot be resolved."""
    lng = getattr(_lookup_zipcode(zipcode), "lng", None)
    return lng if lng else None


def explode_needs(df, need_column):
    """Split ``need_column`` on ``;`` and return one row per individual need.

    The returned frame has ``need_column`` as its last column and repeats the
    original index for rows that were split. The input frame is not modified.
    """
    logging.debug(f"exploding needs into {need_column}")
    split_needs = df[need_column].str.split(";")
    # Build the result on a new frame so no helper column leaks into the
    # caller's DataFrame.
    exploded = df.drop(columns=[need_column]).assign(**{need_column: split_needs})
    return exploded.explode(need_column)


# Whole-cell substitutions applied to normalise need names across sources.
replacements = {
    "†": "",
    "Employment": "Employment Services",
    "Food": "Food/Meals",
    "Food/Meal": "Food/Meals",
    "I'm Sick (what next?)": "I'm Sick (What's Next?)",
    "I'm Sick (Whats Next?)": "I'm Sick (What's Next?)",
    "Income support/assistance": "Income Support/Assistance",
    "information only call": "",
    "Inquires about Health Complications / Concerns": "Inquires about Health Complications",
    "International Travel Concerns": "International / General Travel Concerns",
    "Legal Consumer": "Legal Assistance",
    "Other - Interpersonal": "Other",
    "Other (PLEASE Specify Caller Need in Call Notes)": "Other",
    "other 2-1-1 referral": "Other",
    "Unemployment": "Unemployment Benefits",
}