codefornola · mrcnc · May 9, 2020 · Apr 15, 2020 · Apr 16, 2020 · Apr 18, 2020
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
-.vscode
+.vscode/
+data/
 
 
 # Byte-compiled / optimized / DLL files

diff --git a/README.md b/README.md
@@ -1,23 +1,24 @@
-This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. 
+This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana.
 
 ## Initial setup
 
-### install Python 
+### install Python
 
-You must have Python 3 installed.  You can download it [here](https://www.python.org/downloads/).
+You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/).
 
-> If you are using Windows, be sure to select the "Add Python to PATH" option 
+> If you are using Windows, be sure to select the "Add Python to PATH" option
 
-You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.  
+You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.
 
-### create and activate a Python virtual environment 
+### create and activate a Python virtual environment
 
 This step is optional, but if you have more than one project using Python, it is recommended.
 
 A [virtual environment](https://docs.python.org/3/library/venv.html#creating-virtual-environments) isolates the dependencies
 of each project, which is helpful when working with multiple projects with different dependencies (or different versions of the same dependency).
 
 For macOS or Linux
+
 ```
 python3 -m venv .venv
 source .venv/bin/activate
@@ -30,23 +31,37 @@ py -m venv env
 .\env\Scripts\activate
 ```
 
-> Note that you need to [activate the virutal environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) 
-> before running a script but you only need to create the virtual envrionment once. 
+> Note that you need to [activate the virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment)
+> before running a script but you only need to create the virtual environment once.
 
 ### install the dependencies
 
 In Python, dependencies are often installed using [pip](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#installing-pip)
 
 You can install all the dependencies for this project by running:
+
 ```
 pip install -r requirements.txt
 ```
 
-
 ## Running the scripts
 
-TBD - but probably something like this:
+The basic format looks like `python cleanup.py script_name --inputfile1 ~/path/to/input.csv`
+
+For example:
+
+```
+# the keep-calm-with-covid script only requires one input file
+python cleanup.py --debug keep-calm-with-covid --input "/tmp/VL 4.29 Call Report.csv"
 
+# the all-covid-calls script requires 2 files
+python cleanup.py --debug all-covid-calls --vialink-input ~/Downloads/VL\ 4.29\ Disaster\ Call\ Report\ .csv --232-input ~/Downloads/232-HELP.csv
 ```
-python cleanup /path/to/file1.xlsx /path/to/file2.xlsx
+
+If you want to see the basic usage, you can run `python cleanup.py`; for a specific command, use the `--help` flag:
+
 ```
+python cleanup.py all-covid-calls --help
+```
+
+> Also, you can use the `--debug` flag to view debug logs
diff --git a/cleanup.py b/cleanup.py
@@ -0,0 +1,100 @@
+import logging
+
+# Configure the root logger at INFO level before the remaining imports run;
+# records go to a StreamHandler formatted as
+# "<YYYY-MM-DD HH:MM:SS> [<LEVEL>] <message>".
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[logging.StreamHandler()],
+)
+import os
+import sys
+
+import click
+import pandas as pd
+
+from cleanup_all_covid_calls import cleanup as cleanup_all_covid_calls
+from cleanup_keep_calm_with_covid import (
+    CONVERTERS,
+    cleanup as cleanup_keep_calm_with_covid,
+)
+from utils import write_output_file
+
+
+@click.group()
+@click.option("--debug/--no-debug", default=False)
+@click.pass_context
+def cleanup(ctx, debug):
+    # Root command group. Records the --debug/--no-debug choice on the shared
+    # context object so sub-commands can read it back as ctx.obj["DEBUG"].
+    # (Kept as a '#' comment: click would show a docstring as help text.)
+    ctx.ensure_object(dict)
+    ctx.obj.update({"DEBUG": debug})
+
+
+@cleanup.command()
+@click.pass_context
+@click.option(
+    "--vialink-input",
+    "vl_infile",
+    required=True,
+    help="Path to the VIA LINK input csv file",
+)
+@click.option(
+    "--232-input",
+    "two32_infile",
+    required=True,
+    help="Path to the 232 HELP input csv file",
+)
+@click.option(
+    "--output",
+    default="data/all_covid_calls_cleaned.xlsx",
+    help="Path to the output spreadsheet (cleaned .xlsx file)",
+)
+def all_covid_calls(ctx, vl_infile, two32_infile, output):
+    # Sub-command: read the VIA LINK and 232-HELP csv exports, run the
+    # All COVID Calls cleanup over both, and write the result to `output`.
+    # (Kept as a '#' comment: click would show a docstring as help text.)
+    debug_enabled = ctx.obj["DEBUG"]
+    if debug_enabled:
+        root_logger = logging.getLogger()
+        root_logger.setLevel(logging.DEBUG)
+        logging.debug("Running in debug mode")
+    logging.debug(f"Reading VIALINK file from '{vl_infile}'")
+    logging.debug(f"Reading 232-HELP file from '{two32_infile}'")
+
+    # Each export is read as ISO-8859-1 and re-headed by remove_first_rows;
+    # the keys are the ones cleanup_all_covid_calls looks up.
+    sources = (("VIALINK", vl_infile), ("TWO32", two32_infile))
+    frames = {}
+    for key, path in sources:
+        raw = pd.read_csv(path, encoding="ISO-8859-1")
+        frames[key] = remove_first_rows(raw)
+
+    logging.info("Cleaning data for All COVID Calls Dashboard")
+    cleaned = cleanup_all_covid_calls(frames)
+    logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
+    write_output_file(cleaned, output)
+
+
+@cleanup.command()
+@click.pass_context
+@click.option(
+    "--input", "infile", required=True, help="Path to the input csv file",
+)
+@click.option(
+    "--output",
+    default="data/keep_calm_with_covid_cleaned.xlsx",
+    help="Path to the output spreadsheet (cleaned .xlsx file)",
+)
+def keep_calm_with_covid(ctx, infile, output):
+    """Clean a call report csv for the Keep Calm with COVID dashboard."""
+    # ctx.obj["DEBUG"] is populated by the parent `cleanup` group.
+    if ctx.obj["DEBUG"]:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logging.debug("Running in debug mode")
+    logging.debug(f"Reading input file '{infile}'")
+    # NOTE(review): CONVERTERS is keyed by the real column names, but those
+    # names are only applied by remove_first_rows after the read -- confirm
+    # the converters actually match columns at read time.
+    df = pd.read_csv(infile, encoding="ISO-8859-1", converters=CONVERTERS)
+    df = remove_first_rows(df)
+    logging.info("Cleaning data for Keep Calm with COVID Dashboard")
+    # The cleanup function re-binds its argument and returns a new frame, so
+    # its result must be captured; discarding it would write the raw,
+    # uncleaned input to the output file.
+    df = cleanup_keep_calm_with_covid(df)
+    logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
+    write_output_file(df, output)
+
+
+def remove_first_rows(df):
+    """Return `df` without its first two rows, re-headed from the second row.
+
+    The values of the row at position 1 become the column labels of the
+    returned frame; the remaining rows keep their original index labels.
+    """
+    header = df.iloc[1].values.tolist()
+    body = df.iloc[2:]
+    body.columns = header
+    return body
+
+
+# Script entry point: invoke the click group with an empty dict as the shared
+# context object (the group stores the DEBUG flag in it).
+if __name__ == "__main__":
+    cleanup(obj={})
diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
@@ -0,0 +1,118 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from utils import (
+    explode_needs,
+    get_lat,
+    get_lng,
+    replacements,
+    write_output_file,
+)
+
+# Disable pandas' chained-assignment (SettingWithCopy) check globally;
+# cleanup() below assigns columns onto frames selected from its inputs.
+pd.options.mode.chained_assignment = None
+
+
+def cleanup(dfs):
+    """Build the combined, cleaned table for the All COVID Calls dashboard.
+
+    Args:
+        dfs: mapping holding a "VIALINK" and a "TWO32" DataFrame (the
+            VIA LINK and 232-HELP disaster-form exports). Each frame must
+            contain the required columns listed in the body.
+
+    Returns:
+        A DataFrame that concatenates both sources and
+        - tags each row with a "Data From" column,
+        - replaces the 232-HELP date of birth with an age group,
+        - adds "Latitude"/"Longitude" derived from "PostalCode",
+        - merges both needs columns into "Concerns/Needs - Concerns/Needs"
+          and passes the frame through explode_needs,
+        - drops hangup / wrong-number rows and applies `replacements`.
+
+    Raises:
+        KeyError: if a source frame or one of its required columns is missing.
+    """
+    ### Cleanup for All COVID Calls dashboard
+
+    # step 1
+    # select required columns from VIA LINK’s Disaster Form
+    # pretty sure the disaster form is "Uncleaned data type 1 VIA LINK"
+    VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Client Information - Age Group",
+        "Client Information - Call Type",
+        "Client Information - Identifies as",
+        "Concerns/Needs - Concerns/Needs",
+        "Contact Source - Program ",  # ending space is needed
+        "Needs - Basic Needs Requested",
+    ]
+    # explicit copy: the columns added below must not touch the caller's frame
+    vialink1_df = dfs["VIALINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER].copy()
+
+    # step 2
+    # select required columns from 232-Help’s Disaster Form
+    TWO32_HELP_REQUIRED_COLUMNS = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Client Information - Date of Birth",
+        "Client Information - Call Type",
+        "Call Outcome - What concerns/needs were identified?",
+        "Client Information - Identifies as",
+        "Needs - Basic Needs Requested",
+    ]
+    two32_help_df = dfs["TWO32"][TWO32_HELP_REQUIRED_COLUMNS].copy()
+
+    # step 3
+    # Create age ranges from date of birth
+    # use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
+    now = datetime.now()
+    bins = [0, 5, 12, 17, 24, 40, 59, 150]
+    # labels must line up with the bins above (they previously read
+    # "24-40" and "41-49", contradicting the ranges)
+    labels = ["0-5", "6-12", "13-17", "18-24", "25-40", "41-59", "60+"]
+    dob = pd.to_datetime(
+        two32_help_df["Client Information - Date of Birth"], errors="coerce"
+    )
+    # Whole years of age; unparseable dates stay NaN. Dividing by a mean
+    # Gregorian year (365.2425 days) avoids astype("timedelta64[Y]"), which
+    # recent pandas releases reject (only s/ms/us/ns units are supported).
+    years_old = np.floor((now - dob) / pd.Timedelta(days=365.2425))
+    age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True)
+    two32_help_df["Client Information - Age Group"] = age_range
+    # remove original Date of Birth column
+    two32_help_df = two32_help_df.drop(
+        columns=["Client Information - Date of Birth"]
+    )
+
+    # step 4
+    # add "Data From" column
+    vialink1_df["Data From"] = "VIA LINK"
+    two32_help_df["Data From"] = "232-HELP"
+
+    # step 5
+    # add data to master spreadsheet
+    # first merge "Call Outcome - What concerns/needs were identified" from 232-HELP
+    # into "Concerns/Needs - Concerns/Needs"
+    cn = "Concerns/Needs - Concerns/Needs"
+    two32_help_df = two32_help_df.rename(
+        columns={"Call Outcome - What concerns/needs were identified?": cn}
+    )
+
+    # new steps
+    # cleanup invalid values
+    # assign the result back instead of replacing in place on a column
+    # selection, which is not guaranteed to write through to the frame
+    program = "Contact Source - Program "
+    vialink1_df[program] = vialink1_df[program].replace(
+        to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan
+    )
+
+    # then combine data
+    master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)
+
+    # step 6
+    # add lat/lon columns
+    master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
+    master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)
+
+    # step 7
+    # first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs"
+    master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply(
+        lambda x: "; ".join(x[x.notnull()]), axis=1
+    )
+    master_df = master_df.drop(columns=[cn, "Needs - Basic Needs Requested"])
+    master_df = master_df.rename(columns={"all_needs": cn})
+    master_df = explode_needs(master_df, cn)
+
+    # step 8
+    # cleanup Concerns/Needs
+    master_df[cn] = master_df[cn].str.strip()
+    master_df = master_df[master_df[cn] != "Hangup / Wrong Number"]
+    master_df = master_df[master_df[cn] != "Hangup / Wrong #"]
+    # `value` is left unset on purpose: with a to_replace mapping that is the
+    # documented form, while an explicit value=None may be taken as a real
+    # replacement value by newer pandas releases.
+    master_df = master_df.replace(to_replace=replacements)
+
+    return master_df
diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py
@@ -0,0 +1,94 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from utils import explode_needs, get_lat, get_lng, replacements
+
+# Maps every per-category "Concerns/Needs" column name to the `str`
+# converter; cleanup.py passes this mapping as `converters=` to
+# pandas.read_csv. The double space before " - " and the trailing spaces in
+# some names are part of the column names.
+CONVERTERS = dict.fromkeys(
+    [
+        "Concerns/Needs  - Disaster Services ",
+        "Concerns/Needs  - Domestic Abuse/IPV",
+        "Concerns/Needs  - Early Childhood Education ",
+        "Concerns/Needs  - Education/ Employment ",
+        "Concerns/Needs  - Environmental Quality & Prtcn ",
+        "Concerns/Needs  - Health Care ",
+        "Concerns/Needs  - Interpersonal",
+        "Concerns/Needs  - Mental Health",
+        "Concerns/Needs  - Mental Health Concerns",
+        "Concerns/Needs  - Organizational Development",
+        "Concerns/Needs  - Other ",
+        "Concerns/Needs  - Other Community Services",
+        "Concerns/Needs  - Protective Service/Abuse",
+        "Concerns/Needs  - Public Asst & Social Insurance",
+        "Concerns/Needs  - Relationship Concerns / Issues ",
+        "Concerns/Needs  - Self-Harm",
+        "Concerns/Needs  - Sexuality",
+    ],
+    str,
+)
+
+
+def cleanup(df):
+    """Prepare VIA LINK call data for the Keeping Calm with COVID dashboard.
+
+    Args:
+        df: DataFrame containing at least the required call columns listed
+            in the body plus every needs column named in CONVERTERS.
+
+    Returns:
+        A new DataFrame (the argument is not modified) that
+        - keeps only "LA Spirit Crisis Line" calls,
+        - joins the needs columns into "Concerns/Needs - Concerns/Needs"
+          and passes the frame through explode_needs,
+        - tags rows with "Data From" = "VIA LINK",
+        - drops "Wrong #" / "hangup" rows and applies `replacements`,
+        - adds "Latitude"/"Longitude" derived from "PostalCode".
+
+    Raises:
+        KeyError: if a required column is missing from `df`.
+    """
+    ### Cleanup for Keeping Calm with COVID dashboard
+    # step 1
+    # select only the required columns
+    # The needs columns are exactly the keys of CONVERTERS; deriving the list
+    # keeps the two from drifting apart.
+    needs_columns = list(CONVERTERS)
+    VIA_LINK_REQUIRED_COLUMNS_CALLS = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Call Information - Program",
+        "Demographics - Age",
+        "Demographics - Gender",
+    ] + needs_columns
+    df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS]
+
+    # step 2
+    # remove calls not from LA Spirit line
+    # explicit copy so the column added in step 3 is written to an
+    # independent frame rather than to a filtered slice
+    df = df[df["Call Information - Program"] == "LA Spirit Crisis Line"].copy()
+
+    # step 3
+    # combine all needs column into 1 column
+    all_needs = "Concerns/Needs - Concerns/Needs"
+    df[all_needs] = df[needs_columns].apply(lambda x: "; ".join(x[x.notnull()]), axis=1)
+    df = explode_needs(df, all_needs)
+
+    # step 4
+    # add "Data From" column
+    df["Data From"] = "VIA LINK"
+
+    # step 5
+    # cleanup Concerns/Needs Data
+    df[all_needs] = df[all_needs].str.strip()
+    df = df[df[all_needs] != "Wrong #"]
+    df = df[df[all_needs] != "hangup"]
+    # `value` is left unset on purpose: with a to_replace mapping that is the
+    # documented form, while an explicit value=None may be taken as a real
+    # replacement value by newer pandas releases.
+    df = df.replace(to_replace=replacements)
+
+    # step 6
+    # drop all the original needs columns
+    df = df.drop(columns=needs_columns)
+
+    # step 7
+    # add the Lat/Lng columns
+    df["Latitude"] = df["PostalCode"].apply(get_lat)
+    df["Longitude"] = df["PostalCode"].apply(get_lng)
+
+    return df