From 3312dff5354af94bcaa14c1c4094ccc7559cfd6d Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Tue, 14 Apr 2020 20:33:45 -0500
Subject: [PATCH 01/11] Add initial scripts to cleanup data

---
 .gitignore                      |   3 +-
 README.md                       |   5 +-
 cleanup_all_covid_calls.py      | 155 +++++++++++++++++++++++++++++
 cleanup_keep_calm_with_covid.py | 167 ++++++++++++++++++++++++++++++++
 4 files changed, 327 insertions(+), 3 deletions(-)
 create mode 100644 cleanup_all_covid_calls.py
 create mode 100644 cleanup_keep_calm_with_covid.py

diff --git a/.gitignore b/.gitignore
index 367d520..2c334a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
-.vscode
+.vscode/
+data/
 
 
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 9405e96..1793847 100644
--- a/README.md
+++ b/README.md
@@ -45,8 +45,9 @@ pip install -r requirements.txt
 
 ## Running the scripts
 
-TBD - but probably something like this:
+TBD - but probably something like this but eventually specifying input and output files 
 
 ```
-python cleanup /path/to/file1.xlsx /path/to/file2.xlsx
+python cleanup_keep_calm_with_covid.py
+python cleanup_all_covid_calls.py
 ```
diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
new file mode 100644
index 0000000..9542044
--- /dev/null
+++ b/cleanup_all_covid_calls.py
@@ -0,0 +1,155 @@
+import pandas as pd
+from uszipcode import SearchEngine
+import numpy as np
+from datetime import datetime
+
+file = "Data from 4.2.20 Fake Data.xlsx"
+
+# read all sheets, returns a dict of dataframes
+dfs = pd.read_excel(file, sheet_name=None)
+
+### Cleanup for All COVID Calls dashboard
+
+# step 1
+# select required columns from VIA LINK’s Disaster Form
+# pretty sure the distaster form is "Uncleaned data type 1 VIA LINK"
+VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
+    "CallReportNum",
+    "ReportVersion",
+    "CallDateAndTimeStart",
+    "CityName",
+    "CountyName",
+    "StateProvince",
+    "PostalCode",
+    "Client Information - Age Group",
+    "Client Information - Call Type",
+    "Client Information - Identifies as",
+    "Concerns/Needs - Concerns/Needs",
+    "Contact Source - Program ",  # ending space is needed
+    "Needs - Basic Needs Requested",
+]
+vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER]
+
+# step 2
+# select required columns from 232-Help’s Disaster Form
+TWO32_HELP_REQUIRED_COLUMNS = [
+    "CallReportNum",
+    "ReportVersion",
+    "CallDateAndTimeStart",
+    "CityName",
+    "CountyName",
+    "StateProvince",
+    "PostalCode",
+    "Client Information - Date of Birth",
+    "Client Information - Call Type",
+    "Call Outcome - What concerns/needs were identified?",
+    "Client Information - Identifies as",
+    "Needs - Basic Needs Requested",
+]
+two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS]
+
+# step 3
+# Create age ranges from date of birth
+# use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
+now = datetime.now()
+bins = [0, 5, 12, 17, 24, 40, 59, 150]
+labels = ["0-5", "6-12", "13-17", "18-24", "24-40", "41-49", "60+"]
+dob = pd.to_datetime(
+    two32_help_df["Client Information - Date of Birth"], errors="coerce"
+)
+years_old = (now - dob).astype("timedelta64[Y]")
+age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True)
+two32_help_df["Client Information - Age Group"] = age_range
+# remove original Date of Birth column
+two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True)
+
+# step 4
+# add "Data From" column
+vialink1_df["Data From"] = "VIA LINK"
+two32_help_df["Data From"] = "232-HELP"
+
+
+# step 5
+# add data to master spreadsheet
+# first merge "Call Outcome - What concerns/needs were identified" from 232-HELP
+# into "Concerns/Needs - Concerns/Needs"
+two32_help_df.rename(
+    columns={
+        "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs"
+    },
+    inplace=True,
+)
+
+
+# new steps
+# cleanup invalid values
+vialink1_df["Contact Source - Program "].replace(
+    to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, inplace=True
+)
+
+
+# then combine data
+master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)
+
+
+# step 6
+# add lat/lon columns
+search = SearchEngine(simple_zipcode=True)
+
+
+# todo: pull this into a utils module
+def get_lat(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lat = search.by_zipcode(int(zipcode)).lat
+        return lat if lat else None
+
+
+def get_lng(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lng = search.by_zipcode(int(zipcode)).lng
+        return lng if lng else None
+
+
+master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
+master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)
+
+
+# step 7
+# first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs"
+cn = "Concerns/Needs - Concerns/Needs"
+master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply(
+    lambda x: "; ".join(x[x.notnull()]), axis=1
+)
+# then explode needs into their own rows
+def explode_needs(df, need_column):
+    df["tmp_needs"] = df[need_column].str.split(";")
+    df = df.explode("tmp_needs")
+    df.drop(columns=[need_column], inplace=True)
+    df.rename(columns={"tmp_needs": need_column}, inplace=True)
+    return df
+
+
+master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True)
+master_df.rename(columns={"all_needs": cn}, inplace=True)
+master_df = explode_needs(master_df, cn)
+
+
+# step 8
+# cleanup Concerns/Needs
+master_df = master_df[master_df[cn] != "Hangup / Wrong #"]
+master_df[cn] = master_df[cn].str.strip()
+master_df = master_df.replace(
+    {
+        "Health Complications / Concerns": "Health Complications",
+        "Other (please specify caller need in call notes)": "Other",
+    }
+)
+
+# write out spreadsheet
+master_df.to_excel(
+    "data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned"
+)
diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py
new file mode 100644
index 0000000..0d85893
--- /dev/null
+++ b/cleanup_keep_calm_with_covid.py
@@ -0,0 +1,167 @@
+import pandas as pd
+from uszipcode import SearchEngine
+import numpy as np
+from datetime import datetime
+
+file = "Data from 4.2.20 Fake Data.xlsx"
+
+# read all sheets, returns a dict of dataframes
+dfs = pd.read_excel(file, sheet_name=None)
+
+
+### Cleanup for Keeping Calm with COVID dashboard
+converters = {
+    "Concerns/Needs  - Disaster Services ": str,
+    "Concerns/Needs  - Domestic Abuse/IPV": str,
+    "Concerns/Needs  - Early Childhood Education ": str,
+    "Concerns/Needs  - Education/ Employment ": str,
+    "Concerns/Needs  - Environmental Quality & Prtcn ": str,
+    "Concerns/Needs  - Health Care ": str,
+    "Concerns/Needs  - Interpersonal": str,
+    "Concerns/Needs  - Mental Health": str,
+    "Concerns/Needs  - Mental Health Concerns": str,
+    "Concerns/Needs  - Organizational Development": str,
+    "Concerns/Needs  - Other ": str,
+    "Concerns/Needs  - Other Community Services": str,
+    "Concerns/Needs  - Protective Service/Abuse": str,
+    "Concerns/Needs  - Public Asst & Social Insurance": str,
+    "Concerns/Needs  - Relationship Concerns / Issues ": str,
+    "Concerns/Needs  - Self-Harm": str,
+    "Concerns/Needs  - Sexuality": str,
+}
+df = pd.read_excel(
+    file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters
+)
+
+
+# step 1
+# pretty sure the call reports form is "Uncleaned data type 2 VIA LINK"
+
+# todo: why not use all conerns/needs?  why only DD-DT?
+# needs_columns = [c for c in dfs["Uncleaned data type 2 VIA LINK"] if c.startswith("Concerns/Needs")]
+needs_columns = [
+    # "Concerns/Needs  - N/A - must list WHY",
+    # "Concerns/Needs  - Addictive Disorders",
+    # "Concerns/Needs  - Arts & Culture ",
+    # "Concerns/Needs  - Basic Needs ",
+    # "Concerns/Needs  - Campus Information",
+    # "Concerns/Needs  - Consumer Services ",
+    # "Concerns/Needs  - Criminal Justice & Legal Srvcs ",
+    "Concerns/Needs  - Disaster Services ",
+    "Concerns/Needs  - Domestic Abuse/IPV",
+    "Concerns/Needs  - Early Childhood Education ",
+    "Concerns/Needs  - Education/ Employment ",
+    "Concerns/Needs  - Environmental Quality & Prtcn ",
+    "Concerns/Needs  - Health Care ",
+    "Concerns/Needs  - Interpersonal",
+    "Concerns/Needs  - Mental Health",
+    "Concerns/Needs  - Mental Health Concerns",
+    "Concerns/Needs  - Organizational Development",
+    "Concerns/Needs  - Other ",
+    "Concerns/Needs  - Other Community Services",
+    "Concerns/Needs  - Protective Service/Abuse",
+    "Concerns/Needs  - Public Asst & Social Insurance",
+    "Concerns/Needs  - Relationship Concerns / Issues ",
+    "Concerns/Needs  - Self-Harm",
+    "Concerns/Needs  - Sexuality",
+    # "Concerns/Needs  - Suicide Related",
+    # "Concerns/Needs  - Validity Question",
+    # "Concerns/Needs  - Victim Assistance / Survivor Support ",
+    # "Concerns/Needs  - Violence",
+    # "Concerns/Needs  - xxx1",
+    # "Concerns/Needs  - xxx2",
+]
+
+VIA_LINK_REQUIRED_COLUMNS_CALLS = [
+    "CallReportNum",
+    "ReportVersion",
+    "CallDateAndTimeStart",
+    "CityName",
+    "CountyName",
+    "StateProvince",
+    "PostalCode",
+    "Call Information - Program",
+    "Demographics - Age",
+    "Demographics - Gender",
+] + needs_columns
+vialink2_df = dfs["Uncleaned data type 2 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_CALLS]
+
+# step 2
+# remove calls not from LA Spirit line
+vialink2_df = vialink2_df[
+    vialink2_df["Call Information - Program"] == "LA Spirit Crisis Line"
+]
+
+
+# step 3
+# combine all needs column into 1 column
+all_needs = "Concerns/Needs - Concerns/Needs"
+vialink2_df[all_needs] = vialink2_df[needs_columns].apply(
+    lambda x: "; ".join(x[x.notnull()]), axis=1
+)
+
+
+# todo: pull this into a utils module
+def explode_needs(df, need_column):
+    df["tmp_needs"] = df[need_column].str.split(";")
+    df = df.explode("tmp_needs")
+    df.drop(columns=[need_column], inplace=True)
+    df.rename(columns={"tmp_needs": need_column}, inplace=True)
+    return df
+
+
+vialink2_df = explode_needs(vialink2_df, all_needs)
+
+# step 4
+# add "Data From" column
+vialink2_df["Data From"] = "VIA LINK"
+
+# step 5
+# cleanup Concerns/Needs Data
+
+vialink2_df = vialink2_df[vialink2_df[all_needs] != "Wrong #"]
+vialink2_df = vialink2_df[vialink2_df[all_needs] != "hangup"]
+vialink2_df = vialink2_df.replace(
+    {
+        "Concerns/Needs  - Interpersonal": "Interpersonal Conflict",
+        "Food": "Food/Meals",
+        "Interpersonal Conflict": "Income Support/Assistance",
+    }
+)
+
+
+# step 6
+# drop all the original needs columns
+vialink2_df.drop(columns=needs_columns, inplace=True)
+
+
+# step 7
+# add the Lat/Lng columns
+
+# todo: pull this into a utils module
+search = SearchEngine(simple_zipcode=True)
+
+
+def get_lat(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lat = search.by_zipcode(int(zipcode)).lat
+        return lat if lat else None
+
+
+def get_lng(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lng = search.by_zipcode(int(zipcode)).lng
+        return lng if lng else None
+
+
+vialink2_df["Latitude"] = vialink2_df["PostalCode"].apply(get_lat)
+vialink2_df["Longitude"] = vialink2_df["PostalCode"].apply(get_lng)
+
+
+vialink2_df.to_excel(
+    "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned"
+)

From d2a6a2bc3c0599193491429e52b9cc75c1162767 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Thu, 16 Apr 2020 08:48:00 -0500
Subject: [PATCH 02/11] Refactoring, adding more text replacements

---
 cleanup_all_covid_calls.py      | 273 ++++++++++++++------------------
 cleanup_keep_calm_with_covid.py | 255 +++++++++++------------------
 utils.py                        |  45 ++++++
 3 files changed, 262 insertions(+), 311 deletions(-)
 create mode 100644 utils.py

diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
index 9542044..851749a 100644
--- a/cleanup_all_covid_calls.py
+++ b/cleanup_all_covid_calls.py
@@ -1,155 +1,124 @@
 import pandas as pd
-from uszipcode import SearchEngine
 import numpy as np
 from datetime import datetime
-
-file = "Data from 4.2.20 Fake Data.xlsx"
-
-# read all sheets, returns a dict of dataframes
-dfs = pd.read_excel(file, sheet_name=None)
-
-### Cleanup for All COVID Calls dashboard
-
-# step 1
-# select required columns from VIA LINK’s Disaster Form
-# pretty sure the distaster form is "Uncleaned data type 1 VIA LINK"
-VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
-    "CallReportNum",
-    "ReportVersion",
-    "CallDateAndTimeStart",
-    "CityName",
-    "CountyName",
-    "StateProvince",
-    "PostalCode",
-    "Client Information - Age Group",
-    "Client Information - Call Type",
-    "Client Information - Identifies as",
-    "Concerns/Needs - Concerns/Needs",
-    "Contact Source - Program ",  # ending space is needed
-    "Needs - Basic Needs Requested",
-]
-vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER]
-
-# step 2
-# select required columns from 232-Help’s Disaster Form
-TWO32_HELP_REQUIRED_COLUMNS = [
-    "CallReportNum",
-    "ReportVersion",
-    "CallDateAndTimeStart",
-    "CityName",
-    "CountyName",
-    "StateProvince",
-    "PostalCode",
-    "Client Information - Date of Birth",
-    "Client Information - Call Type",
-    "Call Outcome - What concerns/needs were identified?",
-    "Client Information - Identifies as",
-    "Needs - Basic Needs Requested",
-]
-two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS]
-
-# step 3
-# Create age ranges from date of birth
-# use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
-now = datetime.now()
-bins = [0, 5, 12, 17, 24, 40, 59, 150]
-labels = ["0-5", "6-12", "13-17", "18-24", "24-40", "41-49", "60+"]
-dob = pd.to_datetime(
-    two32_help_df["Client Information - Date of Birth"], errors="coerce"
-)
-years_old = (now - dob).astype("timedelta64[Y]")
-age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True)
-two32_help_df["Client Information - Age Group"] = age_range
-# remove original Date of Birth column
-two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True)
-
-# step 4
-# add "Data From" column
-vialink1_df["Data From"] = "VIA LINK"
-two32_help_df["Data From"] = "232-HELP"
-
-
-# step 5
-# add data to master spreadsheet
-# first merge "Call Outcome - What concerns/needs were identified" from 232-HELP
-# into "Concerns/Needs - Concerns/Needs"
-two32_help_df.rename(
-    columns={
-        "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs"
-    },
-    inplace=True,
-)
-
-
-# new steps
-# cleanup invalid values
-vialink1_df["Contact Source - Program "].replace(
-    to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, inplace=True
-)
-
-
-# then combine data
-master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)
-
-
-# step 6
-# add lat/lon columns
-search = SearchEngine(simple_zipcode=True)
-
-
-# todo: pull this into a utils module
-def get_lat(zipcode):
-    if pd.isnull(zipcode):
-        return None
-    else:
-        lat = search.by_zipcode(int(zipcode)).lat
-        return lat if lat else None
-
-
-def get_lng(zipcode):
-    if pd.isnull(zipcode):
-        return None
-    else:
-        lng = search.by_zipcode(int(zipcode)).lng
-        return lng if lng else None
-
-
-master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
-master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)
-
-
-# step 7
-# first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs"
-cn = "Concerns/Needs - Concerns/Needs"
-master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply(
-    lambda x: "; ".join(x[x.notnull()]), axis=1
-)
-# then explode needs into their own rows
-def explode_needs(df, need_column):
-    df["tmp_needs"] = df[need_column].str.split(";")
-    df = df.explode("tmp_needs")
-    df.drop(columns=[need_column], inplace=True)
-    df.rename(columns={"tmp_needs": need_column}, inplace=True)
-    return df
-
-
-master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True)
-master_df.rename(columns={"all_needs": cn}, inplace=True)
-master_df = explode_needs(master_df, cn)
-
-
-# step 8
-# cleanup Concerns/Needs
-master_df = master_df[master_df[cn] != "Hangup / Wrong #"]
-master_df[cn] = master_df[cn].str.strip()
-master_df = master_df.replace(
-    {
-        "Health Complications / Concerns": "Health Complications",
-        "Other (please specify caller need in call notes)": "Other",
-    }
-)
-
-# write out spreadsheet
-master_df.to_excel(
-    "data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned"
-)
+from utils import explode_needs, get_lat, get_lng, replacements
+
+
+def cleanup(dfs):
+    ### Cleanup for All COVID Calls dashboard
+
+    # step 1
+    # select required columns from VIA LINK’s Disaster Form
+    # pretty sure the distaster form is "Uncleaned data type 1 VIA LINK"
+    VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Client Information - Age Group",
+        "Client Information - Call Type",
+        "Client Information - Identifies as",
+        "Concerns/Needs - Concerns/Needs",
+        "Contact Source - Program ",  # ending space is needed
+        "Needs - Basic Needs Requested",
+    ]
+    vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][
+        VIA_LINK_REQUIRED_COLUMNS_DISASTER
+    ]
+
+    # step 2
+    # select required columns from 232-Help’s Disaster Form
+    TWO32_HELP_REQUIRED_COLUMNS = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Client Information - Date of Birth",
+        "Client Information - Call Type",
+        "Call Outcome - What concerns/needs were identified?",
+        "Client Information - Identifies as",
+        "Needs - Basic Needs Requested",
+    ]
+    two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS]
+
+    # step 3
+    # Create age ranges from date of birth
+    # pd.cut bins are right-inclusive: 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
+    now = datetime.now()
+    bins = [0, 5, 12, 17, 24, 40, 59, 150]
+    labels = ["0-5", "6-12", "13-17", "18-24", "25-40", "41-59", "60+"]
+    dob = pd.to_datetime(
+        two32_help_df["Client Information - Date of Birth"], errors="coerce"
+    )
+    years_old = (now - dob).astype("timedelta64[Y]")
+    age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True)
+    two32_help_df["Client Information - Age Group"] = age_range
+    # remove original Date of Birth column
+    two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True)
+
+    # step 4
+    # add "Data From" column
+    vialink1_df["Data From"] = "VIA LINK"
+    two32_help_df["Data From"] = "232-HELP"
+
+    # step 5
+    # add data to master spreadsheet
+    # first merge "Call Outcome - What concerns/needs were identified" from 232-HELP
+    # into "Concerns/Needs - Concerns/Needs"
+    two32_help_df.rename(
+        columns={
+            "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs"
+        },
+        inplace=True,
+    )
+
+    # new steps
+    # cleanup invalid values
+    vialink1_df["Contact Source - Program "].replace(
+        to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, inplace=True
+    )
+
+    # then combine data
+    master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)
+
+    # step 6
+    # add lat/lon columns
+    master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
+    master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)
+
+    # step 7
+    # first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs"
+    cn = "Concerns/Needs - Concerns/Needs"
+    master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply(
+        lambda x: "; ".join(x[x.notnull()]), axis=1
+    )
+    master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True)
+    master_df.rename(columns={"all_needs": cn}, inplace=True)
+    master_df = explode_needs(master_df, cn)
+
+    # step 8
+    # cleanup Concerns/Needs
+    master_df[cn] = master_df[cn].str.strip()
+    master_df = master_df[master_df[cn] != "Hangup / Wrong Number"]
+    master_df = master_df[master_df[cn] != "Hangup / Wrong #"]
+    master_df = master_df.replace(to_replace=replacements, value=None, inplace=True)
+
+    return master_df
+
+
+if __name__ == "__main__":
+    file = "Data from 4.2.20 Fake Data.xlsx"
+
+    # read all sheets, returns a dict of dataframes
+    dfs = pd.read_excel(file, sheet_name=None)
+
+    df = cleanup(dfs)
+
+    # write out spreadsheet
+    df.to_excel("data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned")
diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py
index 0d85893..b5ed4e3 100644
--- a/cleanup_keep_calm_with_covid.py
+++ b/cleanup_keep_calm_with_covid.py
@@ -1,167 +1,104 @@
 import pandas as pd
-from uszipcode import SearchEngine
 import numpy as np
 from datetime import datetime
+from utils import explode_needs, get_lat, get_lng, replacements
+
+
+def cleanup(df):
+    ### Cleanup for Keeping Calm with COVID dashboard
+    # step 1
+    # select only the required columns
+    needs_columns = [
+        "Concerns/Needs  - Disaster Services ",
+        "Concerns/Needs  - Domestic Abuse/IPV",
+        "Concerns/Needs  - Early Childhood Education ",
+        "Concerns/Needs  - Education/ Employment ",
+        "Concerns/Needs  - Environmental Quality & Prtcn ",
+        "Concerns/Needs  - Health Care ",
+        "Concerns/Needs  - Interpersonal",
+        "Concerns/Needs  - Mental Health",
+        "Concerns/Needs  - Mental Health Concerns",
+        "Concerns/Needs  - Organizational Development",
+        "Concerns/Needs  - Other ",
+        "Concerns/Needs  - Other Community Services",
+        "Concerns/Needs  - Protective Service/Abuse",
+        "Concerns/Needs  - Public Asst & Social Insurance",
+        "Concerns/Needs  - Relationship Concerns / Issues ",
+        "Concerns/Needs  - Self-Harm",
+        "Concerns/Needs  - Sexuality",
+    ]
+    VIA_LINK_REQUIRED_COLUMNS_CALLS = [
+        "CallReportNum",
+        "ReportVersion",
+        "CallDateAndTimeStart",
+        "CityName",
+        "CountyName",
+        "StateProvince",
+        "PostalCode",
+        "Call Information - Program",
+        "Demographics - Age",
+        "Demographics - Gender",
+    ] + needs_columns
+    df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS]
+
+    # step 2
+    # remove calls not from LA Spirit line
+    df = df[df["Call Information - Program"] == "LA Spirit Crisis Line"]
+
+    # step 3
+    # combine all needs column into 1 column
+    all_needs = "Concerns/Needs - Concerns/Needs"
+    df[all_needs] = df[needs_columns].apply(lambda x: "; ".join(x[x.notnull()]), axis=1)
+    df = explode_needs(df, all_needs)
+
+    # step 4
+    # add "Data From" column
+    df["Data From"] = "VIA LINK"
+
+    # step 5
+    # cleanup Concerns/Needs Data
+    df[all_needs] = df[all_needs].str.strip()
+    df = df[df[all_needs] != "Wrong #"]
+    df = df[df[all_needs] != "hangup"]
+    df.replace(to_replace=replacements, value=None, inplace=True)
+
+    # step 6
+    # drop all the original needs columns
+    df.drop(columns=needs_columns, inplace=True)
+
+    # step 7
+    # add the Lat/Lng columns
+    df["Latitude"] = df["PostalCode"].apply(get_lat)
+    df["Longitude"] = df["PostalCode"].apply(get_lng)
 
-file = "Data from 4.2.20 Fake Data.xlsx"
-
-# read all sheets, returns a dict of dataframes
-dfs = pd.read_excel(file, sheet_name=None)
-
-
-### Cleanup for Keeping Calm with COVID dashboard
-converters = {
-    "Concerns/Needs  - Disaster Services ": str,
-    "Concerns/Needs  - Domestic Abuse/IPV": str,
-    "Concerns/Needs  - Early Childhood Education ": str,
-    "Concerns/Needs  - Education/ Employment ": str,
-    "Concerns/Needs  - Environmental Quality & Prtcn ": str,
-    "Concerns/Needs  - Health Care ": str,
-    "Concerns/Needs  - Interpersonal": str,
-    "Concerns/Needs  - Mental Health": str,
-    "Concerns/Needs  - Mental Health Concerns": str,
-    "Concerns/Needs  - Organizational Development": str,
-    "Concerns/Needs  - Other ": str,
-    "Concerns/Needs  - Other Community Services": str,
-    "Concerns/Needs  - Protective Service/Abuse": str,
-    "Concerns/Needs  - Public Asst & Social Insurance": str,
-    "Concerns/Needs  - Relationship Concerns / Issues ": str,
-    "Concerns/Needs  - Self-Harm": str,
-    "Concerns/Needs  - Sexuality": str,
-}
-df = pd.read_excel(
-    file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters
-)
-
-
-# step 1
-# pretty sure the call reports form is "Uncleaned data type 2 VIA LINK"
-
-# todo: why not use all conerns/needs?  why only DD-DT?
-# needs_columns = [c for c in dfs["Uncleaned data type 2 VIA LINK"] if c.startswith("Concerns/Needs")]
-needs_columns = [
-    # "Concerns/Needs  - N/A - must list WHY",
-    # "Concerns/Needs  - Addictive Disorders",
-    # "Concerns/Needs  - Arts & Culture ",
-    # "Concerns/Needs  - Basic Needs ",
-    # "Concerns/Needs  - Campus Information",
-    # "Concerns/Needs  - Consumer Services ",
-    # "Concerns/Needs  - Criminal Justice & Legal Srvcs ",
-    "Concerns/Needs  - Disaster Services ",
-    "Concerns/Needs  - Domestic Abuse/IPV",
-    "Concerns/Needs  - Early Childhood Education ",
-    "Concerns/Needs  - Education/ Employment ",
-    "Concerns/Needs  - Environmental Quality & Prtcn ",
-    "Concerns/Needs  - Health Care ",
-    "Concerns/Needs  - Interpersonal",
-    "Concerns/Needs  - Mental Health",
-    "Concerns/Needs  - Mental Health Concerns",
-    "Concerns/Needs  - Organizational Development",
-    "Concerns/Needs  - Other ",
-    "Concerns/Needs  - Other Community Services",
-    "Concerns/Needs  - Protective Service/Abuse",
-    "Concerns/Needs  - Public Asst & Social Insurance",
-    "Concerns/Needs  - Relationship Concerns / Issues ",
-    "Concerns/Needs  - Self-Harm",
-    "Concerns/Needs  - Sexuality",
-    # "Concerns/Needs  - Suicide Related",
-    # "Concerns/Needs  - Validity Question",
-    # "Concerns/Needs  - Victim Assistance / Survivor Support ",
-    # "Concerns/Needs  - Violence",
-    # "Concerns/Needs  - xxx1",
-    # "Concerns/Needs  - xxx2",
-]
-
-VIA_LINK_REQUIRED_COLUMNS_CALLS = [
-    "CallReportNum",
-    "ReportVersion",
-    "CallDateAndTimeStart",
-    "CityName",
-    "CountyName",
-    "StateProvince",
-    "PostalCode",
-    "Call Information - Program",
-    "Demographics - Age",
-    "Demographics - Gender",
-] + needs_columns
-vialink2_df = dfs["Uncleaned data type 2 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_CALLS]
-
-# step 2
-# remove calls not from LA Spirit line
-vialink2_df = vialink2_df[
-    vialink2_df["Call Information - Program"] == "LA Spirit Crisis Line"
-]
-
-
-# step 3
-# combine all needs column into 1 column
-all_needs = "Concerns/Needs - Concerns/Needs"
-vialink2_df[all_needs] = vialink2_df[needs_columns].apply(
-    lambda x: "; ".join(x[x.notnull()]), axis=1
-)
-
-
-# todo: pull this into a utils module
-def explode_needs(df, need_column):
-    df["tmp_needs"] = df[need_column].str.split(";")
-    df = df.explode("tmp_needs")
-    df.drop(columns=[need_column], inplace=True)
-    df.rename(columns={"tmp_needs": need_column}, inplace=True)
     return df
 
 
-vialink2_df = explode_needs(vialink2_df, all_needs)
-
-# step 4
-# add "Data From" column
-vialink2_df["Data From"] = "VIA LINK"
-
-# step 5
-# cleanup Concerns/Needs Data
-
-vialink2_df = vialink2_df[vialink2_df[all_needs] != "Wrong #"]
-vialink2_df = vialink2_df[vialink2_df[all_needs] != "hangup"]
-vialink2_df = vialink2_df.replace(
-    {
-        "Concerns/Needs  - Interpersonal": "Interpersonal Conflict",
-        "Food": "Food/Meals",
-        "Interpersonal Conflict": "Income Support/Assistance",
+if __name__ == "__main__":
+    file = "Data from 4.2.20 Fake Data.xlsx"
+    converters = {
+        "Concerns/Needs  - Disaster Services ": str,
+        "Concerns/Needs  - Domestic Abuse/IPV": str,
+        "Concerns/Needs  - Early Childhood Education ": str,
+        "Concerns/Needs  - Education/ Employment ": str,
+        "Concerns/Needs  - Environmental Quality & Prtcn ": str,
+        "Concerns/Needs  - Health Care ": str,
+        "Concerns/Needs  - Interpersonal": str,
+        "Concerns/Needs  - Mental Health": str,
+        "Concerns/Needs  - Mental Health Concerns": str,
+        "Concerns/Needs  - Organizational Development": str,
+        "Concerns/Needs  - Other ": str,
+        "Concerns/Needs  - Other Community Services": str,
+        "Concerns/Needs  - Protective Service/Abuse": str,
+        "Concerns/Needs  - Public Asst & Social Insurance": str,
+        "Concerns/Needs  - Relationship Concerns / Issues ": str,
+        "Concerns/Needs  - Self-Harm": str,
+        "Concerns/Needs  - Sexuality": str,
     }
-)
-
-
-# step 6
-# drop all the original needs columns
-vialink2_df.drop(columns=needs_columns, inplace=True)
-
-
-# step 7
-# add the Lat/Lng columns
-
-# todo: pull this into a utils module
-search = SearchEngine(simple_zipcode=True)
-
-
-def get_lat(zipcode):
-    if pd.isnull(zipcode):
-        return None
-    else:
-        lat = search.by_zipcode(int(zipcode)).lat
-        return lat if lat else None
-
-
-def get_lng(zipcode):
-    if pd.isnull(zipcode):
-        return None
-    else:
-        lng = search.by_zipcode(int(zipcode)).lng
-        return lng if lng else None
-
-
-vialink2_df["Latitude"] = vialink2_df["PostalCode"].apply(get_lat)
-vialink2_df["Longitude"] = vialink2_df["PostalCode"].apply(get_lng)
-
-
-vialink2_df.to_excel(
-    "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned"
-)
+    df = pd.read_excel(
+        file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters
+    )
+    df = cleanup(df)
+    df.to_excel(
+        "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned"
+    )
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..af9e1bb
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,45 @@
+import pandas as pd
+from uszipcode import SearchEngine
+
+search = SearchEngine(simple_zipcode=True)
+
+
+def get_lat(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lat = search.by_zipcode(int(zipcode)).lat
+        return lat if lat else None
+
+
+def get_lng(zipcode):
+    if pd.isnull(zipcode):
+        return None
+    else:
+        lng = search.by_zipcode(int(zipcode)).lng
+        return lng if lng else None
+
+
+def explode_needs(df, need_column):
+    df["tmp_needs"] = df[need_column].str.split(";")
+    df = df.explode("tmp_needs")
+    df.drop(columns=[need_column], inplace=True)
+    df.rename(columns={"tmp_needs": need_column}, inplace=True)
+    return df
+
+
+replacements = {
+    "†": "",  # stray cross mark that shows up in some entries; blank it out
+    "Employment": "Employment Services",
+    "Food": "Food/Meals",
+    "I'm Sick (what next?)": "I'm Sick (What's Next?)",
+    "I'm Sick (Whats Next?)": "I'm Sick (What's Next?)",
+    "information only call": "",
+    "Inquires about Health Complications / Concerns": "Inquires about Health Complications",
+    "International Travel Concerns": "International / General Travel Concerns",
+    "Legal Consumer": "Legal Assistance",
+    "Other - Interpersonal": "Other",
+    "Other (PLEASE Specify Caller Need in Call Notes)": "Other",
+    "other 2-1-1 referral": "Other",
+    "Unemployment": "Unemployment Benefits",
+}

From 6cd086436e701a56183e114c5b937b558e80dab4 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Sat, 18 Apr 2020 12:03:00 -0500
Subject: [PATCH 03/11] Fix bug and suppress SettingWithCopyWarning

---
 cleanup_all_covid_calls.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
index 851749a..596e21d 100644
--- a/cleanup_all_covid_calls.py
+++ b/cleanup_all_covid_calls.py
@@ -2,6 +2,8 @@
 import numpy as np
 from datetime import datetime
 from utils import explode_needs, get_lat, get_lng, replacements
+pd.options.mode.chained_assignment = None
+
 
 
 def cleanup(dfs):
@@ -107,7 +109,7 @@ def cleanup(dfs):
     master_df[cn] = master_df[cn].str.strip()
     master_df = master_df[master_df[cn] != "Hangup / Wrong Number"]
     master_df = master_df[master_df[cn] != "Hangup / Wrong #"]
-    master_df = master_df.replace(to_replace=replacements, value=None, inplace=True)
+    master_df.replace(to_replace=replacements, value=None, inplace=True)
 
     return master_df
 

From 2b7db9e7966a7373ef59837cdcbc643395a2e35f Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Sat, 18 Apr 2020 13:39:54 -0500
Subject: [PATCH 04/11] Adding CLI using click

---
 README.md                       | 13 ++++--
 cleanup.py                      | 73 +++++++++++++++++++++++++++++++++
 cleanup_all_covid_calls.py      | 14 ++++---
 cleanup_keep_calm_with_covid.py | 41 +++++++++---------
 requirements.txt                |  1 +
 utils.py                        | 11 +++++
 6 files changed, 125 insertions(+), 28 deletions(-)
 create mode 100644 cleanup.py

diff --git a/README.md b/README.md
index 1793847..0544215 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,16 @@ pip install -r requirements.txt
 
 ## Running the scripts
 
-TBD - but probably something like this but eventually specifying input and output files 
+The basic format is `python cleanup.py <command> --input <path/to/input.xlsx>` 
 
+For example:
 ```
-python cleanup_keep_calm_with_covid.py
-python cleanup_all_covid_calls.py
+python cleanup.py all-covid-calls --input "Data from 4.2.20 Fake Data.xlsx"
+python cleanup.py --debug keep-calm-with-covid --input "Data from 4.2.20 Fake Data.xlsx"  --sheetname "Uncleaned data type 2 VIA LINK"
+```
+
+If you want to see the basic usage you can run `python cleanup.py` and for a specifc command you can use the `--help` flag
+
+```
+python cleanup.py all-covid-calls --help
 ```
diff --git a/cleanup.py b/cleanup.py
new file mode 100644
index 0000000..f44e36f
--- /dev/null
+++ b/cleanup.py
@@ -0,0 +1,73 @@
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[logging.StreamHandler()],
+)
+import os
+import sys
+
+import click
+import pandas as pd
+
+from cleanup_all_covid_calls import cleanup as cleanup_all_covid_calls
+from cleanup_keep_calm_with_covid import (
+    CONVERTERS,
+    cleanup as cleanup_keep_calm_with_covid,
+)
+from utils import write_output_file
+
+
+@click.group()
+@click.option("--debug/--no-debug", default=False)
+@click.pass_context
+def cleanup(ctx, debug):
+    ctx.ensure_object(dict)
+    ctx.obj["DEBUG"] = debug
+
+
+@cleanup.command()
+@click.pass_context
+@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)")
+@click.option("--sheetname", default=None, help="Name of the sheet to use")
+@click.option(
+    "--output",
+    default="data/all_covid_calls_cleaned.xlsx",
+    help="Path to the output spreadsheet (cleaned .xlsx file)",
+)
+def all_covid_calls(ctx, infile, sheetname, output):
+    if ctx.obj["DEBUG"]:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logging.debug("Running in debug mode")
+    logging.debug(f"Reading input file '{infile}'")
+    df = pd.read_excel(infile, sheet_name=sheetname)
+    logging.info("Cleaning data for All COVID Calls Dashboard")
+    df = cleanup_all_covid_calls(df)
+    logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
+    write_output_file(df, output)
+
+
+@cleanup.command()
+@click.pass_context
+@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)")
+@click.option("--sheetname", required=True, help="Name of the sheet to use")
+@click.option(
+    "--output",
+    default="data/keep_calm_with_covid_cleaned.xlsx",
+    help="Path to the output spreadsheet (cleaned .xlsx file)",
+)
+def keep_calm_with_covid(ctx, infile, sheetname, output):
+    if ctx.obj["DEBUG"]:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logging.debug("Running in debug mode")
+    logging.debug(f"Reading input file '{infile}'")
+    df = pd.read_excel(infile, sheet_name=sheetname, converters=CONVERTERS)
+    logging.info("Cleaning data for Keep Calm with COVID Dashboard")
+    cleanup_keep_calm_with_covid(df)
+    logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
+    write_output_file(df, output)
+
+
+if __name__ == "__main__":
+    cleanup(obj={})
diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
index 596e21d..30c36aa 100644
--- a/cleanup_all_covid_calls.py
+++ b/cleanup_all_covid_calls.py
@@ -1,9 +1,15 @@
 import pandas as pd
 import numpy as np
 from datetime import datetime
-from utils import explode_needs, get_lat, get_lng, replacements
-pd.options.mode.chained_assignment = None
+from utils import (
+    explode_needs,
+    get_lat,
+    get_lng,
+    replacements,
+    write_output_file,
+)
 
+pd.options.mode.chained_assignment = None
 
 
 def cleanup(dfs):
@@ -121,6 +127,4 @@ def cleanup(dfs):
     dfs = pd.read_excel(file, sheet_name=None)
 
     df = cleanup(dfs)
-
-    # write out spreadsheet
-    df.to_excel("data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned")
+    write_output_file(df, "data/all_covid_calls_cleaned.xlsx")
diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py
index b5ed4e3..baa6b12 100644
--- a/cleanup_keep_calm_with_covid.py
+++ b/cleanup_keep_calm_with_covid.py
@@ -3,6 +3,26 @@
 from datetime import datetime
 from utils import explode_needs, get_lat, get_lng, replacements
 
+CONVERTERS = {
+    "Concerns/Needs  - Disaster Services ": str,
+    "Concerns/Needs  - Domestic Abuse/IPV": str,
+    "Concerns/Needs  - Early Childhood Education ": str,
+    "Concerns/Needs  - Education/ Employment ": str,
+    "Concerns/Needs  - Environmental Quality & Prtcn ": str,
+    "Concerns/Needs  - Health Care ": str,
+    "Concerns/Needs  - Interpersonal": str,
+    "Concerns/Needs  - Mental Health": str,
+    "Concerns/Needs  - Mental Health Concerns": str,
+    "Concerns/Needs  - Organizational Development": str,
+    "Concerns/Needs  - Other ": str,
+    "Concerns/Needs  - Other Community Services": str,
+    "Concerns/Needs  - Protective Service/Abuse": str,
+    "Concerns/Needs  - Public Asst & Social Insurance": str,
+    "Concerns/Needs  - Relationship Concerns / Issues ": str,
+    "Concerns/Needs  - Self-Harm": str,
+    "Concerns/Needs  - Sexuality": str,
+}
+
 
 def cleanup(df):
     ### Cleanup for Keeping Calm with COVID dashboard
@@ -76,27 +96,8 @@ def cleanup(df):
 
 if __name__ == "__main__":
     file = "Data from 4.2.20 Fake Data.xlsx"
-    converters = {
-        "Concerns/Needs  - Disaster Services ": str,
-        "Concerns/Needs  - Domestic Abuse/IPV": str,
-        "Concerns/Needs  - Early Childhood Education ": str,
-        "Concerns/Needs  - Education/ Employment ": str,
-        "Concerns/Needs  - Environmental Quality & Prtcn ": str,
-        "Concerns/Needs  - Health Care ": str,
-        "Concerns/Needs  - Interpersonal": str,
-        "Concerns/Needs  - Mental Health": str,
-        "Concerns/Needs  - Mental Health Concerns": str,
-        "Concerns/Needs  - Organizational Development": str,
-        "Concerns/Needs  - Other ": str,
-        "Concerns/Needs  - Other Community Services": str,
-        "Concerns/Needs  - Protective Service/Abuse": str,
-        "Concerns/Needs  - Public Asst & Social Insurance": str,
-        "Concerns/Needs  - Relationship Concerns / Issues ": str,
-        "Concerns/Needs  - Self-Harm": str,
-        "Concerns/Needs  - Sexuality": str,
-    }
     df = pd.read_excel(
-        file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters
+        file, sheet_name="Uncleaned data type 2 VIA LINK", converters=CONVERTERS
     )
     df = cleanup(df)
     df.to_excel(
diff --git a/requirements.txt b/requirements.txt
index da5b8fd..3f07058 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+click==7.1.1
 openpyxl==3.0.3
 pandas==1.0.3
 uszipcode==0.2.4
diff --git a/utils.py b/utils.py
index af9e1bb..def5526 100644
--- a/utils.py
+++ b/utils.py
@@ -1,9 +1,19 @@
+import logging
+import os
 import pandas as pd
 from uszipcode import SearchEngine
 
 search = SearchEngine(simple_zipcode=True)
 
 
+def write_output_file(df, filename, sheet_name="codefornola cleaned"):
+    if filename.startswith("data") and not os.path.exists("data"):
+        logging.debug("Creating data directory")
+        os.makedirs("data")
+    logging.debug(f"Writing sheet '{sheet_name}' into '{filename}'")
+    df.to_excel(filename, sheet_name=sheet_name)  # pass sheet_name through so the workbook matches the log line above
+
+
 def get_lat(zipcode):
     if pd.isnull(zipcode):
         return None
@@ -21,6 +31,7 @@ def get_lng(zipcode):
 
 
 def explode_needs(df, need_column):
+    logging.debug(f"exploding needs into {need_column}")
     df["tmp_needs"] = df[need_column].str.split(";")
     df = df.explode("tmp_needs")
     df.drop(columns=[need_column], inplace=True)

From 4003395bf248a2eb4530c15bf86502afe9f846d7 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Sat, 25 Apr 2020 09:54:45 -0500
Subject: [PATCH 05/11] Adding additional need replacements

---
 utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils.py b/utils.py
index def5526..65172ac 100644
--- a/utils.py
+++ b/utils.py
@@ -43,8 +43,10 @@ def explode_needs(df, need_column):
     "†": "For some reason this cross mark is showing up in some of the entries, so just removing it",
     "Employment": "Employment Services",
     "Food": "Food/Meals",
+    "Food/Meal": "Food/Meals",
     "I'm Sick (what next?)": "I'm Sick (What's Next?)",
     "I'm Sick (Whats Next?)": "I'm Sick (What's Next?)",
+    "Income support/assistance": "Income Support/Assistance",
     "information only call": "",
     "Inquires about Health Complications / Concerns": "Inquires about Health Complications",
     "International Travel Concerns": "International / General Travel Concerns",

From a437ca4a175d4d188da68283ba35c7c56030fe55 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Sat, 25 Apr 2020 09:57:55 -0500
Subject: [PATCH 06/11] Reformatting

---
 cleanup.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/cleanup.py b/cleanup.py
index f44e36f..c05b90c 100644
--- a/cleanup.py
+++ b/cleanup.py
@@ -1,4 +1,5 @@
 import logging
+
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(message)s",
@@ -29,7 +30,12 @@ def cleanup(ctx, debug):
 
 @cleanup.command()
 @click.pass_context
-@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)")
+@click.option(
+    "--input",
+    "infile",
+    required=True,
+    help="Path to the input spreadsheet (.xlsx file)",
+)
 @click.option("--sheetname", default=None, help="Name of the sheet to use")
 @click.option(
     "--output",
@@ -50,7 +56,12 @@ def all_covid_calls(ctx, infile, sheetname, output):
 
 @cleanup.command()
 @click.pass_context
-@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)")
+@click.option(
+    "--input",
+    "infile",
+    required=True,
+    help="Path to the input spreadsheet (.xlsx file)",
+)
 @click.option("--sheetname", required=True, help="Name of the sheet to use")
 @click.option(
     "--output",

From 39ff2f92154d1230803feec98f5b0f12f39b9f4b Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Sat, 25 Apr 2020 10:00:47 -0500
Subject: [PATCH 07/11] Remove cross symbol

---
 utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils.py b/utils.py
index 65172ac..dec2a10 100644
--- a/utils.py
+++ b/utils.py
@@ -40,7 +40,7 @@ def explode_needs(df, need_column):
 
 
 replacements = {
-    "†": "For some reason this cross mark is showing up in some of the entries, so just removing it",
+    "†": "",
     "Employment": "Employment Services",
     "Food": "Food/Meals",
     "Food/Meal": "Food/Meals",

From 9311cff4e65c686b0937d666ea6c0eabdc584b25 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Tue, 5 May 2020 20:34:06 -0500
Subject: [PATCH 08/11] Remove main method from cleanup modules

---
 cleanup_all_covid_calls.py      | 10 ----------
 cleanup_keep_calm_with_covid.py | 11 -----------
 2 files changed, 21 deletions(-)

diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
index 30c36aa..b5e3331 100644
--- a/cleanup_all_covid_calls.py
+++ b/cleanup_all_covid_calls.py
@@ -118,13 +118,3 @@ def cleanup(dfs):
     master_df.replace(to_replace=replacements, value=None, inplace=True)
 
     return master_df
-
-
-if __name__ == "__main__":
-    file = "Data from 4.2.20 Fake Data.xlsx"
-
-    # read all sheets, returns a dict of dataframes
-    dfs = pd.read_excel(file, sheet_name=None)
-
-    df = cleanup(dfs)
-    write_output_file(df, "data/all_covid_calls_cleaned.xlsx")
diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py
index baa6b12..858cd44 100644
--- a/cleanup_keep_calm_with_covid.py
+++ b/cleanup_keep_calm_with_covid.py
@@ -92,14 +92,3 @@ def cleanup(df):
     df["Longitude"] = df["PostalCode"].apply(get_lng)
 
     return df
-
-
-if __name__ == "__main__":
-    file = "Data from 4.2.20 Fake Data.xlsx"
-    df = pd.read_excel(
-        file, sheet_name="Uncleaned data type 2 VIA LINK", converters=CONVERTERS
-    )
-    df = cleanup(df)
-    df.to_excel(
-        "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned"
-    )

From 163056cbdc89d332931dcedfb0e012684908f17c Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Tue, 5 May 2020 20:35:21 -0500
Subject: [PATCH 09/11] Update to read iCarol formatted csv file

---
 cleanup.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cleanup.py b/cleanup.py
index c05b90c..a6cf935 100644
--- a/cleanup.py
+++ b/cleanup.py
@@ -57,23 +57,22 @@ def all_covid_calls(ctx, infile, sheetname, output):
 @cleanup.command()
 @click.pass_context
 @click.option(
-    "--input",
-    "infile",
-    required=True,
-    help="Path to the input spreadsheet (.xlsx file)",
+    "--input", "infile", required=True, help="Path to the input csv file",
 )
-@click.option("--sheetname", required=True, help="Name of the sheet to use")
 @click.option(
     "--output",
     default="data/keep_calm_with_covid_cleaned.xlsx",
     help="Path to the output spreadsheet (cleaned .xlsx file)",
 )
-def keep_calm_with_covid(ctx, infile, sheetname, output):
+def keep_calm_with_covid(ctx, infile, output):
     if ctx.obj["DEBUG"]:
         logging.getLogger().setLevel(logging.DEBUG)
         logging.debug("Running in debug mode")
     logging.debug(f"Reading input file '{infile}'")
-    df = pd.read_excel(infile, sheet_name=sheetname, converters=CONVERTERS)
+    df = pd.read_csv(infile, encoding="ISO-8859-1", converters=CONVERTERS)
+    columns = df.iloc[1].values.tolist()
+    df = df.iloc[2:]
+    df.columns = columns
     logging.info("Cleaning data for Keep Calm with COVID Dashboard")
     cleanup_keep_calm_with_covid(df)
     logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")

From 3d3aba1b15edd0a1d2fb63ed08eea444fbb393d1 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Thu, 7 May 2020 09:30:59 -0500
Subject: [PATCH 10/11] Update all-covid-calls to read 2 csv files

---
 cleanup.py                 | 39 +++++++++++++++++++++++++++-----------
 cleanup_all_covid_calls.py |  6 ++----
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/cleanup.py b/cleanup.py
index a6cf935..14f5969 100644
--- a/cleanup.py
+++ b/cleanup.py
@@ -31,25 +31,37 @@ def cleanup(ctx, debug):
 @cleanup.command()
 @click.pass_context
 @click.option(
-    "--input",
-    "infile",
+    "--vialink-input",
+    "vl_infile",
     required=True,
-    help="Path to the input spreadsheet (.xlsx file)",
+    help="Path to the VIA LINK input csv file",
+)
+@click.option(
+    "--232-input",
+    "two32_infile",
+    required=True,
+    help="Path to the 232 HELP input csv file",
 )
-@click.option("--sheetname", default=None, help="Name of the sheet to use")
 @click.option(
     "--output",
     default="data/all_covid_calls_cleaned.xlsx",
     help="Path to the output spreadsheet (cleaned .xlsx file)",
 )
-def all_covid_calls(ctx, infile, sheetname, output):
+def all_covid_calls(ctx, vl_infile, two32_infile, output):
     if ctx.obj["DEBUG"]:
         logging.getLogger().setLevel(logging.DEBUG)
         logging.debug("Running in debug mode")
-    logging.debug(f"Reading input file '{infile}'")
-    df = pd.read_excel(infile, sheet_name=sheetname)
+    logging.debug(f"Reading VIALINK file from '{vl_infile}'")
+    logging.debug(f"Reading 232-HELP file from '{two32_infile}'")
+    dfs = {}
+    dfvl = pd.read_csv(vl_infile, encoding="ISO-8859-1")
+    dfvl = remove_first_rows(dfvl)
+    dfs["VIALINK"] = dfvl
+    df232 = pd.read_csv(two32_infile, encoding="ISO-8859-1")
+    df232 = remove_first_rows(df232)
+    dfs["TWO32"] = df232
     logging.info("Cleaning data for All COVID Calls Dashboard")
-    df = cleanup_all_covid_calls(df)
+    df = cleanup_all_covid_calls(dfs)
     logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
     write_output_file(df, output)
 
@@ -70,14 +82,19 @@ def keep_calm_with_covid(ctx, infile, output):
         logging.debug("Running in debug mode")
     logging.debug(f"Reading input file '{infile}'")
     df = pd.read_csv(infile, encoding="ISO-8859-1", converters=CONVERTERS)
-    columns = df.iloc[1].values.tolist()
-    df = df.iloc[2:]
-    df.columns = columns
+    df = remove_first_rows(df)
     logging.info("Cleaning data for Keep Calm with COVID Dashboard")
     cleanup_keep_calm_with_covid(df)
     logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
     write_output_file(df, output)
 
 
+def remove_first_rows(df):
+    columns = df.iloc[1].values.tolist()
+    df = df.iloc[2:]
+    df.columns = columns
+    return df
+
+
 if __name__ == "__main__":
     cleanup(obj={})
diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py
index b5e3331..706b6b4 100644
--- a/cleanup_all_covid_calls.py
+++ b/cleanup_all_covid_calls.py
@@ -33,9 +33,7 @@ def cleanup(dfs):
         "Contact Source - Program ",  # ending space is needed
         "Needs - Basic Needs Requested",
     ]
-    vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][
-        VIA_LINK_REQUIRED_COLUMNS_DISASTER
-    ]
+    vialink1_df = dfs["VIALINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER]
 
     # step 2
     # select required columns from 232-Help’s Disaster Form
@@ -53,7 +51,7 @@ def cleanup(dfs):
         "Client Information - Identifies as",
         "Needs - Basic Needs Requested",
     ]
-    two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS]
+    two32_help_df = dfs["TWO32"][TWO32_HELP_REQUIRED_COLUMNS]
 
     # step 3
     # Create age ranges from date of birth

From 9752f0d01d3c02de6ec410cd1ebc812e76e24af2 Mon Sep 17 00:00:00 2001
From: Marc Cenac <marc.j.cenac@gmail.com>
Date: Thu, 7 May 2020 09:31:08 -0500
Subject: [PATCH 11/11] Update usage in readme

---
 README.md | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 0544215..4c1639e 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,16 @@
-This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. 
+This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana.
 
 ## Initial setup
 
-### install Python 
+### install Python
 
-You must have Python 3 installed.  You can download it [here](https://www.python.org/downloads/).
+You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/).
 
-> If you are using Windows, be sure to select the "Add Python to PATH" option 
+> If you are using Windows, be sure to select the "Add Python to PATH" option
 
-You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.  
+You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.
 
-### create and activate a Python virtual environment 
+### create and activate a Python virtual environment
 
 This step is optional, but if you have more than one project using Python, it is recommended.
 
@@ -18,6 +18,7 @@ A [virtual environment](https://docs.python.org/3/library/venv.html#creating-vir
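-of each project, which is helpful when working with mulitple projects with different depenencies (or different versions of the same dependency).
+of each project, which is helpful when working with multiple projects with different dependencies (or different versions of the same dependency).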
 
 For macOS or Linux
+
 ```
 python3 -m venv .venv
 source .venv/bin/activate
@@ -30,27 +31,31 @@ py -m venv env
 .\env\Scripts\activate
 ```
 
-> Note that you need to [activate the virutal environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) 
-> before running a script but you only need to create the virtual envrionment once. 
+> Note that you need to [activate the virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment)
+> before running a script but you only need to create the virtual environment once.
 
 ### install the dependencies
 
 In Python, dependencies are often installed using [pip](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#installing-pip)
 
 You can install all the dependencies for this project by running:
+
 ```
 pip install -r requirements.txt
 ```
 
-
 ## Running the scripts
 
-The basic format is `python cleanup.py <command> --input <path/to/input.xlsx>` 
+The basic format looks like `python cleanup.py <command> --input ~/path/to/input.csv` (the input options differ per command; see the examples below)
 
 For example:
+
 ```
-python cleanup.py all-covid-calls --input "Data from 4.2.20 Fake Data.xlsx"
-python cleanup.py --debug keep-calm-with-covid --input "Data from 4.2.20 Fake Data.xlsx"  --sheetname "Uncleaned data type 2 VIA LINK"
+# the keep-calm-with-covid script only requires one input file
+python cleanup.py --debug keep-calm-with-covid --input "/tmp/VL 4.29 Call Report.csv"
+
+# the all-covid-calls script requires 2 files
+python cleanup.py --debug all-covid-calls --vialink-input ~/Downloads/VL\ 4.29\ Disaster\ Call\ Report\ .csv --232-input ~/Downloads/232-HELP.csv
 ```
 
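-If you want to see the basic usage you can run `python cleanup.py` and for a specifc command you can use the `--help` flag
+If you want to see the basic usage you can run `python cleanup.py` and for a specific command you can use the `--help` flag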
@@ -58,3 +63,5 @@ If you want to see the basic usage you can run `python cleanup.py` and for a spe
 ```
 python cleanup.py all-covid-calls --help
 ```
+
+> Also, you can use the `--debug` flag to view debug logs