From 3312dff5354af94bcaa14c1c4094ccc7559cfd6d Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Tue, 14 Apr 2020 20:33:45 -0500 Subject: [PATCH 01/11] Add initial scripts to cleanup data --- .gitignore | 3 +- README.md | 5 +- cleanup_all_covid_calls.py | 155 +++++++++++++++++++++++++++++ cleanup_keep_calm_with_covid.py | 167 ++++++++++++++++++++++++++++++++ 4 files changed, 327 insertions(+), 3 deletions(-) create mode 100644 cleanup_all_covid_calls.py create mode 100644 cleanup_keep_calm_with_covid.py diff --git a/.gitignore b/.gitignore index 367d520..2c334a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -.vscode +.vscode/ +data/ # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index 9405e96..1793847 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,9 @@ pip install -r requirements.txt ## Running the scripts -TBD - but probably something like this: +TBD - but probably something like this but eventually specifying input and output files ``` -python cleanup /path/to/file1.xlsx /path/to/file2.xlsx +python cleanup_keep_calm_with_covid.py +python cleanup_all_covid_calls.py ``` diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py new file mode 100644 index 0000000..9542044 --- /dev/null +++ b/cleanup_all_covid_calls.py @@ -0,0 +1,155 @@ +import pandas as pd +from uszipcode import SearchEngine +import numpy as np +from datetime import datetime + +file = "Data from 4.2.20 Fake Data.xlsx" + +# read all sheets, returns a dict of dataframes +dfs = pd.read_excel(file, sheet_name=None) + +### Cleanup for All COVID Calls dashboard + +# step 1 +# select required columns from VIA LINK’s Disaster Form +# pretty sure the distaster form is "Uncleaned data type 1 VIA LINK" +VIA_LINK_REQUIRED_COLUMNS_DISASTER = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + "PostalCode", + "Client Information - Age Group", + "Client Information - Call Type", + "Client Information 
- Identifies as", + "Concerns/Needs - Concerns/Needs", + "Contact Source - Program ", # ending space is needed + "Needs - Basic Needs Requested", +] +vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER] + +# step 2 +# select required columns from 232-Help’s Disaster Form +TWO32_HELP_REQUIRED_COLUMNS = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + "PostalCode", + "Client Information - Date of Birth", + "Client Information - Call Type", + "Call Outcome - What concerns/needs were identified?", + "Client Information - Identifies as", + "Needs - Basic Needs Requested", +] +two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS] + +# step 3 +# Create age ranges from date of birth +# use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+. +now = datetime.now() +bins = [0, 5, 12, 17, 24, 40, 59, 150] +labels = ["0-5", "6-12", "13-17", "18-24", "24-40", "41-49", "60+"] +dob = pd.to_datetime( + two32_help_df["Client Information - Date of Birth"], errors="coerce" +) +years_old = (now - dob).astype("timedelta64[Y]") +age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True) +two32_help_df["Client Information - Age Group"] = age_range +# remove original Date of Birth column +two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True) + +# step 4 +# add "Data From" column +vialink1_df["Data From"] = "VIA LINK" +two32_help_df["Data From"] = "232-HELP" + + +# step 5 +# add data to master spreadsheet +# first merge "Call Outcome - What concerns/needs were identified" from 232-HELP +# into "Concerns/Needs - Concerns/Needs" +two32_help_df.rename( + columns={ + "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs" + }, + inplace=True, +) + + +# new steps +# cleanup invalid values +vialink1_df["Contact Source - Program "].replace( + to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, 
inplace=True +) + + +# then combine data +master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True) + + +# step 6 +# add lat/lon columns +search = SearchEngine(simple_zipcode=True) + + +# todo: pull this into a utils module +def get_lat(zipcode): + if pd.isnull(zipcode): + return None + else: + lat = search.by_zipcode(int(zipcode)).lat + return lat if lat else None + + +def get_lng(zipcode): + if pd.isnull(zipcode): + return None + else: + lng = search.by_zipcode(int(zipcode)).lng + return lng if lng else None + + +master_df["Latitude"] = master_df["PostalCode"].apply(get_lat) +master_df["Longitude"] = master_df["PostalCode"].apply(get_lng) + + +# step 7 +# first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs" +cn = "Concerns/Needs - Concerns/Needs" +master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply( + lambda x: "; ".join(x[x.notnull()]), axis=1 +) +# then explode needs into their own rows +def explode_needs(df, need_column): + df["tmp_needs"] = df[need_column].str.split(";") + df = df.explode("tmp_needs") + df.drop(columns=[need_column], inplace=True) + df.rename(columns={"tmp_needs": need_column}, inplace=True) + return df + + +master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True) +master_df.rename(columns={"all_needs": cn}, inplace=True) +master_df = explode_needs(master_df, cn) + + +# step 8 +# cleanup Concerns/Needs +master_df = master_df[master_df[cn] != "Hangup / Wrong #"] +master_df[cn] = master_df[cn].str.strip() +master_df = master_df.replace( + { + "Health Complications / Concerns": "Health Complications", + "Other (please specify caller need in call notes)": "Other", + } +) + +# write out spreadsheet +master_df.to_excel( + "data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned" +) diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py new file mode 100644 index 0000000..0d85893 --- /dev/null +++ 
b/cleanup_keep_calm_with_covid.py @@ -0,0 +1,167 @@ +import pandas as pd +from uszipcode import SearchEngine +import numpy as np +from datetime import datetime + +file = "Data from 4.2.20 Fake Data.xlsx" + +# read all sheets, returns a dict of dataframes +dfs = pd.read_excel(file, sheet_name=None) + + +### Cleanup for Keeping Calm with COVID dashboard +converters = { + "Concerns/Needs - Disaster Services ": str, + "Concerns/Needs - Domestic Abuse/IPV": str, + "Concerns/Needs - Early Childhood Education ": str, + "Concerns/Needs - Education/ Employment ": str, + "Concerns/Needs - Environmental Quality & Prtcn ": str, + "Concerns/Needs - Health Care ": str, + "Concerns/Needs - Interpersonal": str, + "Concerns/Needs - Mental Health": str, + "Concerns/Needs - Mental Health Concerns": str, + "Concerns/Needs - Organizational Development": str, + "Concerns/Needs - Other ": str, + "Concerns/Needs - Other Community Services": str, + "Concerns/Needs - Protective Service/Abuse": str, + "Concerns/Needs - Public Asst & Social Insurance": str, + "Concerns/Needs - Relationship Concerns / Issues ": str, + "Concerns/Needs - Self-Harm": str, + "Concerns/Needs - Sexuality": str, +} +df = pd.read_excel( + file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters +) + + +# step 1 +# pretty sure the call reports form is "Uncleaned data type 2 VIA LINK" + +# todo: why not use all conerns/needs? why only DD-DT? 
+# needs_columns = [c for c in dfs["Uncleaned data type 2 VIA LINK"] if c.startswith("Concerns/Needs")] +needs_columns = [ + # "Concerns/Needs - N/A - must list WHY", + # "Concerns/Needs - Addictive Disorders", + # "Concerns/Needs - Arts & Culture ", + # "Concerns/Needs - Basic Needs ", + # "Concerns/Needs - Campus Information", + # "Concerns/Needs - Consumer Services ", + # "Concerns/Needs - Criminal Justice & Legal Srvcs ", + "Concerns/Needs - Disaster Services ", + "Concerns/Needs - Domestic Abuse/IPV", + "Concerns/Needs - Early Childhood Education ", + "Concerns/Needs - Education/ Employment ", + "Concerns/Needs - Environmental Quality & Prtcn ", + "Concerns/Needs - Health Care ", + "Concerns/Needs - Interpersonal", + "Concerns/Needs - Mental Health", + "Concerns/Needs - Mental Health Concerns", + "Concerns/Needs - Organizational Development", + "Concerns/Needs - Other ", + "Concerns/Needs - Other Community Services", + "Concerns/Needs - Protective Service/Abuse", + "Concerns/Needs - Public Asst & Social Insurance", + "Concerns/Needs - Relationship Concerns / Issues ", + "Concerns/Needs - Self-Harm", + "Concerns/Needs - Sexuality", + # "Concerns/Needs - Suicide Related", + # "Concerns/Needs - Validity Question", + # "Concerns/Needs - Victim Assistance / Survivor Support ", + # "Concerns/Needs - Violence", + # "Concerns/Needs - xxx1", + # "Concerns/Needs - xxx2", +] + +VIA_LINK_REQUIRED_COLUMNS_CALLS = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + "PostalCode", + "Call Information - Program", + "Demographics - Age", + "Demographics - Gender", +] + needs_columns +vialink2_df = dfs["Uncleaned data type 2 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_CALLS] + +# step 2 +# remove calls not from LA Spirit line +vialink2_df = vialink2_df[ + vialink2_df["Call Information - Program"] == "LA Spirit Crisis Line" +] + + +# step 3 +# combine all needs column into 1 column +all_needs = "Concerns/Needs - 
Concerns/Needs" +vialink2_df[all_needs] = vialink2_df[needs_columns].apply( + lambda x: "; ".join(x[x.notnull()]), axis=1 +) + + +# todo: pull this into a utils module +def explode_needs(df, need_column): + df["tmp_needs"] = df[need_column].str.split(";") + df = df.explode("tmp_needs") + df.drop(columns=[need_column], inplace=True) + df.rename(columns={"tmp_needs": need_column}, inplace=True) + return df + + +vialink2_df = explode_needs(vialink2_df, all_needs) + +# step 4 +# add "Data From" column +vialink2_df["Data From"] = "VIA LINK" + +# step 5 +# cleanup Concerns/Needs Data + +vialink2_df = vialink2_df[vialink2_df[all_needs] != "Wrong #"] +vialink2_df = vialink2_df[vialink2_df[all_needs] != "hangup"] +vialink2_df = vialink2_df.replace( + { + "Concerns/Needs - Interpersonal": "Interpersonal Conflict", + "Food": "Food/Meals", + "Interpersonal Conflict": "Income Support/Assistance", + } +) + + +# step 6 +# drop all the original needs columns +vialink2_df.drop(columns=needs_columns, inplace=True) + + +# step 7 +# add the Lat/Lng columns + +# todo: pull this into a utils module +search = SearchEngine(simple_zipcode=True) + + +def get_lat(zipcode): + if pd.isnull(zipcode): + return None + else: + lat = search.by_zipcode(int(zipcode)).lat + return lat if lat else None + + +def get_lng(zipcode): + if pd.isnull(zipcode): + return None + else: + lng = search.by_zipcode(int(zipcode)).lng + return lng if lng else None + + +vialink2_df["Latitude"] = vialink2_df["PostalCode"].apply(get_lat) +vialink2_df["Longitude"] = vialink2_df["PostalCode"].apply(get_lng) + + +vialink2_df.to_excel( + "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned" +) From d2a6a2bc3c0599193491429e52b9cc75c1162767 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Thu, 16 Apr 2020 08:48:00 -0500 Subject: [PATCH 02/11] Refactoring, adding more text replacements --- cleanup_all_covid_calls.py | 273 ++++++++++++++------------------ cleanup_keep_calm_with_covid.py | 255 
+++++++++++------------------ utils.py | 45 ++++++ 3 files changed, 262 insertions(+), 311 deletions(-) create mode 100644 utils.py diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py index 9542044..851749a 100644 --- a/cleanup_all_covid_calls.py +++ b/cleanup_all_covid_calls.py @@ -1,155 +1,124 @@ import pandas as pd -from uszipcode import SearchEngine import numpy as np from datetime import datetime - -file = "Data from 4.2.20 Fake Data.xlsx" - -# read all sheets, returns a dict of dataframes -dfs = pd.read_excel(file, sheet_name=None) - -### Cleanup for All COVID Calls dashboard - -# step 1 -# select required columns from VIA LINK’s Disaster Form -# pretty sure the distaster form is "Uncleaned data type 1 VIA LINK" -VIA_LINK_REQUIRED_COLUMNS_DISASTER = [ - "CallReportNum", - "ReportVersion", - "CallDateAndTimeStart", - "CityName", - "CountyName", - "StateProvince", - "PostalCode", - "Client Information - Age Group", - "Client Information - Call Type", - "Client Information - Identifies as", - "Concerns/Needs - Concerns/Needs", - "Contact Source - Program ", # ending space is needed - "Needs - Basic Needs Requested", -] -vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER] - -# step 2 -# select required columns from 232-Help’s Disaster Form -TWO32_HELP_REQUIRED_COLUMNS = [ - "CallReportNum", - "ReportVersion", - "CallDateAndTimeStart", - "CityName", - "CountyName", - "StateProvince", - "PostalCode", - "Client Information - Date of Birth", - "Client Information - Call Type", - "Call Outcome - What concerns/needs were identified?", - "Client Information - Identifies as", - "Needs - Basic Needs Requested", -] -two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS] - -# step 3 -# Create age ranges from date of birth -# use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+. 
-now = datetime.now() -bins = [0, 5, 12, 17, 24, 40, 59, 150] -labels = ["0-5", "6-12", "13-17", "18-24", "24-40", "41-49", "60+"] -dob = pd.to_datetime( - two32_help_df["Client Information - Date of Birth"], errors="coerce" -) -years_old = (now - dob).astype("timedelta64[Y]") -age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True) -two32_help_df["Client Information - Age Group"] = age_range -# remove original Date of Birth column -two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True) - -# step 4 -# add "Data From" column -vialink1_df["Data From"] = "VIA LINK" -two32_help_df["Data From"] = "232-HELP" - - -# step 5 -# add data to master spreadsheet -# first merge "Call Outcome - What concerns/needs were identified" from 232-HELP -# into "Concerns/Needs - Concerns/Needs" -two32_help_df.rename( - columns={ - "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs" - }, - inplace=True, -) - - -# new steps -# cleanup invalid values -vialink1_df["Contact Source - Program "].replace( - to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, inplace=True -) - - -# then combine data -master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True) - - -# step 6 -# add lat/lon columns -search = SearchEngine(simple_zipcode=True) - - -# todo: pull this into a utils module -def get_lat(zipcode): - if pd.isnull(zipcode): - return None - else: - lat = search.by_zipcode(int(zipcode)).lat - return lat if lat else None - - -def get_lng(zipcode): - if pd.isnull(zipcode): - return None - else: - lng = search.by_zipcode(int(zipcode)).lng - return lng if lng else None - - -master_df["Latitude"] = master_df["PostalCode"].apply(get_lat) -master_df["Longitude"] = master_df["PostalCode"].apply(get_lng) - - -# step 7 -# first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs" -cn = "Concerns/Needs - Concerns/Needs" -master_df["all_needs"] = master_df[[cn, 
"Needs - Basic Needs Requested"]].apply( - lambda x: "; ".join(x[x.notnull()]), axis=1 -) -# then explode needs into their own rows -def explode_needs(df, need_column): - df["tmp_needs"] = df[need_column].str.split(";") - df = df.explode("tmp_needs") - df.drop(columns=[need_column], inplace=True) - df.rename(columns={"tmp_needs": need_column}, inplace=True) - return df - - -master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True) -master_df.rename(columns={"all_needs": cn}, inplace=True) -master_df = explode_needs(master_df, cn) - - -# step 8 -# cleanup Concerns/Needs -master_df = master_df[master_df[cn] != "Hangup / Wrong #"] -master_df[cn] = master_df[cn].str.strip() -master_df = master_df.replace( - { - "Health Complications / Concerns": "Health Complications", - "Other (please specify caller need in call notes)": "Other", - } -) - -# write out spreadsheet -master_df.to_excel( - "data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned" -) +from utils import explode_needs, get_lat, get_lng, replacements + + +def cleanup(dfs): + ### Cleanup for All COVID Calls dashboard + + # step 1 + # select required columns from VIA LINK’s Disaster Form + # pretty sure the distaster form is "Uncleaned data type 1 VIA LINK" + VIA_LINK_REQUIRED_COLUMNS_DISASTER = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + "PostalCode", + "Client Information - Age Group", + "Client Information - Call Type", + "Client Information - Identifies as", + "Concerns/Needs - Concerns/Needs", + "Contact Source - Program ", # ending space is needed + "Needs - Basic Needs Requested", + ] + vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][ + VIA_LINK_REQUIRED_COLUMNS_DISASTER + ] + + # step 2 + # select required columns from 232-Help’s Disaster Form + TWO32_HELP_REQUIRED_COLUMNS = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + 
"PostalCode", + "Client Information - Date of Birth", + "Client Information - Call Type", + "Call Outcome - What concerns/needs were identified?", + "Client Information - Identifies as", + "Needs - Basic Needs Requested", + ] + two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS] + + # step 3 + # Create age ranges from date of birth + # use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+. + now = datetime.now() + bins = [0, 5, 12, 17, 24, 40, 59, 150] + labels = ["0-5", "6-12", "13-17", "18-24", "24-40", "41-49", "60+"] + dob = pd.to_datetime( + two32_help_df["Client Information - Date of Birth"], errors="coerce" + ) + years_old = (now - dob).astype("timedelta64[Y]") + age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True) + two32_help_df["Client Information - Age Group"] = age_range + # remove original Date of Birth column + two32_help_df.drop(columns=["Client Information - Date of Birth"], inplace=True) + + # step 4 + # add "Data From" column + vialink1_df["Data From"] = "VIA LINK" + two32_help_df["Data From"] = "232-HELP" + + # step 5 + # add data to master spreadsheet + # first merge "Call Outcome - What concerns/needs were identified" from 232-HELP + # into "Concerns/Needs - Concerns/Needs" + two32_help_df.rename( + columns={ + "Call Outcome - What concerns/needs were identified?": "Concerns/Needs - Concerns/Needs" + }, + inplace=True, + ) + + # new steps + # cleanup invalid values + vialink1_df["Contact Source - Program "].replace( + to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan, inplace=True + ) + + # then combine data + master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True) + + # step 6 + # add lat/lon columns + master_df["Latitude"] = master_df["PostalCode"].apply(get_lat) + master_df["Longitude"] = master_df["PostalCode"].apply(get_lng) + + # step 7 + # first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs" + cn = 
"Concerns/Needs - Concerns/Needs" + master_df["all_needs"] = master_df[[cn, "Needs - Basic Needs Requested"]].apply( + lambda x: "; ".join(x[x.notnull()]), axis=1 + ) + master_df.drop(columns=[cn, "Needs - Basic Needs Requested"], inplace=True) + master_df.rename(columns={"all_needs": cn}, inplace=True) + master_df = explode_needs(master_df, cn) + + # step 8 + # cleanup Concerns/Needs + master_df[cn] = master_df[cn].str.strip() + master_df = master_df[master_df[cn] != "Hangup / Wrong Number"] + master_df = master_df[master_df[cn] != "Hangup / Wrong #"] + master_df = master_df.replace(to_replace=replacements, value=None, inplace=True) + + return master_df + + +if __name__ == "__main__": + file = "Data from 4.2.20 Fake Data.xlsx" + + # read all sheets, returns a dict of dataframes + dfs = pd.read_excel(file, sheet_name=None) + + df = cleanup(dfs) + + # write out spreadsheet + df.to_excel("data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned") diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py index 0d85893..b5ed4e3 100644 --- a/cleanup_keep_calm_with_covid.py +++ b/cleanup_keep_calm_with_covid.py @@ -1,167 +1,104 @@ import pandas as pd -from uszipcode import SearchEngine import numpy as np from datetime import datetime +from utils import explode_needs, get_lat, get_lng, replacements + + +def cleanup(df): + ### Cleanup for Keeping Calm with COVID dashboard + # step 1 + # select only the required columns + needs_columns = [ + "Concerns/Needs - Disaster Services ", + "Concerns/Needs - Domestic Abuse/IPV", + "Concerns/Needs - Early Childhood Education ", + "Concerns/Needs - Education/ Employment ", + "Concerns/Needs - Environmental Quality & Prtcn ", + "Concerns/Needs - Health Care ", + "Concerns/Needs - Interpersonal", + "Concerns/Needs - Mental Health", + "Concerns/Needs - Mental Health Concerns", + "Concerns/Needs - Organizational Development", + "Concerns/Needs - Other ", + "Concerns/Needs - Other Community Services", + 
"Concerns/Needs - Protective Service/Abuse", + "Concerns/Needs - Public Asst & Social Insurance", + "Concerns/Needs - Relationship Concerns / Issues ", + "Concerns/Needs - Self-Harm", + "Concerns/Needs - Sexuality", + ] + VIA_LINK_REQUIRED_COLUMNS_CALLS = [ + "CallReportNum", + "ReportVersion", + "CallDateAndTimeStart", + "CityName", + "CountyName", + "StateProvince", + "PostalCode", + "Call Information - Program", + "Demographics - Age", + "Demographics - Gender", + ] + needs_columns + df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS] + + # step 2 + # remove calls not from LA Spirit line + df = df[df["Call Information - Program"] == "LA Spirit Crisis Line"] + + # step 3 + # combine all needs column into 1 column + all_needs = "Concerns/Needs - Concerns/Needs" + df[all_needs] = df[needs_columns].apply(lambda x: "; ".join(x[x.notnull()]), axis=1) + df = explode_needs(df, all_needs) + + # step 4 + # add "Data From" column + df["Data From"] = "VIA LINK" + + # step 5 + # cleanup Concerns/Needs Data + df[all_needs] = df[all_needs].str.strip() + df = df[df[all_needs] != "Wrong #"] + df = df[df[all_needs] != "hangup"] + df.replace(to_replace=replacements, value=None, inplace=True) + + # step 6 + # drop all the original needs columns + df.drop(columns=needs_columns, inplace=True) + + # step 7 + # add the Lat/Lng columns + df["Latitude"] = df["PostalCode"].apply(get_lat) + df["Longitude"] = df["PostalCode"].apply(get_lng) -file = "Data from 4.2.20 Fake Data.xlsx" - -# read all sheets, returns a dict of dataframes -dfs = pd.read_excel(file, sheet_name=None) - - -### Cleanup for Keeping Calm with COVID dashboard -converters = { - "Concerns/Needs - Disaster Services ": str, - "Concerns/Needs - Domestic Abuse/IPV": str, - "Concerns/Needs - Early Childhood Education ": str, - "Concerns/Needs - Education/ Employment ": str, - "Concerns/Needs - Environmental Quality & Prtcn ": str, - "Concerns/Needs - Health Care ": str, - "Concerns/Needs - Interpersonal": str, - "Concerns/Needs - Mental 
Health": str, - "Concerns/Needs - Mental Health Concerns": str, - "Concerns/Needs - Organizational Development": str, - "Concerns/Needs - Other ": str, - "Concerns/Needs - Other Community Services": str, - "Concerns/Needs - Protective Service/Abuse": str, - "Concerns/Needs - Public Asst & Social Insurance": str, - "Concerns/Needs - Relationship Concerns / Issues ": str, - "Concerns/Needs - Self-Harm": str, - "Concerns/Needs - Sexuality": str, -} -df = pd.read_excel( - file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters -) - - -# step 1 -# pretty sure the call reports form is "Uncleaned data type 2 VIA LINK" - -# todo: why not use all conerns/needs? why only DD-DT? -# needs_columns = [c for c in dfs["Uncleaned data type 2 VIA LINK"] if c.startswith("Concerns/Needs")] -needs_columns = [ - # "Concerns/Needs - N/A - must list WHY", - # "Concerns/Needs - Addictive Disorders", - # "Concerns/Needs - Arts & Culture ", - # "Concerns/Needs - Basic Needs ", - # "Concerns/Needs - Campus Information", - # "Concerns/Needs - Consumer Services ", - # "Concerns/Needs - Criminal Justice & Legal Srvcs ", - "Concerns/Needs - Disaster Services ", - "Concerns/Needs - Domestic Abuse/IPV", - "Concerns/Needs - Early Childhood Education ", - "Concerns/Needs - Education/ Employment ", - "Concerns/Needs - Environmental Quality & Prtcn ", - "Concerns/Needs - Health Care ", - "Concerns/Needs - Interpersonal", - "Concerns/Needs - Mental Health", - "Concerns/Needs - Mental Health Concerns", - "Concerns/Needs - Organizational Development", - "Concerns/Needs - Other ", - "Concerns/Needs - Other Community Services", - "Concerns/Needs - Protective Service/Abuse", - "Concerns/Needs - Public Asst & Social Insurance", - "Concerns/Needs - Relationship Concerns / Issues ", - "Concerns/Needs - Self-Harm", - "Concerns/Needs - Sexuality", - # "Concerns/Needs - Suicide Related", - # "Concerns/Needs - Validity Question", - # "Concerns/Needs - Victim Assistance / Survivor Support ", - # 
"Concerns/Needs - Violence", - # "Concerns/Needs - xxx1", - # "Concerns/Needs - xxx2", -] - -VIA_LINK_REQUIRED_COLUMNS_CALLS = [ - "CallReportNum", - "ReportVersion", - "CallDateAndTimeStart", - "CityName", - "CountyName", - "StateProvince", - "PostalCode", - "Call Information - Program", - "Demographics - Age", - "Demographics - Gender", -] + needs_columns -vialink2_df = dfs["Uncleaned data type 2 VIA LINK"][VIA_LINK_REQUIRED_COLUMNS_CALLS] - -# step 2 -# remove calls not from LA Spirit line -vialink2_df = vialink2_df[ - vialink2_df["Call Information - Program"] == "LA Spirit Crisis Line" -] - - -# step 3 -# combine all needs column into 1 column -all_needs = "Concerns/Needs - Concerns/Needs" -vialink2_df[all_needs] = vialink2_df[needs_columns].apply( - lambda x: "; ".join(x[x.notnull()]), axis=1 -) - - -# todo: pull this into a utils module -def explode_needs(df, need_column): - df["tmp_needs"] = df[need_column].str.split(";") - df = df.explode("tmp_needs") - df.drop(columns=[need_column], inplace=True) - df.rename(columns={"tmp_needs": need_column}, inplace=True) return df -vialink2_df = explode_needs(vialink2_df, all_needs) - -# step 4 -# add "Data From" column -vialink2_df["Data From"] = "VIA LINK" - -# step 5 -# cleanup Concerns/Needs Data - -vialink2_df = vialink2_df[vialink2_df[all_needs] != "Wrong #"] -vialink2_df = vialink2_df[vialink2_df[all_needs] != "hangup"] -vialink2_df = vialink2_df.replace( - { - "Concerns/Needs - Interpersonal": "Interpersonal Conflict", - "Food": "Food/Meals", - "Interpersonal Conflict": "Income Support/Assistance", +if __name__ == "__main__": + file = "Data from 4.2.20 Fake Data.xlsx" + converters = { + "Concerns/Needs - Disaster Services ": str, + "Concerns/Needs - Domestic Abuse/IPV": str, + "Concerns/Needs - Early Childhood Education ": str, + "Concerns/Needs - Education/ Employment ": str, + "Concerns/Needs - Environmental Quality & Prtcn ": str, + "Concerns/Needs - Health Care ": str, + "Concerns/Needs - Interpersonal": 
str, + "Concerns/Needs - Mental Health": str, + "Concerns/Needs - Mental Health Concerns": str, + "Concerns/Needs - Organizational Development": str, + "Concerns/Needs - Other ": str, + "Concerns/Needs - Other Community Services": str, + "Concerns/Needs - Protective Service/Abuse": str, + "Concerns/Needs - Public Asst & Social Insurance": str, + "Concerns/Needs - Relationship Concerns / Issues ": str, + "Concerns/Needs - Self-Harm": str, + "Concerns/Needs - Sexuality": str, } -) - - -# step 6 -# drop all the original needs columns -vialink2_df.drop(columns=needs_columns, inplace=True) - - -# step 7 -# add the Lat/Lng columns - -# todo: pull this into a utils module -search = SearchEngine(simple_zipcode=True) - - -def get_lat(zipcode): - if pd.isnull(zipcode): - return None - else: - lat = search.by_zipcode(int(zipcode)).lat - return lat if lat else None - - -def get_lng(zipcode): - if pd.isnull(zipcode): - return None - else: - lng = search.by_zipcode(int(zipcode)).lng - return lng if lng else None - - -vialink2_df["Latitude"] = vialink2_df["PostalCode"].apply(get_lat) -vialink2_df["Longitude"] = vialink2_df["PostalCode"].apply(get_lng) - - -vialink2_df.to_excel( - "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned" -) + df = pd.read_excel( + file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters + ) + df = cleanup(df) + df.to_excel( + "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned" + ) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..af9e1bb --- /dev/null +++ b/utils.py @@ -0,0 +1,45 @@ +import pandas as pd +from uszipcode import SearchEngine + +search = SearchEngine(simple_zipcode=True) + + +def get_lat(zipcode): + if pd.isnull(zipcode): + return None + else: + lat = search.by_zipcode(int(zipcode)).lat + return lat if lat else None + + +def get_lng(zipcode): + if pd.isnull(zipcode): + return None + else: + lng = search.by_zipcode(int(zipcode)).lng + return lng if lng else None 
+ + +def explode_needs(df, need_column): + df["tmp_needs"] = df[need_column].str.split(";") + df = df.explode("tmp_needs") + df.drop(columns=[need_column], inplace=True) + df.rename(columns={"tmp_needs": need_column}, inplace=True) + return df + + +replacements = { + "†": "For some reason this cross mark is showing up in some of the entries, so just removing it", + "Employment": "Employment Services", + "Food": "Food/Meals", + "I'm Sick (what next?)": "I'm Sick (What's Next?)", + "I'm Sick (Whats Next?)": "I'm Sick (What's Next?)", + "information only call": "", + "Inquires about Health Complications / Concerns": "Inquires about Health Complications", + "International Travel Concerns": "International / General Travel Concerns", + "Legal Consumer": "Legal Assistance", + "Other - Interpersonal": "Other", + "Other (PLEASE Specify Caller Need in Call Notes)": "Other", + "other 2-1-1 referral": "Other", + "Unemployment": "Unemployment Benefits", +} From 6cd086436e701a56183e114c5b937b558e80dab4 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Sat, 18 Apr 2020 12:03:00 -0500 Subject: [PATCH 03/11] Fix bug and suppress SettingWithCopyWarning --- cleanup_all_covid_calls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py index 851749a..596e21d 100644 --- a/cleanup_all_covid_calls.py +++ b/cleanup_all_covid_calls.py @@ -2,6 +2,8 @@ import numpy as np from datetime import datetime from utils import explode_needs, get_lat, get_lng, replacements +pd.options.mode.chained_assignment = None + def cleanup(dfs): @@ -107,7 +109,7 @@ def cleanup(dfs): master_df[cn] = master_df[cn].str.strip() master_df = master_df[master_df[cn] != "Hangup / Wrong Number"] master_df = master_df[master_df[cn] != "Hangup / Wrong #"] - master_df = master_df.replace(to_replace=replacements, value=None, inplace=True) + master_df.replace(to_replace=replacements, value=None, inplace=True) return master_df From 
2b7db9e7966a7373ef59837cdcbc643395a2e35f Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Sat, 18 Apr 2020 13:39:54 -0500 Subject: [PATCH 04/11] Adding CLI using click --- README.md | 13 ++++-- cleanup.py | 73 +++++++++++++++++++++++++++++++++ cleanup_all_covid_calls.py | 14 ++++--- cleanup_keep_calm_with_covid.py | 41 +++++++++--------- requirements.txt | 1 + utils.py | 11 +++++ 6 files changed, 125 insertions(+), 28 deletions(-) create mode 100644 cleanup.py diff --git a/README.md b/README.md index 1793847..0544215 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,16 @@ pip install -r requirements.txt ## Running the scripts -TBD - but probably something like this but eventually specifying input and output files +The basic format is `python cleanup.py --input ` +For example: ``` -python cleanup_keep_calm_with_covid.py -python cleanup_all_covid_calls.py +python cleanup.py all-covid-calls --input "Data from 4.2.20 Fake Data.xlsx" +python cleanup.py --debug keep-calm-with-covid --input "Data from 4.2.20 Fake Data.xlsx" --sheetname "Uncleaned data type 2 VIA LINK" +``` + +If you want to see the basic usage you can run `python cleanup.py` and for a specifc command you can use the `--help` flag + +``` +python cleanup.py all-covid-calls --help ``` diff --git a/cleanup.py b/cleanup.py new file mode 100644 index 0000000..f44e36f --- /dev/null +++ b/cleanup.py @@ -0,0 +1,73 @@ +import logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler()], +) +import os +import sys + +import click +import pandas as pd + +from cleanup_all_covid_calls import cleanup as cleanup_all_covid_calls +from cleanup_keep_calm_with_covid import ( + CONVERTERS, + cleanup as cleanup_keep_calm_with_covid, +) +from utils import write_output_file + + +@click.group() +@click.option("--debug/--no-debug", default=False) +@click.pass_context +def cleanup(ctx, debug): + ctx.ensure_object(dict) + 
ctx.obj["DEBUG"] = debug + + +@cleanup.command() +@click.pass_context +@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)") +@click.option("--sheetname", default=None, help="Name of the sheet to use") +@click.option( + "--output", + default="data/all_covid_calls_cleaned.xlsx", + help="Path to the output spreadsheet (cleaned .xlsx file)", +) +def all_covid_calls(ctx, infile, sheetname, output): + if ctx.obj["DEBUG"]: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug("Running in debug mode") + logging.debug(f"Reading input file '{infile}'") + df = pd.read_excel(infile, sheet_name=sheetname) + logging.info("Cleaning data for All COVID Calls Dashboard") + df = cleanup_all_covid_calls(df) + logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'") + write_output_file(df, output) + + +@cleanup.command() +@click.pass_context +@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)") +@click.option("--sheetname", required=True, help="Name of the sheet to use") +@click.option( + "--output", + default="data/keep_calm_with_covid_cleaned.xlsx", + help="Path to the output spreadsheet (cleaned .xlsx file)", +) +def keep_calm_with_covid(ctx, infile, sheetname, output): + if ctx.obj["DEBUG"]: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug("Running in debug mode") + logging.debug(f"Reading input file '{infile}'") + df = pd.read_excel(infile, sheet_name=sheetname, converters=CONVERTERS) + logging.info("Cleaning data for Keep Calm with COVID Dashboard") + cleanup_keep_calm_with_covid(df) + logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'") + write_output_file(df, output) + + +if __name__ == "__main__": + cleanup(obj={}) diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py index 596e21d..30c36aa 100644 --- a/cleanup_all_covid_calls.py +++ b/cleanup_all_covid_calls.py @@ -1,9 +1,15 @@ import pandas as pd import 
numpy as np from datetime import datetime -from utils import explode_needs, get_lat, get_lng, replacements -pd.options.mode.chained_assignment = None +from utils import ( + explode_needs, + get_lat, + get_lng, + replacements, + write_output_file, +) +pd.options.mode.chained_assignment = None def cleanup(dfs): @@ -121,6 +127,4 @@ def cleanup(dfs): dfs = pd.read_excel(file, sheet_name=None) df = cleanup(dfs) - - # write out spreadsheet - df.to_excel("data/all_covid_calls_cleaned.xlsx", sheet_name="codefornola cleaned") + write_output_file(df, "data/all_covid_calls_cleaned.xlsx") diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py index b5ed4e3..baa6b12 100644 --- a/cleanup_keep_calm_with_covid.py +++ b/cleanup_keep_calm_with_covid.py @@ -3,6 +3,26 @@ from datetime import datetime from utils import explode_needs, get_lat, get_lng, replacements +CONVERTERS = { + "Concerns/Needs - Disaster Services ": str, + "Concerns/Needs - Domestic Abuse/IPV": str, + "Concerns/Needs - Early Childhood Education ": str, + "Concerns/Needs - Education/ Employment ": str, + "Concerns/Needs - Environmental Quality & Prtcn ": str, + "Concerns/Needs - Health Care ": str, + "Concerns/Needs - Interpersonal": str, + "Concerns/Needs - Mental Health": str, + "Concerns/Needs - Mental Health Concerns": str, + "Concerns/Needs - Organizational Development": str, + "Concerns/Needs - Other ": str, + "Concerns/Needs - Other Community Services": str, + "Concerns/Needs - Protective Service/Abuse": str, + "Concerns/Needs - Public Asst & Social Insurance": str, + "Concerns/Needs - Relationship Concerns / Issues ": str, + "Concerns/Needs - Self-Harm": str, + "Concerns/Needs - Sexuality": str, +} + def cleanup(df): ### Cleanup for Keeping Calm with COVID dashboard @@ -76,27 +96,8 @@ def cleanup(df): if __name__ == "__main__": file = "Data from 4.2.20 Fake Data.xlsx" - converters = { - "Concerns/Needs - Disaster Services ": str, - "Concerns/Needs - Domestic Abuse/IPV": str, - 
"Concerns/Needs - Early Childhood Education ": str, - "Concerns/Needs - Education/ Employment ": str, - "Concerns/Needs - Environmental Quality & Prtcn ": str, - "Concerns/Needs - Health Care ": str, - "Concerns/Needs - Interpersonal": str, - "Concerns/Needs - Mental Health": str, - "Concerns/Needs - Mental Health Concerns": str, - "Concerns/Needs - Organizational Development": str, - "Concerns/Needs - Other ": str, - "Concerns/Needs - Other Community Services": str, - "Concerns/Needs - Protective Service/Abuse": str, - "Concerns/Needs - Public Asst & Social Insurance": str, - "Concerns/Needs - Relationship Concerns / Issues ": str, - "Concerns/Needs - Self-Harm": str, - "Concerns/Needs - Sexuality": str, - } df = pd.read_excel( - file, sheet_name="Uncleaned data type 2 VIA LINK", converters=converters + file, sheet_name="Uncleaned data type 2 VIA LINK", converters=CONVERTERS ) df = cleanup(df) df.to_excel( diff --git a/requirements.txt b/requirements.txt index da5b8fd..3f07058 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +click==7.1.1 openpyxl==3.0.3 pandas==1.0.3 uszipcode==0.2.4 diff --git a/utils.py b/utils.py index af9e1bb..def5526 100644 --- a/utils.py +++ b/utils.py @@ -1,9 +1,19 @@ +import logging +import os import pandas as pd from uszipcode import SearchEngine search = SearchEngine(simple_zipcode=True) +def write_output_file(df, filename, sheet_name="codefornola cleaned"): + if filename.startswith("data") and not os.path.exists("data"): + logging.debug("Creating data directory") + os.makedirs("data") + logging.debug(f"Writing sheet '{sheet_name}' into '{filename}'") + df.to_excel(filename) + + def get_lat(zipcode): if pd.isnull(zipcode): return None @@ -21,6 +31,7 @@ def get_lng(zipcode): def explode_needs(df, need_column): + logging.debug(f"exploding needs into {need_column}") df["tmp_needs"] = df[need_column].str.split(";") df = df.explode("tmp_needs") df.drop(columns=[need_column], inplace=True) From 
4003395bf248a2eb4530c15bf86502afe9f846d7 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Sat, 25 Apr 2020 09:54:45 -0500 Subject: [PATCH 05/11] Adding additional need replacements --- utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils.py b/utils.py index def5526..65172ac 100644 --- a/utils.py +++ b/utils.py @@ -43,8 +43,10 @@ def explode_needs(df, need_column): "†": "For some reason this cross mark is showing up in some of the entries, so just removing it", "Employment": "Employment Services", "Food": "Food/Meals", + "Food/Meal": "Food/Meals", "I'm Sick (what next?)": "I'm Sick (What's Next?)", "I'm Sick (Whats Next?)": "I'm Sick (What's Next?)", + "Income support/assistance": "Income Support/Assistance", "information only call": "", "Inquires about Health Complications / Concerns": "Inquires about Health Complications", "International Travel Concerns": "International / General Travel Concerns", From a437ca4a175d4d188da68283ba35c7c56030fe55 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Sat, 25 Apr 2020 09:57:55 -0500 Subject: [PATCH 06/11] Reformatting --- cleanup.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cleanup.py b/cleanup.py index f44e36f..c05b90c 100644 --- a/cleanup.py +++ b/cleanup.py @@ -1,4 +1,5 @@ import logging + logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", @@ -29,7 +30,12 @@ def cleanup(ctx, debug): @cleanup.command() @click.pass_context -@click.option("--input", "infile", required=True, help="Path to the input spreadsheet (.xlsx file)") +@click.option( + "--input", + "infile", + required=True, + help="Path to the input spreadsheet (.xlsx file)", +) @click.option("--sheetname", default=None, help="Name of the sheet to use") @click.option( "--output", @@ -50,7 +56,12 @@ def all_covid_calls(ctx, infile, sheetname, output): @cleanup.command() @click.pass_context -@click.option("--input", "infile", required=True, help="Path to the input spreadsheet 
(.xlsx file)") +@click.option( + "--input", + "infile", + required=True, + help="Path to the input spreadsheet (.xlsx file)", +) @click.option("--sheetname", required=True, help="Name of the sheet to use") @click.option( "--output", From 39ff2f92154d1230803feec98f5b0f12f39b9f4b Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Sat, 25 Apr 2020 10:00:47 -0500 Subject: [PATCH 07/11] Remove cross symbol --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 65172ac..dec2a10 100644 --- a/utils.py +++ b/utils.py @@ -40,7 +40,7 @@ def explode_needs(df, need_column): replacements = { - "†": "For some reason this cross mark is showing up in some of the entries, so just removing it", + "†": "", "Employment": "Employment Services", "Food": "Food/Meals", "Food/Meal": "Food/Meals", From 9311cff4e65c686b0937d666ea6c0eabdc584b25 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Tue, 5 May 2020 20:34:06 -0500 Subject: [PATCH 08/11] Remove main method from cleanup modules --- cleanup_all_covid_calls.py | 10 ---------- cleanup_keep_calm_with_covid.py | 11 ----------- 2 files changed, 21 deletions(-) diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py index 30c36aa..b5e3331 100644 --- a/cleanup_all_covid_calls.py +++ b/cleanup_all_covid_calls.py @@ -118,13 +118,3 @@ def cleanup(dfs): master_df.replace(to_replace=replacements, value=None, inplace=True) return master_df - - -if __name__ == "__main__": - file = "Data from 4.2.20 Fake Data.xlsx" - - # read all sheets, returns a dict of dataframes - dfs = pd.read_excel(file, sheet_name=None) - - df = cleanup(dfs) - write_output_file(df, "data/all_covid_calls_cleaned.xlsx") diff --git a/cleanup_keep_calm_with_covid.py b/cleanup_keep_calm_with_covid.py index baa6b12..858cd44 100644 --- a/cleanup_keep_calm_with_covid.py +++ b/cleanup_keep_calm_with_covid.py @@ -92,14 +92,3 @@ def cleanup(df): df["Longitude"] = df["PostalCode"].apply(get_lng) return df - - -if __name__ == 
"__main__": - file = "Data from 4.2.20 Fake Data.xlsx" - df = pd.read_excel( - file, sheet_name="Uncleaned data type 2 VIA LINK", converters=CONVERTERS - ) - df = cleanup(df) - df.to_excel( - "data/keep_calm_with_covid_cleaned.xlsx", sheet_name="codefornola cleaned" - ) From 163056cbdc89d332931dcedfb0e012684908f17c Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Tue, 5 May 2020 20:35:21 -0500 Subject: [PATCH 09/11] Update to read iCarol formatted csv file --- cleanup.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cleanup.py b/cleanup.py index c05b90c..a6cf935 100644 --- a/cleanup.py +++ b/cleanup.py @@ -57,23 +57,22 @@ def all_covid_calls(ctx, infile, sheetname, output): @cleanup.command() @click.pass_context @click.option( - "--input", - "infile", - required=True, - help="Path to the input spreadsheet (.xlsx file)", + "--input", "infile", required=True, help="Path to the input csv file", ) -@click.option("--sheetname", required=True, help="Name of the sheet to use") @click.option( "--output", default="data/keep_calm_with_covid_cleaned.xlsx", help="Path to the output spreadsheet (cleaned .xlsx file)", ) -def keep_calm_with_covid(ctx, infile, sheetname, output): +def keep_calm_with_covid(ctx, infile, output): if ctx.obj["DEBUG"]: logging.getLogger().setLevel(logging.DEBUG) logging.debug("Running in debug mode") logging.debug(f"Reading input file '{infile}'") - df = pd.read_excel(infile, sheet_name=sheetname, converters=CONVERTERS) + df = pd.read_csv(infile, encoding="ISO-8859-1", converters=CONVERTERS) + columns = df.iloc[1].values.tolist() + df = df.iloc[2:] + df.columns = columns logging.info("Cleaning data for Keep Calm with COVID Dashboard") cleanup_keep_calm_with_covid(df) logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'") From 3d3aba1b15edd0a1d2fb63ed08eea444fbb393d1 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Thu, 7 May 2020 09:30:59 -0500 Subject: [PATCH 10/11] Update all-covid-calls to read 
2 csv files --- cleanup.py | 39 +++++++++++++++++++++++++++----------- cleanup_all_covid_calls.py | 6 ++---- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/cleanup.py b/cleanup.py index a6cf935..14f5969 100644 --- a/cleanup.py +++ b/cleanup.py @@ -31,25 +31,37 @@ def cleanup(ctx, debug): @cleanup.command() @click.pass_context @click.option( - "--input", - "infile", + "--vialink-input", + "vl_infile", required=True, - help="Path to the input spreadsheet (.xlsx file)", + help="Path to the VIA LINK input csv file", +) +@click.option( + "--232-input", + "two32_infile", + required=True, + help="Path to the 232 HELP input csv file", ) -@click.option("--sheetname", default=None, help="Name of the sheet to use") @click.option( "--output", default="data/all_covid_calls_cleaned.xlsx", help="Path to the output spreadsheet (cleaned .xlsx file)", ) -def all_covid_calls(ctx, infile, sheetname, output): +def all_covid_calls(ctx, vl_infile, two32_infile, output): if ctx.obj["DEBUG"]: logging.getLogger().setLevel(logging.DEBUG) logging.debug("Running in debug mode") - logging.debug(f"Reading input file '{infile}'") - df = pd.read_excel(infile, sheet_name=sheetname) + logging.debug(f"Reading VIALINK file from '{vl_infile}'") + logging.debug(f"Reading 232-HELP file from '{two32_infile}'") + dfs = {} + dfvl = pd.read_csv(vl_infile, encoding="ISO-8859-1") + dfvl = remove_first_rows(dfvl) + dfs["VIALINK"] = dfvl + df232 = pd.read_csv(two32_infile, encoding="ISO-8859-1") + df232 = remove_first_rows(df232) + dfs["TWO32"] = df232 logging.info("Cleaning data for All COVID Calls Dashboard") - df = cleanup_all_covid_calls(df) + df = cleanup_all_covid_calls(dfs) logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'") write_output_file(df, output) @@ -70,14 +82,19 @@ def keep_calm_with_covid(ctx, infile, output): logging.debug("Running in debug mode") logging.debug(f"Reading input file '{infile}'") df = pd.read_csv(infile, encoding="ISO-8859-1", 
converters=CONVERTERS) - columns = df.iloc[1].values.tolist() - df = df.iloc[2:] - df.columns = columns + df = remove_first_rows(df) logging.info("Cleaning data for Keep Calm with COVID Dashboard") cleanup_keep_calm_with_covid(df) logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'") write_output_file(df, output) +def remove_first_rows(df): + columns = df.iloc[1].values.tolist() + df = df.iloc[2:] + df.columns = columns + return df + + if __name__ == "__main__": cleanup(obj={}) diff --git a/cleanup_all_covid_calls.py b/cleanup_all_covid_calls.py index b5e3331..706b6b4 100644 --- a/cleanup_all_covid_calls.py +++ b/cleanup_all_covid_calls.py @@ -33,9 +33,7 @@ def cleanup(dfs): "Contact Source - Program ", # ending space is needed "Needs - Basic Needs Requested", ] - vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][ - VIA_LINK_REQUIRED_COLUMNS_DISASTER - ] + vialink1_df = dfs["VIALINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER] # step 2 # select required columns from 232-Help’s Disaster Form @@ -53,7 +51,7 @@ def cleanup(dfs): "Client Information - Identifies as", "Needs - Basic Needs Requested", ] - two32_help_df = dfs["Uncleaned Data from 232-Help"][TWO32_HELP_REQUIRED_COLUMNS] + two32_help_df = dfs["TWO32"][TWO32_HELP_REQUIRED_COLUMNS] # step 3 # Create age ranges from date of birth From 9752f0d01d3c02de6ec410cd1ebc812e76e24af2 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Thu, 7 May 2020 09:31:08 -0500 Subject: [PATCH 11/11] Update usage in readme --- README.md | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0544215..4c1639e 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. 
+This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana. ## Initial setup -### install Python +### install Python -You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/). +You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/). -> If you are using Windows, be sure to select the "Add Python to PATH" option +> If you are using Windows, be sure to select the "Add Python to PATH" option -You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt. +You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt. -### create and activate a Python virtual environment +### create and activate a Python virtual environment This step is optional, but if you have more than one project using Python, it is recommended. @@ -18,6 +18,7 @@ A [virtual environment](https://docs.python.org/3/library/venv.html#creating-vir of each project, which is helpful when working with mulitple projects with different depenencies (or different versions of the same dependency). For macOS or Linux + ``` python3 -m venv .venv source .venv/bin/activate @@ -30,27 +31,31 @@ py -m venv env .\env\Scripts\activate ``` -> Note that you need to [activate the virutal environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) -> before running a script but you only need to create the virtual envrionment once. +> Note that you need to [activate the virutal environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment) +> before running a script but you only need to create the virtual envrionment once. 
### install the dependencies In Python, dependencies are often installed using [pip](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#installing-pip) You can install all the dependencies for this project by running: + ``` pip install -r requirements.txt ``` - ## Running the scripts -The basic format is `python cleanup.py --input ` +The basic format looks like `python cleanup.py script_name --inputfile1 ~/path/to/input.csv` For example: + ``` -python cleanup.py all-covid-calls --input "Data from 4.2.20 Fake Data.xlsx" -python cleanup.py --debug keep-calm-with-covid --input "Data from 4.2.20 Fake Data.xlsx" --sheetname "Uncleaned data type 2 VIA LINK" +# the keep-calm-with-covid script only requires one input file +python cleanup.py --debug keep-calm-with-covid --input "/tmp/VL 4.29 Call Report.csv" + +# the all-covid-calls script requires 2 files +python cleanup.py --debug all-covid-calls --vialink-input ~/Downloads/VL\ 4.29\ Disaster\ Call\ Report\ .csv --232-input ~/Downloads/232-HELP.csv ``` If you want to see the basic usage you can run `python cleanup.py` and for a specifc command you can use the `--help` flag @@ -58,3 +63,5 @@ If you want to see the basic usage you can run `python cleanup.py` and for a spe ``` python cleanup.py all-covid-calls --help ``` + +> Also, you can use the `--debug` flag to view debug logs