Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode
.vscode/
data/


# Byte-compiled / optimized / DLL files
Expand Down
37 changes: 26 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana.
This project will help cleanup data for [VIA LINK](https://vialink.org/), the non-profit that runs the 211 system and call centers for the New Orleans-based region of Louisiana.

## Initial setup

### install Python
### install Python

You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/).
You must have Python 3 installed. You can download it [here](https://www.python.org/downloads/).

> If you are using Windows, be sure to select the "Add Python to PATH" option
> If you are using Windows, be sure to select the "Add Python to PATH" option

You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.
You can confirm it is installed correctly by running `python3 --version` in a terminal or command prompt.

### create and activate a Python virtual environment
### create and activate a Python virtual environment

This step is optional, but if you have more than one project using Python, it is recommended.

A [virtual environment](https://docs.python.org/3/library/venv.html#creating-virtual-environments) isolates the dependencies
of each project, which is helpful when working with multiple projects with different dependencies (or different versions of the same dependency).

For macOS or Linux

```
python3 -m venv .venv
source .venv/bin/activate
Expand All @@ -30,23 +31,37 @@ py -m venv env
.\env\Scripts\activate
```

> Note that you need to [activate the virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment)
> before running a script but you only need to create the virtual environment once.
> Note that you need to [activate the virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment)
> before running a script but you only need to create the virtual environment once.

### install the dependencies

In Python, dependencies are often installed using [pip](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#installing-pip)

You can install all the dependencies for this project by running:

```
pip install -r requirements.txt
```


## Running the scripts

TBD - but probably something like this:
The basic format looks like `python cleanup.py script_name --inputfile1 ~/path/to/input.csv`

For example:

```
# the keep-calm-with-covid script only requires one input file
python cleanup.py --debug keep-calm-with-covid --input "/tmp/VL 4.29 Call Report.csv"

# the all-covid-calls script requires 2 files
python cleanup.py --debug all-covid-calls --vialink-input ~/Downloads/VL\ 4.29\ Disaster\ Call\ Report\ .csv --232-input ~/Downloads/232-HELP.csv
```
python cleanup /path/to/file1.xlsx /path/to/file2.xlsx

If you want to see the basic usage you can run `python cleanup.py`, and for a specific command you can use the `--help` flag:

```
python cleanup.py all-covid-calls --help
```

> Also, you can use the `--debug` flag to view debug logs
100 changes: 100 additions & 0 deletions cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import logging

# Configure the root logger: INFO level by default, timestamped messages,
# written through a StreamHandler. The sub-commands raise the level to DEBUG
# when the --debug flag is given.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
import os
import sys

import click
import pandas as pd

from cleanup_all_covid_calls import cleanup as cleanup_all_covid_calls
from cleanup_keep_calm_with_covid import (
CONVERTERS,
cleanup as cleanup_keep_calm_with_covid,
)
from utils import write_output_file


@click.group()
@click.option("--debug/--no-debug", default=False)
@click.pass_context
def cleanup(ctx, debug):
    # Root command group. Stores the --debug/--no-debug flag on the click
    # context object under the "DEBUG" key so sub-commands can read it.
    # (Kept as comments rather than a docstring: click would print a docstring
    # as the group's --help text.)
    ctx.ensure_object(dict)  # make sure ctx.obj is a dict before writing to it
    ctx.obj.update(DEBUG=debug)


@cleanup.command()
@click.pass_context
@click.option(
    "--vialink-input",
    "vl_infile",
    required=True,
    help="Path to the VIA LINK input csv file",
)
@click.option(
    "--232-input",
    "two32_infile",
    required=True,
    help="Path to the 232 HELP input csv file",
)
@click.option(
    "--output",
    default="data/all_covid_calls_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def all_covid_calls(ctx, vl_infile, two32_infile, output):
    # Sub-command: read the VIA LINK and 232-HELP csv exports, clean them for
    # the All COVID Calls dashboard and write the result to `output`.
    debug_enabled = ctx.obj["DEBUG"]
    if debug_enabled:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)
        logging.debug("Running in debug mode")

    # Each export is read and then passed through remove_first_rows(), which
    # takes the column names from the second data row.
    logging.debug(f"Reading VIALINK file from '{vl_infile}'")
    logging.debug(f"Reading 232-HELP file from '{two32_infile}'")
    vialink_frame = remove_first_rows(pd.read_csv(vl_infile, encoding="ISO-8859-1"))
    two32_frame = remove_first_rows(
        pd.read_csv(two32_infile, encoding="ISO-8859-1")
    )
    frames = {"VIALINK": vialink_frame, "TWO32": two32_frame}

    logging.info("Cleaning data for All COVID Calls Dashboard")
    cleaned = cleanup_all_covid_calls(frames)

    logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
    write_output_file(cleaned, output)


@cleanup.command()
@click.pass_context
@click.option(
    "--input", "infile", required=True, help="Path to the input csv file",
)
@click.option(
    "--output",
    default="data/keep_calm_with_covid_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def keep_calm_with_covid(ctx, infile, output):
    # Sub-command: read the VIA LINK call report csv, clean it for the
    # Keep Calm with COVID dashboard and write the result to `output`.
    #
    # ctx    -- click context; ctx.obj["DEBUG"] is set by the `cleanup` group
    # infile -- path of the csv export to read
    # output -- path of the spreadsheet to write
    if ctx.obj["DEBUG"]:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.debug("Running in debug mode")
    logging.debug(f"Reading input file '{infile}'")
    df = pd.read_csv(infile, encoding="ISO-8859-1", converters=CONVERTERS)
    df = remove_first_rows(df)
    logging.info("Cleaning data for Keep Calm with COVID Dashboard")
    # The cleanup function rebinds its local frame and returns the result; it
    # does not modify the frame passed in. The return value must therefore be
    # captured, otherwise the raw, uncleaned data is what gets written.
    df = cleanup_keep_calm_with_covid(df)
    logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
    write_output_file(df, output)


def remove_first_rows(df):
    """Use the second data row as the header and drop the rows above the data.

    The values of the row at position 1 become the column names, and the rows
    at positions 0 and 1 are dropped. The remaining rows keep their original
    index labels.

    Args:
        df: DataFrame as read from the csv export; must have at least two rows.

    Returns:
        A new, independent DataFrame (the input frame is left untouched).

    Raises:
        IndexError: If ``df`` has fewer than two rows.
    """
    columns = df.iloc[1].tolist()
    # Copy the slice so the result is not tied to the caller's frame: renaming
    # columns and later column assignments then neither touch the input nor
    # trigger pandas' SettingWithCopyWarning.
    trimmed = df.iloc[2:].copy()
    trimmed.columns = columns
    return trimmed


# Script entry point: invoke the click command group, passing an empty dict as
# the context object (the group stores the "DEBUG" flag in it).
if __name__ == "__main__":
    cleanup(obj={})
118 changes: 118 additions & 0 deletions cleanup_all_covid_calls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
from datetime import datetime
from utils import (
explode_needs,
get_lat,
get_lng,
replacements,
write_output_file,
)

# Setting this option to None turns off pandas' chained-assignment
# (SettingWithCopy) warning. It is a process-wide setting that takes effect as
# soon as this module is imported.
pd.options.mode.chained_assignment = None


def cleanup(dfs):
    """Build the cleaned data set for the All COVID Calls dashboard.

    Args:
        dfs: Mapping with two DataFrames, keyed ``"VIALINK"`` (VIA LINK
            disaster form export) and ``"TWO32"`` (232-HELP disaster form
            export). Each must contain the required columns listed below.

    Returns:
        A new DataFrame combining both sources. 232-HELP rows get an age group
        derived from the date of birth, every row gets a "Data From" column and
        "Latitude"/"Longitude" looked up from "PostalCode", and the two needs
        columns are merged into "Concerns/Needs - Concerns/Needs" before being
        passed through ``explode_needs``. Hang-up / wrong-number rows are
        removed.

    Raises:
        KeyError: If a frame or one of the required columns is missing.
    """
    # step 1
    # select required columns from VIA LINK's Disaster Form
    # pretty sure the disaster form is "Uncleaned data type 1 VIA LINK"
    VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        "Client Information - Age Group",
        "Client Information - Call Type",
        "Client Information - Identifies as",
        "Concerns/Needs - Concerns/Needs",
        "Contact Source - Program ",  # ending space is needed
        "Needs - Basic Needs Requested",
    ]
    # .copy() gives an independent frame, so the column assignments below
    # neither touch the caller's data nor depend on view/copy semantics.
    vialink1_df = dfs["VIALINK"][VIA_LINK_REQUIRED_COLUMNS_DISASTER].copy()

    # step 2
    # select required columns from 232-Help's Disaster Form
    TWO32_HELP_REQUIRED_COLUMNS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        "Client Information - Date of Birth",
        "Client Information - Call Type",
        "Call Outcome - What concerns/needs were identified?",
        "Client Information - Identifies as",
        "Needs - Basic Needs Requested",
    ]
    two32_help_df = dfs["TWO32"][TWO32_HELP_REQUIRED_COLUMNS].copy()

    # step 3
    # Create age ranges from date of birth
    # use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
    now = datetime.now()
    bins = [0, 5, 12, 17, 24, 40, 59, 150]
    # labels must match the bin edges above (one label per interval)
    labels = ["0-5", "6-12", "13-17", "18-24", "25-40", "41-59", "60+"]
    dob_column = "Client Information - Date of Birth"
    # unparseable dates become NaT and end up without an age group
    dob = pd.to_datetime(two32_help_df[dob_column], errors="coerce")
    # Completed years of age. Computed from whole days because casting to
    # "timedelta64[Y]" is not supported by current pandas releases; 365.2425
    # is the mean Gregorian year length numpy uses for its "Y" unit.
    years_old = np.floor((now - dob).dt.days / 365.2425)
    age_range = pd.cut(years_old, bins=bins, labels=labels, include_lowest=True)
    two32_help_df["Client Information - Age Group"] = age_range
    # remove original Date of Birth column
    two32_help_df = two32_help_df.drop(columns=[dob_column])

    # step 4
    # add "Data From" column
    vialink1_df["Data From"] = "VIA LINK"
    two32_help_df["Data From"] = "232-HELP"

    # step 5
    # add data to master spreadsheet
    # first merge "Call Outcome - What concerns/needs were identified" from 232-HELP
    # into "Concerns/Needs - Concerns/Needs"
    cn = "Concerns/Needs - Concerns/Needs"
    two32_help_df = two32_help_df.rename(
        columns={"Call Outcome - What concerns/needs were identified?": cn}
    )

    # new steps
    # cleanup invalid values: a 2001-02-01 datetime in the program column is
    # treated as missing. Assign the result back instead of using inplace=True
    # on the selected column, which is not guaranteed to update the frame.
    program_column = "Contact Source - Program "
    vialink1_df[program_column] = vialink1_df[program_column].replace(
        to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan
    )

    # then combine data
    master_df = pd.concat([vialink1_df, two32_help_df], join="outer", ignore_index=True)

    # step 6
    # add lat/lon columns
    master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
    master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)

    # step 7
    # first put the values from "Needs - Basic Needs Requested" into "Concerns/Needs - Concerns/Needs"
    basic_needs = "Needs - Basic Needs Requested"
    master_df["all_needs"] = master_df[[cn, basic_needs]].apply(
        lambda x: "; ".join(x[x.notnull()]), axis=1
    )
    master_df = master_df.drop(columns=[cn, basic_needs])
    master_df = master_df.rename(columns={"all_needs": cn})
    master_df = explode_needs(master_df, cn)

    # step 8
    # cleanup Concerns/Needs
    master_df[cn] = master_df[cn].str.strip()
    hangup_values = ["Hangup / Wrong Number", "Hangup / Wrong #"]
    master_df = master_df[~master_df[cn].isin(hangup_values)]
    # `value` is deliberately not passed: with a dict `to_replace`, an explicit
    # value=None makes newer pandas treat the dict keys as column names.
    # NOTE(review): assumes `replacements` (from utils) is an {old: new} mapping.
    master_df = master_df.replace(to_replace=replacements)

    return master_df
94 changes: 94 additions & 0 deletions cleanup_keep_calm_with_covid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import pandas as pd
import numpy as np
from datetime import datetime
from utils import explode_needs, get_lat, get_lng, replacements

# read_csv converters: every "Concerns/Needs - ..." column is passed through
# `str`. Trailing spaces in some names are part of the column name.
CONVERTERS = dict.fromkeys(
    (
        "Concerns/Needs - Disaster Services ",
        "Concerns/Needs - Domestic Abuse/IPV",
        "Concerns/Needs - Early Childhood Education ",
        "Concerns/Needs - Education/ Employment ",
        "Concerns/Needs - Environmental Quality & Prtcn ",
        "Concerns/Needs - Health Care ",
        "Concerns/Needs - Interpersonal",
        "Concerns/Needs - Mental Health",
        "Concerns/Needs - Mental Health Concerns",
        "Concerns/Needs - Organizational Development",
        "Concerns/Needs - Other ",
        "Concerns/Needs - Other Community Services",
        "Concerns/Needs - Protective Service/Abuse",
        "Concerns/Needs - Public Asst & Social Insurance",
        "Concerns/Needs - Relationship Concerns / Issues ",
        "Concerns/Needs - Self-Harm",
        "Concerns/Needs - Sexuality",
    ),
    str,
)


def cleanup(df):
    """Build the cleaned data set for the Keeping Calm with COVID dashboard.

    Args:
        df: DataFrame of the VIA LINK call report; must contain the required
            columns listed below. It is not modified.

    Returns:
        A new DataFrame restricted to "LA Spirit Crisis Line" calls. The
        individual needs columns are combined into a single
        "Concerns/Needs - Concerns/Needs" column, which is passed through
        ``explode_needs``. A "Data From" column and "Latitude"/"Longitude"
        columns (looked up from "PostalCode") are added, and wrong-number /
        hang-up rows are removed. Callers must use the return value.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # step 1
    # select only the required columns
    needs_columns = [
        "Concerns/Needs - Disaster Services ",
        "Concerns/Needs - Domestic Abuse/IPV",
        "Concerns/Needs - Early Childhood Education ",
        "Concerns/Needs - Education/ Employment ",
        "Concerns/Needs - Environmental Quality & Prtcn ",
        "Concerns/Needs - Health Care ",
        "Concerns/Needs - Interpersonal",
        "Concerns/Needs - Mental Health",
        "Concerns/Needs - Mental Health Concerns",
        "Concerns/Needs - Organizational Development",
        "Concerns/Needs - Other ",
        "Concerns/Needs - Other Community Services",
        "Concerns/Needs - Protective Service/Abuse",
        "Concerns/Needs - Public Asst & Social Insurance",
        "Concerns/Needs - Relationship Concerns / Issues ",
        "Concerns/Needs - Self-Harm",
        "Concerns/Needs - Sexuality",
    ]
    VIA_LINK_REQUIRED_COLUMNS_CALLS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        "Call Information - Program",
        "Demographics - Age",
        "Demographics - Gender",
    ] + needs_columns
    df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS]

    # step 2
    # remove calls not from LA Spirit line
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the caller's data (no SettingWithCopyWarning).
    df = df[df["Call Information - Program"] == "LA Spirit Crisis Line"].copy()

    # step 3
    # combine all needs column into 1 column
    all_needs = "Concerns/Needs - Concerns/Needs"
    df[all_needs] = df[needs_columns].apply(lambda x: "; ".join(x[x.notnull()]), axis=1)
    df = explode_needs(df, all_needs)

    # step 4
    # add "Data From" column
    df["Data From"] = "VIA LINK"

    # step 5
    # cleanup Concerns/Needs Data
    df[all_needs] = df[all_needs].str.strip()
    df = df[~df[all_needs].isin(["Wrong #", "hangup"])]
    # `value` is deliberately not passed: with a dict `to_replace`, an explicit
    # value=None makes newer pandas treat the dict keys as column names.
    # NOTE(review): assumes `replacements` (from utils) is an {old: new} mapping.
    df = df.replace(to_replace=replacements)

    # step 6
    # drop all the original needs columns
    df = df.drop(columns=needs_columns)

    # step 7
    # add the Lat/Lng columns
    df["Latitude"] = df["PostalCode"].apply(get_lat)
    df["Longitude"] = df["PostalCode"].apply(get_lng)

    return df
Loading