# Perform End-to-End Analysis of DineSafe Infractions

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import configparser
from urllib.parse import quote

from sqlalchemy import create_engine

In [3]:
%aimport src.prefect_utils
from src.prefect_utils import analyze_infractions

## User Inputs

In [15]:
# WayBackMachine snapshot timestamps, one per line; each timestamp doubles as
# the name of a data file to download
zip_filenames = """
    20130723222156
    20150603085055
    20151012004454
    20160129205023
    20160317045436
    20160915001010
    20170303162206
    20170330001043
    20170726115444
    20190116215713
    20190126084933
    20190614092848
    20210626163552
""".split()

# Column order of the database table; the raw DataFrame columns are re-ordered
# to match it. Names are whitespace-separated and split into a list.
cols_order_wanted = (
    "row_id establishment_id inspection_id "
    "establishment_name establishmenttype establishment_address "
    "latitude longitude "
    "establishment_status minimum_inspections_peryear "
    "infraction_details inspection_date "
    "severity action court_outcome amount_fined "
    "filename"
).split()

# Establishment types to keep. Entries tagged "grocery" are the ones treated as
# equivalent to a grocery store.
# Types currently left out: Food Caterer; Boarding / Lodging Home - Kitchen;
# Food Depot; Private Club; Food Bank; Bed & Breakfast.
# NOTE(review): the last entry carries a literal backslash before the
# apostrophe - presumably pre-escaped for a SQL string; confirm against how
# analyze_infractions consumes this list.
establishment_types_wanted = [
    "Restaurant",
    "Food Take Out",
    "Food Store (Convenience / Variety)",  # grocery
    "Food Court Vendor",
    "Supermarket",  # grocery
    "Bakery",  # grocery
    "Butcher Shop",  # grocery
    "Cafeteria - Public Access",
    "Cocktail Bar / Beverage Room",
    "Fish Shop",  # grocery
    "Bake Shop",  # grocery
    "Flea Market",  # grocery
    "Farmer\\'s Market",  # grocery
]

In [4]:
# Load the database connection settings from the sql.ini file one directory up
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means the settings file was not
# read. Fail here with a clear message instead of a later KeyError: 'default'.
if not config.read("../sql.ini"):
    raise FileNotFoundError("Could not read database settings from ../sql.ini")
default_cfg = config["default"]

In [5]:
# Unpack every connection setting from the [default] config section in one go
(
    DB_TYPE,
    DB_DRIVER,
    DB_USER,
    DB_PASS,
    DB_HOST,
    DB_PORT,
    DB_NAME,
) = (
    default_cfg[setting]
    for setting in (
        "DB_TYPE",
        "DB_DRIVER",
        "DB_USER",
        "DB_PASS",
        "DB_HOST",
        "DB_PORT",
        "DB_NAME",
    )
)

In [6]:
# Percent-encode the credentials so that reserved URL characters inside them
# (e.g. "@", ":" or "/") cannot be mistaken for URI delimiters
_credentials = f"{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"

# Server-level URI with no database selected (required to create the database)
URI_NO_DB = f"{DB_TYPE}+{DB_DRIVER}://{_credentials}@{DB_HOST}:{DB_PORT}"

# URI for the DB_NAME database itself (required to perform CRUD operations and
# submit queries)
URI = f"{URI_NO_DB}/{DB_NAME}"

## About

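Run through the end-to-end DineSafe infractions workflow: set the user inputs, (optionally) set up the database and table, then run the analysis flow.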

## Database Administration

The inspections data will be stored locally in a MySQL database. We'll first create the `dinesafe` database (its name is read from `DB_NAME` in `sql.ini`).

In [7]:
# Disabled setup step: open a connection using URI_NO_DB, which has no database
# name in it. Uncomment to (re)create the database.
# engine = create_engine(URI_NO_DB)
# conn = engine.connect()

In [8]:
# Disabled setup step: rebuild the DB_NAME database from scratch.
# WARNING: if re-enabled, DROP DATABASE deletes the existing database and all
# of its tables before it is recreated.
# NOTE(review): SQLAlchemy 2.x no longer accepts a plain string in
# conn.execute() (it needs sqlalchemy.text()) - confirm the installed version
# before re-enabling.
# _ = conn.execute(f"DROP DATABASE IF EXISTS {DB_NAME};")
# _ = conn.execute(f"CREATE DATABASE IF NOT EXISTS {DB_NAME};")

In [9]:
# Disabled setup step: close the connection opened for creating the database
# and dispose of its engine
# conn.close()
# engine.dispose()

## Create Database Table

In [10]:
# Disabled setup step: open a connection using URI, which includes the
# database name. Uncomment to (re)create the table.
# engine = create_engine(URI)
# conn = engine.connect()

Create the inspections table (its name is set by `table_name` in the next cell) in the `dinesafe` database

In [11]:
# Database table name: interpolated into the (commented-out) DROP/CREATE TABLE
# statements and passed to analyze_infractions
table_name: str = "inspections2"

In [12]:
# Disabled setup step. WARNING: if re-enabled, this drops the table named by
# table_name (and all of its rows) when it exists.
# _ = conn.execute(f"DROP TABLE IF EXISTS {table_name}")

In [13]:
# Disabled setup step: create the table named by table_name if it does not
# exist yet. The columns are declared in the same order as cols_order_wanted.
# create_table_query = f"""
#                      CREATE TABLE IF NOT EXISTS {table_name} (
#                          row_id INT,
#                          establishment_id INT,
#                          inspection_id INT,
#                          establishment_name TEXT,
#                          establishmenttype TEXT,
#                          establishment_address TEXT,
#                          latitude FLOAT,
#                          longitude FLOAT,
#                          establishment_status TEXT,
#                          minimum_inspections_peryear INT,
#                          infraction_details TEXT,
#                          inspection_date DATE,
#                          severity TEXT,
#                          action TEXT,
#                          court_outcome TEXT,
#                          amount_fined FLOAT,
#                          filename VARCHAR(20)
#                      )
#                      """
# _ = conn.execute(create_table_query)

In [14]:
# Disabled setup step: close the connection opened for creating the table and
# dispose of its engine
# conn.close()
# engine.dispose()

## End-to-End Workflow

In [16]:
%%time
state = analyze_infractions(
    zip_filenames,
    URI,
    cols_order_wanted,
    establishment_types_wanted,
    table_name,
    "addressinfo"
)
df = state.result().result()
display(
    df["is_infraction"]
    .value_counts(normalize=True)
    .rename("fraction")
    .to_frame()
    .merge(
        df["is_infraction"].value_counts().rename("num_inspections").to_frame(),
        left_index=True,
        right_index=True,
        how="inner",
    )
)

00:22:12.225 | Beginning flow run 'lovely-oriole' for flow 'Run through end-to-end analysis workflow'...
00:22:12.226 | Starting task runner `SequentialTaskRunner`...
00:22:12.329 | Submitting task run 'Retrieve DineSafe infractions data from WayBackMachine-69cac2d9-0' to task runner...
00:22:12.356 | Found 20130723222156 locally at data/raw/20130723222156/dinesafe.xml. Did nothing.
00:22:12.357 | Found 20150603085055 locally at data/raw/20150603085055/dinesafe.xml. Did nothing.
00:22:12.357 | Found 20151012004454 locally at data/raw/20151012004454/dinesafe.xml. Did nothing.
00:22:12.358 | Found 20160129205023 locally at data/raw/20160129205023/dinesafe.xml. Did nothing.
00:22:12.359 | Found 20160317045436 locally at data/raw/20160317045436/dinesafe.xml. Did nothing.
00:22:12.360 | Found 20160915001010 locally at data/raw/20160915001010/dinesafe.xml. Did nothing.
00:22:12.362 | Found 20170303162206 locally at data/raw/20170303162206/dinesafe.xml. Did nothing.
00:22:12.363 | Found 20170

Unnamed: 0,fraction,num_inspections
0,0.796951,163975
1,0.203049,41778


CPU times: user 35 s, sys: 3.7 s, total: 38.7 s
Wall time: 1min 5s
