In [1]:
import sys

# Add the sibling ``modules`` directory to the import search path so that
# project-local modules stored there can be imported by the cells below
# (presumably this is where the `arrest` module lives -- confirm).
sys.path.append('../modules/')

In [2]:
from pathlib import Path

import arrest
import pandas as pd

## Generate fake DataFrame

In [3]:
# Fake arrest records used to exercise the address-categorization workflow.
# Each pair is (arrest id, street address). The addresses deliberately include
# blanks, PO-box variants, irregular spacing and several spellings of
# "TRANSIENT"; the id CXMS_4392 appears twice.
fake_data = [
    {'Arrest ID': arrest_id, 'Street Address': street_address}
    for arrest_id, street_address in (
        ('EABE_2641', 'UNKNOWN'),
        ('QITO_3780', 'PO B 1234'),
        ('8RZH_6339', 'TRANSIENT GD HWY'),
        ('VZ2L_8354', '714 4th Rd.'),
        ('UD2Y_5453', 'TANSIENT'),
        ('QSIG_1745', '864 Park Rd.'),
        ('CXMS_4392', ''),
        ('CXMS_4392', '3647 Pine Rd.'),
        ('MWO9_2179', 'TRANSEINT'),
        ('MGOZ_6829', ''),
        ('UMDR_7294', '8507 First Ct.'),
        ('XZGK_1786', '9127 Oak Pl'),
        ('M0OF_7566', 'TRANSIENT'),
        ('WXMU_5528', 'GENERAL DELIVERY'),
        ('BBJ5_6702', 'HOMELESS'),
        ('F4YT_7829', '12319   Main Rd.'),
        ('XJJE_8818', 'P O BOX 1234'),
        ('3RB1_2221', 'UNKNOWN 102'),
        ('KE7N_9749', 'TRANSINET'),
        ('OGJ5_5598', 'REFUSED'),
    )
]

In [4]:
# Load the fake records into a DataFrame (one row per record dict).
df = pd.DataFrame(fake_data)

In [5]:
# Preview the first five rows of the fake data.
df.head()

Unnamed: 0,Arrest ID,Street Address
0,EABE_2641,UNKNOWN
1,QITO_3780,PO B 1234
2,8RZH_6339,TRANSIENT GD HWY
3,VZ2L_8354,714 4th Rd.
4,UD2Y_5453,TANSIENT


## Normalize text

In [6]:
# Show the docstring of the normalization helper that is applied in the next cell.
help(arrest.normalize_text)

Help on function normalize_text in module arrest:

normalize_text(value, punctuation=False)
    Optionally removes punctuation, removes redundant whitespace, normalizes case
    
    Parameters
    ----------
    value : str
        The string to operate on
    punctuation : bool, default False
        If True, sub any punctuation character with whitespace
    
    Returns
    -------
    str
    
    Examples
    --------
    >>> clean_string('1400   alder dr. ', punctuation=True)
    '1400 ALDER DR'
    
    >>> clean_string('P.O. Box 123', punctuation=True)
    'PO BOX 1234'
    
    >>> clean_string('P.O. Box 123')
    'P.O. BOX 1234'



In [7]:
# Store a normalized copy of every raw address in a helper column; the
# ``punctuation=True`` keyword is forwarded by ``apply`` to the normalizer.
df['_normalized_street_address'] = df['Street Address'].apply(
    arrest.normalize_text, punctuation=True
)

### Example regex search

In [8]:
def categorize_from_regex(category, data, field='Street Address'):
    """Interactively categorize addresses that match a search expression.

    Prompts for a regular expression (a plain string also works), searches
    ``data[field]`` for it, prints the matching addresses that are not yet
    filed under ``category`` and asks for confirmation. If the user answers
    ``y``, one tab-separated line per unique address (address, category,
    expression) is appended to ``outputs/categorized_addresses.csv``.

    Parameters
    ----------
    category : str
        The category to assign to any addresses matching on this expression,
        typically "unhoused," "unknown," or "po_box."

    data : pd.DataFrame
        The DataFrame to search. It must also contain an ``'Arrest ID'``
        column, which is used to count distinct arrests per matched address
        in the review printout.

    field : single label, default 'Street Address'
        The label of the column to search within.

    Returns
    -------
    None
    """
    output_path = Path('outputs/categorized_addresses.csv')
    try:
        cat_df = pd.read_csv(
            output_path,
            sep='\t',
            header=None,
            names=['_street_address', '_category', '_expression_matched'],
            dtype=str,
        )
    except (IOError, pd.errors.EmptyDataError):
        # EmptyDataError is caught defensively: depending on the pandas
        # version, a freshly touched (empty) file may raise it instead of
        # producing an empty frame.
        print('No file for categorized addresses found.\nStarting fresh.')
        # Create the output directory too; touch() alone fails when it is missing.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.touch()
        category_addresses = []
        terms_used = []
    else:
        category_subset = cat_df[cat_df['_category'] == category]
        category_addresses = list(category_subset['_street_address'])
        # Drop missing values so the join below only ever sees strings.
        terms_used = list(category_subset['_expression_matched'].dropna().unique())
        print(
            f'Categorized addresses found. Expressions already used: {",".join(terms_used)}'
        )

    # NOTE(review): upper-casing the input also upper-cases regex escapes
    # (e.g. "\d" becomes "\D"). Kept as-is because previously stored
    # expressions are compared against the upper-cased form.
    term = input('Enter search expression:\n').upper()
    if term in terms_used:
        print('Expression already used.')
        return None

    # Search the caller-supplied frame/column rather than a notebook global.
    already_categorized = data[field].isin(category_addresses)
    # na=False keeps the mask strictly boolean when addresses are missing.
    matches_term = data[field].str.contains(term, regex=True, na=False)
    results = data[matches_term & ~already_categorized]
    if results.empty:
        print(f'"{term}" returned no results.')
        return None

    print(
        results.groupby(field, dropna=False)['Arrest ID']
        .nunique()
        .sort_values()
    )
    next_step = input(
        f'Append all results under category "{category}"? y/n\n')
    if next_step != 'y':
        print(f'Starting over. Last term searched: "{term}"\n')
        return None

    # unique() keeps first-seen order, so the appended lines are deterministic.
    reviewed_results = list(results[field].unique())
    # Write UTF-8 explicitly so the file round-trips through read_csv above.
    with open(output_path, 'a', encoding='utf-8') as file:
        file.writelines(
            f'{address}\t{category}\t{term}\n' for address in reviewed_results
        )
    print(
        f'{len(reviewed_results)} addresses added to categorized_addresses.csv.'
    )
    return None

In [9]:
# Pass the column to search explicitly: the function takes the column label
# as its third argument.
categorize_from_regex('unknown', df, 'Street Address')