## Clean Street component of an address

This script uses the Python library **usaddress** to parse the street component of an address. The script does some minor preprocessing of the address before passing it to `usaddress`.

- See: https://parserator.datamade.us/usaddress
   

In [1]:
import usaddress
import pandas as pd
import re
import math
import usaddress

In [2]:
# Show the pandas version in use (the recorded run reported 0.20.3).
pd.__version__

'0.20.3'

In [3]:
# Turn off pandas' chained-assignment check so that no SettingWithCopy
# warning is raised (None = do nothing; this does not disable the assignments).
pd.options.mode.chained_assignment = None 

## Big function to pre-clean an address and submit to `usaddress` parser.

In [4]:
counter = 1  # a global to keep track of number of addresses processed

def get_clean_street(address, debug=0):
    '''
    Pre-clean a street address string and tag it with usaddress.

    The input is first normalized (punctuation, dashes, slashes, leading
    hash signs and repeated whitespace), then passed to usaddress.tag().
    If usaddress classifies it as a 'Street Address', the street component
    is rebuilt from the tagged parts; otherwise the pre-cleaned string is
    returned.

    Parameters:
        address: the street address to clean. Non-string values (e.g. the
            NaN/None that pandas uses for missing cells) are returned
            unchanged with an explanatory note.
        debug: 0 for silent operation, 1 to print each cleaning step,
            2 to print the tagged address returned by usaddress. Any value
            greater than 0 also prints the final cleaning note.

    Returns:
        A tuple (clean_street, cleaning_note) where cleaning_note is a
        '|'-delimited string describing what was done.

    Side effects:
        Increments the module-level ``counter`` once per call.
    '''
    global counter

    # Missing values cannot be run through the regexes or the parser, so
    # hand them back untouched instead of raising a TypeError.
    if not isinstance(address, str):
        cleaning_note = "|non-string input so returning input unchanged"
        if debug > 0:
            print("Cleaning note: " + cleaning_note)
        counter = counter + 1
        return address, cleaning_note

    cleaning_note = ""

    if debug == 1:
        print(str(counter) + " input address: " + address)

    # Replace periods, brackets and apostrophes with spaces so that tokens
    # glued together by these characters get separated.
    address = re.sub(r"[.(){}<>']", ' ', address)
    if debug == 1:
        print("we now have: " + address)

    # Dashes
    # A leading number-dash-number has its dash removed,
    # EG '49-866 AVE EL RIO' where 866 is the street name.
    # Only that leading dash is removed; any other dash is handled below.
    if re.search(r'^\d+-\d+\s+', address):
        address = re.sub(r'^(\d+)-', r'\1', address, count=1)
        cleaning_note = "Removed dash in num-dash-num|"
        if debug == 1:
            print("we removed dash in num-dash-num and now have: " + address)

    # Any dash that is still present is replaced with a space,
    # EG '12-23rd st' or '#5-36 Main St'
    if '-' in address:
        address = address.replace('-', ' ')
        cleaning_note = cleaning_note + "Removed dash in str|"
        if debug == 1:
            print("we removed a dash in string and now have: " + address)

    # Forward slashes are replaced with spaces unless the address contains
    # a fraction such as ' 1/2', in which case slashes are left alone.
    if not re.search(r' \d+/\d+', address):
        if debug > 0:
            print("no fraction so check and fix slashes")
        address = address.replace('/', ' ')

    # Collapse runs of whitespace first, then strip leading hash signs;
    # doing it in this order also removes a hash that follows a leading space.
    address = re.sub(r'\s+', ' ', address).strip()
    address = address.lstrip('# ')

    if debug == 1:
        print("Pre-cleaned address: ", address)

    ############################################################
    # Tagging address with usaddress
    ############################################################

    try:
        tagged_address, address_type = usaddress.tag(address)
    except usaddress.RepeatedLabelError:
        # The parser could not tag the string unambiguously; fall back to
        # the pre-cleaned input.
        cleaning_note = cleaning_note + "|usaddress.RepeatedLabelError so returning pre-cleaned address"
        paddr = address
    else:
        if debug == 2:
            print(tagged_address)

        cleaning_note = cleaning_note + "|usaddressType=" + address_type

        if debug == 1:
            print("Type of tagged addresss: " + address_type)

        if address_type == 'Street Address':
            # Collect the street parts in output order and join once, so a
            # missing AddressNumber does not leave a leading space.
            parts = []
            for label in ('AddressNumber',
                          'StreetNamePreType',
                          'StreetNamePreDirectional',
                          'StreetName',
                          'StreetNamePostDirectional'):
                if label in tagged_address:
                    parts.append(tagged_address[label])

            # Keep the post type only when usaddress recognizes it as a
            # street-name word.
            if 'StreetNamePostType' in tagged_address:
                post_type = tagged_address['StreetNamePostType']
                if post_type.lower() in usaddress.STREET_NAMES:
                    parts.append(post_type)

            # An occupancy identifier is kept only when it has no occupancy
            # type; 'Apt D'-style pairs are dropped entirely.
            if 'OccupancyIdentifier' in tagged_address:
                if debug == 1:
                    print("we have an occupancy id")
                if 'OccupancyType' not in tagged_address:
                    if debug == 1:
                        print("we do not have an occ type")
                    parts.append(tagged_address['OccupancyIdentifier'])

            if 'PlaceName' in tagged_address:
                parts.append(tagged_address['PlaceName'])

            paddr = " ".join(parts)

            if paddr == "":
                paddr = address  # set it back to the pre-cleaned input
                cleaning_note = cleaning_note + "|parsed usaddress.tag returned no string so resetting"
        else:
            paddr = address
            if debug == 1:
                print("Returning pre-cleaned address: " + paddr)
            cleaning_note = cleaning_note + "|returning pre-cleaned address"

    if debug > 0:
        print("Cleaning note: " + cleaning_note)

    counter = counter + 1

    return paddr, cleaning_note

## Goal - normalize Street part of the address
#### City, state, and zip should be fine

## Read in the data to be cleaned

First identify one or more CSV files to process.

In [37]:
# List the CSV files to process as base names, without the ".csv" extension.
# Examples for one or several files:
#my_files = ["file1", "file2"]
#my_files = ["file1"]
my_files = ['sample_addresses']
# Echo the path of each file that will be read from the current directory.
for i in my_files:
    print("Address file to be processed: ./"+ i +".csv")

Address file to be processed: ./sample_addresses.csv


### Read in the file or files into one dataframe.

In [38]:
# Read every listed file and stack the results into a single dataframe.
frames = []
for i in my_files:
    in_file = i + ".csv"
    print("Processing File: ", in_file)

    # FORMAT - Update to match your data!!!
    # Read these columns as strings so that zip codes keep their leading zeros.
    frames.append(
        pd.read_csv(
            in_file,
            encoding="latin-1",
            dtype={"id": str, "address": str, "city": str, 'zip': str, 'state': str},
        )
    )

# Concatenate rather than overwrite, so that rows from every file are kept;
# ignore_index gives the combined frame one continuous 0..n-1 index.
df = pd.concat(frames, ignore_index=True)
    

Processing File:  sample_addresses.csv


In [39]:
# Number of rows read into the dataframe.
len(df.index)

10

In [40]:
# Preview the first rows of the input data.
df.head()

Unnamed: 0,id,address,city,zip,state
0,Wah Fay Liquors,2101 eigth Ave,Oakland,94606,CA
1,Vision Liquor,1615 Macarthur Blvd,Oakland,94602,CA
2,Souza's Liquors,394 12th Apt D,Oakland,94607,CA
3,Tk Liquors,1500 23th Ave,Oakland,94606,CA
4,Quadriga Wines Inc,6193 Ridgemont Dr PO Box 1212,Oakland,94619,CA


### Clean the Addresses with get_clean_street Function

Cleans only the column with the **street** component of the address
and adds two columns to the dataframe:

- `clean_street` which has the cleaned address (single line format) 
- `clean_notes` any notes generated by the cleaning function.

*Be sure to set the correct column to be cleaned - needs to match column label in your dataframe!*

In [41]:
# get_clean_street returns a (street, note) tuple for each address; zip(*...)
# splits that sequence of pairs into the two new columns.
df['clean_street'], df['clean_notes'] = zip(*df['address'].map(get_clean_street))

In [42]:
# Preview the first 10 rows, including the new clean_street / clean_notes columns.
df.head(10)

Unnamed: 0,id,address,city,zip,state,clean_street,clean_notes
0,Wah Fay Liquors,2101 eigth Ave,Oakland,94606,CA,2101 eigth Ave,|usaddressType=Street Address
1,Vision Liquor,1615 Macarthur Blvd,Oakland,94602,CA,1615 Macarthur Blvd,|usaddressType=Street Address
2,Souza's Liquors,394 12th Apt D,Oakland,94607,CA,394 12th,|usaddressType=Street Address
3,Tk Liquors,1500 23th Ave,Oakland,94606,CA,1500 23th Ave,|usaddressType=Street Address
4,Quadriga Wines Inc,6193 Ridgemont Dr PO Box 1212,Oakland,94619,CA,6193 Ridgemont Dr,|usaddressType=Street Address
5,Bev Mo,525 W. Embarcadero,Oakland,94607,CA,525 W Embarcadero,|usaddressType=Street Address
6,Fairfax Liquor,5403 Foothill Blvd,Oakland,94601,CA,5403 Foothill Blvd,|usaddressType=Street Address
7,Saleen Market,1200 78th Ave,Oakland,94621,CA,1200 78th Ave,|usaddressType=Street Address
8,Park Liquors,828 Franklin Street,Oakland,94607,CA,828 Franklin Street,|usaddressType=Street Address
9,Los Camellos,5913 International Blvd,Oakland,94621,CA,5913 International Blvd,|usaddressType=Street Address


In [25]:
# Sort the rows in place by city, then zip, which will make geocoding faster
# UPDATE TO MATCH COLUMN NAMES IN YOUR DATA
df.sort_values(by=['city', 'zip'], inplace=True)


### Save output to file

Assumes a subfolder called `cleaned`.


In [None]:
# Write to csv for geocoding
# NOTE(review): `i` is not set in this cell; it is presumably the loop variable
# left over from an earlier cell, so the output is named after the last entry
# of my_files - confirm this is intended when processing several files.
out_file = "./cleaned/"+ i +"_cleaned.csv"
# index=False leaves the dataframe index out of the written file.
df.to_csv(out_file, index=False)

In [None]:
# Preview the first rows of the dataframe that was written out.
df.head()

In [None]:
# List the contents of the output folder (IPython shell escape).
!ls ./cleaned

In [None]:
# Count the lines of each CSV file in the parent directory (IPython shell escape).
# NOTE(review): this looks in ../ rather than ./ or ./cleaned - confirm the intended path.
!wc -l ../*.csv

### DONE

---
Created by Patty Frontiera, UC Berkeley
Last updated Dec 19, 2018