Downloaded OpenStreetMap data for the Berkeley area using the Overpass API query form:
http://overpass-api.de/query_form.html
With query:
(node(37.7983,-122.3504,37.8873,-122.1929);<;);out meta;

Saved the file as berkeley.osm

In [1]:
import xml.etree.cElementTree as ET
import re
from collections import defaultdict
import pprint
import codecs
import json

In [2]:
# Path of the OSM XML extract that the audit reads.
osm_file = "berkeley.osm"

In [3]:
# Street types accepted without comment when they end a street name.
expected_street_type_list = ['Street', 'Avenue', 'Boulevard', 'Drive', 'Court',
                             'Place', 'Alameda', 'Broadway', 'Road', 'Parkway',
                             'Way', 'Plaza', 'Square', 'Telegraph']

# Trailing run of non-space characters that starts at a word boundary,
# optionally ending in a period (e.g. 'Ave' in 'Shattuck Ave').
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Punctuation that is flagged when it appears anywhere in a street name.
special_chars_re = re.compile(r'[=\+/&<>;"\?%#$@\,\.]')

# 'Btwn ', 'btwn ', 'Between ' and similar spellings followed by a space.
abbreviations_re = re.compile(r'[Bb]e?twe?e?n ')

# An uppercase N, S, E or W followed by whitespace.
cardinal_direction_re = re.compile(r'[\s]*[NSEW][\s]+')

# Find numbers that start street names, except for ordinals such as
# 1st, 2nd, 3rd, 4th, 5th Street. A bare leading number probably belongs
# in the house-number field instead.
starts_numeric_re = re.compile(r'^[\d]+')
numbered_street_re = re.compile(r'^[\d]+(st|nd|rd|th)')

# Leading run of digits in a value.
# NOTE(review): despite its name this matches the *numeric* prefix, so every
# value that starts with a digit is reported by audit_housenumber - confirm
# whether a non-digit pattern was intended.
non_numeric_re = re.compile(r'^(\d+)')

# Abbreviation -> full word substitutions for cleaning street names.
# 'btwn' previously mapped to itself, so the lowercase abbreviation was
# never expanded.
word_replace = {'Btwn': 'Between',
                'btwn': 'between',
                'St': 'Street',
                'St.': 'Street',
                'Ct': 'Court',
                'Ct.': 'Court',
                'Pl': 'Plaza',
                'Pl.': 'Plaza',
                'Ave': 'Avenue',
                'Ave.': 'Avenue',
                'Sq': 'Square',
                'Sq.': 'Square'}

In [4]:
def is_street_name(elem):
    """Return True when *elem*'s ``k`` attribute is ``addr:street``.

    The attribute is read with ``elem.get`` (as is_lanes and is_amenity do),
    so an element without a ``k`` attribute yields False instead of raising
    KeyError.
    """
    return elem.get('k') == "addr:street"

In [5]:
def is_postal(elem):
    """Return True when *elem*'s ``k`` attribute is ``addr:postcode``.

    The attribute is read with ``elem.get`` (as is_lanes and is_amenity do),
    so an element without a ``k`` attribute yields False instead of raising
    KeyError.
    """
    return elem.get('k') == "addr:postcode"

In [6]:
def is_housenumber(elem):
    """Return True when *elem*'s ``k`` attribute is ``addr:housenumber``.

    The attribute is read with ``elem.get`` (as is_lanes and is_amenity do),
    so an element without a ``k`` attribute yields False instead of raising
    KeyError.
    """
    return elem.get('k') == "addr:housenumber"

In [7]:
def is_lanes(elem):
    """Tell whether *elem* carries the key ``lanes`` in its ``k`` attribute."""
    key = elem.get('k')
    return key == "lanes"

In [8]:
def is_amenity(elem):
    """Tell whether *elem* carries the key ``amenity`` in its ``k`` attribute."""
    key = elem.get('k')
    return key == "amenity"

In [9]:
def audit_street(street_dict, val):
    """File the street name *val* under every audit pattern it trips.

    *street_dict* maps a matched fragment to the set of street names that
    contained it. A name is recorded under:

    * its trailing token, when that token is not in
      ``expected_street_type_list``;
    * the first match of each of ``special_chars_re``, ``abbreviations_re``
      and ``cardinal_direction_re``;
    * its leading number, when ``starts_numeric_re`` matches but
      ``numbered_street_re`` does not.
    """
    type_match = street_type_re.search(val)
    if type_match is not None:
        ending = type_match.group()
        if ending not in expected_street_type_list:
            street_dict[ending].add(val)

    # These three checks share the same shape: key by whatever matched.
    for pattern in (special_chars_re, abbreviations_re, cardinal_direction_re):
        hit = pattern.search(val)
        if hit is not None:
            street_dict[hit.group()].add(val)

    leading_number = starts_numeric_re.search(val)
    is_ordinal = numbered_street_re.search(val) is not None
    if leading_number is not None and not is_ordinal:
        street_dict[leading_number.group()].add(val)

In [10]:
def audit_postal(audit_dict, postal):
    """Group the postcode string *postal* in *audit_dict* by its length."""
    audit_dict[len(postal)].add(postal)

In [11]:
def audit_housenumber(audit_dict, val):
    """Record *val* under the text that ``non_numeric_re`` matches in it.

    Values in which the pattern finds nothing are ignored.
    """
    found = non_numeric_re.search(val)
    if found is None:
        return
    audit_dict[found.group()].add(val)

In [12]:
def audit_lanes(audit_dict, val):
    """Store the lanes value *val* in *audit_dict*, keyed by itself."""
    audit_dict.update({val: val})

In [13]:
def audit_amenity(audit_dict, val):
    """Store the amenity value *val* in *audit_dict*, keyed by itself."""
    audit_dict.update({val: val})

In [14]:
def audit(osm_file, counter_max=500000):
    """Audit selected tag values of the ``way`` and ``node`` elements of an OSM file.

    Args:
        osm_file: path (or file object) handed to ``ET.iterparse``.
        counter_max: parsing stops once more than this many ``way``/``node``
            elements have been processed. Defaults to 500000, the limit that
            used to be hard-coded.

    Returns:
        A tuple ``(street_d, postal_d, housenumber_d, lanes_d, amenity_d)``
        of ``defaultdict(set)`` objects filled by the matching ``audit_*``
        helper for the keys ``addr:street``, ``addr:postcode``,
        ``addr:housenumber``, ``lanes`` and ``amenity``.
    """
    counter = 0
    street_d = defaultdict(set)
    postal_d = defaultdict(set)
    housenumber_d = defaultdict(set)
    lanes_d = defaultdict(set)
    amenity_d = defaultdict(set)

    # Listen for "end" events: on "start" iterparse only guarantees that the
    # opening tag has been read, so the child <tag> elements may not be
    # attached yet and would be silently skipped.
    for _event, elem in ET.iterparse(osm_file, events=("end",)):
        if elem.tag not in ("way", "node"):
            continue

        counter += 1
        for tag in elem.iter("tag"):
            val = tag.attrib['v']
            if is_street_name(tag):
                audit_street(street_d, val)

            if is_postal(tag):
                audit_postal(postal_d, val)

            if is_housenumber(tag):
                audit_housenumber(housenumber_d, val)

            if is_lanes(tag):
                audit_lanes(lanes_d, val)

            if is_amenity(tag):
                audit_amenity(amenity_d, val)

        # The element is fully audited; drop its children and attributes so
        # memory does not grow with the size of the file.
        elem.clear()

        if counter > counter_max:
            break

    return (street_d, postal_d, housenumber_d, lanes_d, amenity_d)

In [15]:
if __name__ == '__main__':
    # Run the full audit and expose each result dictionary at module level
    # so the following notebook cells can display them.
    street_d, postal_d, housenumber_d, lanes_d, amenity_d = audit(osm_file)

Some expected street types are Plaza, Court, and Square.  The abbreviations Ct and Pl can be replaced by Court and Plaza.

I noticed that some street names are more like descriptions of intersections or of the boundary between two streets.  The abbreviation [Bb]twn can be replaced with 'between'.  The @ can be replaced with 'at'.

Also, a few street names include the street number, such as 111 Grand Avenue, 3605 Telegraph.  I may want to put the number in the addr:housenumber field.

In [16]:
street_d

defaultdict(set,
            {'&': {'Kempton Way & Fairmont Avenue'},
             '.': {'Frank H. Ogawa Plaza', 'Thomas L. Berkley Way'},
             '/': {'San Francisco/Oakland Bridge Toll Pl',
              'Under I-580 Btwn Fruitvale / Champion St'},
             '111': {'111 Grand Avenue'},
             '3605': {'3605 Telegraph'},
             ';': {'La Salle Ave;Crest Road', 'Portal Ave;Florada Avenue'},
             '@': {'Under I-880 @ 7th St & Linden St'},
             'Alley': {"Kahn's Alley"},
             'Ave': {'Shattuck Ave'},
             'Btwn ': {'Under I-580 Btwn Fruitvale / Champion St'},
             'Circle': {'Columbia Circle',
              'Croydon Circle',
              'Harding Circle',
              'Saint James Circle',
              'Tyson Circle',
              'Wilson Circle'},
             'Crescent': {'Clarendon Crescent'},
             'Ct': {'Conrad Ct'},
             'Floor': {'Washington St 2nd Floor'},
             'Freeway': {'MacArthur Freeway

For postcode, some include the state abbreviation 'CA' followed by the 5-digit postal code, some have the 'ca' state only, and some have the hyphenated zip code extension with four additional digits.  If there are non-numeric values in postcode, I will remove the letters.  I'll also remove the hyphen and extension.

In [17]:
postal_d

defaultdict(set,
            {2: {'ca'},
             5: {'93710',
              '94109',
              '94110',
              '94601',
              '94602',
              '94605',
              '94606',
              '94607',
              '94608',
              '94609',
              '94610',
              '94611',
              '94612',
              '94618',
              '94702',
              '94703',
              '94704',
              '94705',
              '94706',
              '94707',
              '94708',
              '94709',
              '94710',
              '94720',
              '95476'},
             8: {'CA 94607'},
             10: {'94612-2202', '94720-1076'}})

For housenumber, some include alpha chars and dashes to represent a range, or are comma separated.  For those that are '-' or ',' separated, it might make sense to turn that into a list.

In [18]:
housenumber_d

defaultdict(set,
            {'4020': {'4020'},
             '4029': {'4029'},
             '5980': {'5980'},
             '5987': {'5987'},
             '344': {'344'},
             '345': {'345'},
             '346': {'346'},
             '347': {'347'},
             '340': {'340'},
             '341': {'341'},
             '342': {'342'},
             '343': {'343'},
             '348': {'348'},
             '349': {'349'},
             '6150': {'6150'},
             '6155': {'6155'},
             '1653': {'1653'},
             '2318': {'2318'},
             '2319': {'2319'},
             '2317': {'2317'},
             '2315': {'2315'},
             '2310': {'2310'},
             '2311': {'2311', '2311A'},
             '5854': {'5854'},
             '5855': {'5855'},
             '5856': {'5856'},
             '298': {'298'},
             '299': {'299'},
             '296': {'296'},
             '297': {'297'},
             '294': {'294'},
             '295': {'295'},
             '

For lanes, one of the values is 18 lanes.  This refers to the toll area on the I-80 highway in Oakland, before the San Francisco-Oakland Bay Bridge.

http://www.openstreetmap.org/way/236348366#map=13/37.8247/-122.3141

In [19]:
lanes_d

defaultdict(set,
            {'1': '1',
             '18': '18',
             '2': '2',
             '3': '3',
             '4': '4',
             '5': '5',
             '6': '6',
             '7': '7',
             '8': '8'})

Looking at the node associated with this way, it's a toll booth, so 18 lanes probably makes sense.  It's probably the toll booth for the San Francisco Bay Bridge.

In [110]:
  <node id="293598417" lat="37.8247804" lon="-122.3138369" version="6" timestamp="2015-04-26T17:19:50Z" changeset="30510468" uid="616774" user="mueschel">
    <tag k="barrier" v="toll_booth"/>
    <tag k="lanes" v="18"/>

SyntaxError: invalid syntax (<ipython-input-110-4249a41f3ebc>, line 1)

Amenity has some values that are similar and can probably be standardized:

Standardize these
 'car_share'
 'car_sharing'
 
 These are different: a post_box is probably just a box for dropping off letters, whereas a post_office has a name and refers to a post office building with employees
 'post_box'
 'post_office'
 
 Simplify these to just parking
 'parking'
 'parking_entrance'
 'parking_space'
 
 Standardize these
 'car_share'
 'car_sharing'

The values 'college' and 'university' are used somewhat interchangeably, so it may make sense to put these in a sub-dictionary

 This may be useful information for taxi drivers
 'toilets'
 


In [20]:
amenity_d

defaultdict(set,
            {'animal_boarding': 'animal_boarding',
             'animal_shelter': 'animal_shelter',
             'arts_centre': 'arts_centre',
             'atm': 'atm',
             'bank': 'bank',
             'bar': 'bar',
             'barber': 'barber',
             'bbq': 'bbq',
             'bench': 'bench',
             'bicycle_parking': 'bicycle_parking',
             'bicycle_rental': 'bicycle_rental',
             'bicycle_repair_station': 'bicycle_repair_station',
             'biergarten': 'biergarten',
             'boat_rental': 'boat_rental',
             'bus_station': 'bus_station',
             'cafe': 'cafe',
             'car_rental': 'car_rental',
             'car_share': 'car_share',
             'car_sharing': 'car_sharing',
             'car_wash': 'car_wash',
             'casual carpoool pickup': 'casual carpoool pickup',
             'childcare': 'childcare',
             'chiropractor': 'chiropractor',
             'cinema': 'cinema',
   

Toilets are either denoted by k="amenity" v="toilets", or in the case of BART public train stations, by k="toilets" v="yes"