Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1102c84
Showing
6 changed files
with
318 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pkg | ||
doc | ||
Manifest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
h4. Copyright and License | ||
|
||
The MIT License | ||
|
||
Copyright (c) 2008 Jim Garvin | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
h1. AddressExtractor | ||
|
||
Find and/or replace mailing addresses in strings. | ||
|
||
h2. Examples | ||
|
||
<pre><code> | ||
string = <<EOF | ||
Please send the package to 123 Foo St., Someplace FL | ||
|
||
My phone number is 123-1234 and St. Marc of Israel can be reached | ||
via mail at: | ||
123 Goob Avenue | ||
Apt 123 | ||
Nice Town CA 123456 | ||
EOF | ||
|
||
# Find first address | ||
AddressExtractor.first_address(string) # => { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" } | ||
|
||
# Find all addresses | ||
AddressExtractor.find_addresses(string) # => | ||
# [ | ||
# { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" }, | ||
# { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "123456" } | ||
# ] | ||
|
||
# Do a gsub on first address | ||
new_string = AddressExtractor.replace_first_address(string) do |address_hash, address_string| | ||
map_link_to(address_string) | ||
end | ||
|
||
# Do a gsub on all addresses | ||
new_string = AddressExtractor.replace_addresses(string) do |address_hash, address_string| | ||
map_link_to(address_string) | ||
end | ||
</code></pre> | ||
|
||
h3. About | ||
|
||
Written by Jim Garvin at RubyConf '08 at the request of Chris Murphy and Ryan McGeary so they could use it in Yarp.com. | ||
|
||
You can use it, too. | ||
|
||
The address-finding regex may be a bit naive; I'll gladly accept pull requests that add to the test data and tests. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
require 'rubygems' | ||
require 'rake' | ||
require 'echoe' | ||
|
||
# Gem packaging configuration for address_extractor 0.1.0, handed to Echoe.
Echoe.new('address_extractor', '0.1.0') do |config|
  config.description              = "Give it text. It finds addresses in it."
  config.url                      = "http://github.com/coderifous/address_extractor"
  config.author                   = "Jim Garvin"
  config.email                    = "jim at thegarvin dot com"
  config.ignore_pattern           = ["tmp/*", "script/*"]
  config.development_dependencies = []
end
|
||
# Load every *.rake file found in the tasks/ directory beside this file,
# in sorted filename order.
rake_task_glob = "#{File.dirname(__FILE__)}/tasks/*.rake"
Dir[rake_task_glob].sort.each do |rake_file|
  load rake_file
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
# Finds US-style mailing addresses embedded in free-form text and optionally
# replaces them.
#
# An address is reported as a Hash holding any of the keys :street1, :street2,
# :city, :state and :zip. Keys whose part was not found are absent. A match is
# only considered an address when it has a street line plus either a zip code
# or both a city and a state (see useful_address?).
class AddressExtractor
  class << self

    # Returns the first useful address in +string+ as a Hash, or nil when the
    # string contains none. Regex matches that are not useful addresses (for
    # example a phone number followed by a few words) are skipped.
    def first_address(string)
      string.scan(ADDRESS_PATTERN) do |captures|
        address = hashify_results(captures)
        return address if address
      end
      nil
    end

    # Returns an Array with one Hash per useful address found in +string+,
    # in order of appearance. The Array is empty when nothing is found.
    def find_addresses(string)
      string.scan(ADDRESS_PATTERN).collect { |a| hashify_results(a) }.compact
    end

    # Returns a copy of +string+ in which the first useful address has been
    # replaced by the block's result. The block receives the address Hash and
    # the exact matched text. The string is returned unchanged (and the block
    # is never called) when no useful address exists.
    def replace_first_address(string)
      replaced = false
      string.gsub(ADDRESS_PATTERN) do |match|
        # Only the first useful match is handed to the block; everything else,
        # including unusable matches that come before it, is left untouched.
        address = replaced ? nil : hashify_results(Regexp.last_match.captures)
        if address
          replaced = true
          yield(address, match)
        else
          match
        end
      end
    end

    # Returns a copy of +string+ in which every useful address has been
    # replaced by the block's result. The block receives the address Hash and
    # the exact matched text. Matches that are not useful addresses are kept.
    def replace_addresses(string)
      string.gsub(ADDRESS_PATTERN) do |match|
        # Use the captures of this very match instead of re-scanning the
        # matched text, which would also clobber the last-match globals.
        address = hashify_results(Regexp.last_match.captures)
        address ? yield(address, match) : match
      end
    end

    # Converts the capture groups of one ADDRESS_PATTERN match into an address
    # Hash keyed by CAPTURE_MAP, with surrounding whitespace removed from each
    # value. Returns nil when +matches+ is nil or the result is not a useful
    # address.
    def hashify_results(matches)
      return nil unless matches

      result = {}
      CAPTURE_MAP.each_with_index do |field, index|
        value = matches[index]
        result[field] = value.to_s.strip if value
      end
      useful_address?(result) ? result : nil
    end

    # True when +hash+ has a street line and either a zip code or both a city
    # and a state.
    def useful_address?(hash)
      !!(hash && hash[:street1] && (hash[:zip] || (hash[:city] && hash[:state])))
    end

  end

  # Field name for each capture group of ADDRESS_PATTERN, in group order.
  # :zip appears twice because the pattern captures the zip code in a different
  # group depending on whether a city and state precede it.
  CAPTURE_MAP = [ :street1, :street2, :city, :state, :zip, :zip ].freeze

  # State and territory names with their two-letter abbreviations.
  STATES = <<-EOF
    ALABAMA                         AL
    ALASKA                          AK
    AMERICAN SAMOA                  AS
    ARIZONA                         AZ
    ARKANSAS                        AR
    CALIFORNIA                      CA
    COLORADO                        CO
    CONNECTICUT                     CT
    DELAWARE                        DE
    DISTRICT OF COLUMBIA            DC
    FEDERATED STATES OF MICRONESIA  FM
    FLORIDA                         FL
    GEORGIA                         GA
    GUAM                            GU
    HAWAII                          HI
    IDAHO                           ID
    ILLINOIS                        IL
    INDIANA                         IN
    IOWA                            IA
    KANSAS                          KS
    KENTUCKY                        KY
    LOUISIANA                       LA
    MAINE                           ME
    MARSHALL ISLANDS                MH
    MARYLAND                        MD
    MASSACHUSETTS                   MA
    MICHIGAN                        MI
    MINNESOTA                       MN
    MISSISSIPPI                     MS
    MISSOURI                        MO
    MONTANA                         MT
    NEBRASKA                        NE
    NEVADA                          NV
    NEW HAMPSHIRE                   NH
    NEW JERSEY                      NJ
    NEW MEXICO                      NM
    NEW YORK                        NY
    NORTH CAROLINA                  NC
    NORTH DAKOTA                    ND
    NORTHERN MARIANA ISLANDS        MP
    OHIO                            OH
    OKLAHOMA                        OK
    OREGON                          OR
    PALAU                           PW
    PENNSYLVANIA                    PA
    PUERTO RICO                     PR
    RHODE ISLAND                    RI
    SOUTH CAROLINA                  SC
    SOUTH DAKOTA                    SD
    TENNESSEE                       TN
    TEXAS                           TX
    UTAH                            UT
    VERMONT                         VT
    VIRGIN ISLANDS                  VI
    VIRGINIA                        VA
    WASHINGTON                      WA
    WEST VIRGINIA                   WV
    WISCONSIN                       WI
    WYOMING                         WY
  EOF

  # Regex source (a String) alternating over every state name and abbreviation.
  # ADDRESS_PATTERN is compiled in extended (/x) mode, where literal whitespace
  # in the pattern is ignored, so the spaces inside multi-word names are
  # rewritten as \s+ to keep names such as "NEW YORK" matchable.
  STATE_REGEX = STATES.split(/\n/).collect { |line|
    line.scan(/(\w.*\w)\s*([A-Z]{2})\s*$/)
  }.flatten.collect { |token| token.gsub(/\s+/) { '\s+' } }.join("|")

  # Secondary unit designator names with their abbreviations.
  SECONDARY_UNIT_DESIGNATORS = <<-EOF
    APARTMENT   APT
    BASEMENT    BSMT
    BUILDING    BLDG
    DEPARTMENT  DEPT
    FLOOR       FL
    FRONT       FRNT
    HANGAR      HNGR
    LOBBY       LBBY
    LOT         LOT
    LOWER       LOWR
    OFFICE      OFC
    PENTHOUSE   PH
    PIER        PIER
    REAR        REAR
    ROOM        RM
    SIDE        SIDE
    SLIP        SLIP
    SPACE       SPC
    STOP        STOP
    SUITE       STE
    TRAILER     TRLR
    UNIT        UNIT
    UPPER       UPPR
  EOF

  # Regex source (a String) alternating over every designator and abbreviation.
  SECONDARY_UNIT_DESIGNATORS_REGEX = SECONDARY_UNIT_DESIGNATORS.split(/\n/).collect { |line|
    line.scan(/(\w+)\s*(\w+)\s*$/)
  }.flatten.join("|")

  # Regex source (a String) for a zip code: ZIP+4, the six-digit form this
  # library has always accepted, or a plain five-digit ZIP.
  ZIP_REGEX = '\d{5}-\d{4}|\d{6}|\d{5}'

  # Matches a street line, an optional secondary unit, and then either
  # "city state [zip]" or an optional lone zip. Capture groups are listed in
  # CAPTURE_MAP order. Because the trailing zip is optional, a bare
  # "number + words" run also matches; callers filter with useful_address?.
  ADDRESS_PATTERN = /
    (
      \d+                           # A few numbers
      \s+
      (?:[A-Za-z'.-]+\s?){1,3}      # Followed by a street name
    )
    \s* ,? \s*
    (
      (?:\d+\s+)?                   # a secondary unit, optionally
      \b(?:#{SECONDARY_UNIT_DESIGNATORS_REGEX})\b   # whole words only
      (?:\s+\d+)?
    )?
    \s* ,? \s*                      # a comma, optionally
    (?:
      (?:
        ((?:[A-Za-z]+\s?){1,3})     # city
        \s+
        \b(#{STATE_REGEX})\b        # state
        \s* ,? \s*                  # a comma, optionally
        (#{ZIP_REGEX})?             # a zip code, optionally
      )
      |                             # or, instead of city and state
      (#{ZIP_REGEX})?               # a lone zip code will do
    )
  /xi
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
$: << File.dirname(__FILE__)+"/../lib" | ||
|
||
require 'test/unit' | ||
require 'address_extractor.rb' | ||
|
||
# Unit tests for AddressExtractor, driven by the DATA1 sample text.
class AddressExtractorTest < Test::Unit::TestCase

  # Shared assertions describing the two addresses expected from DATA1.
  module Helpers
    # Checks the hash for "123 Foo St., Someplace FL" (no unit, no zip).
    def assert_first_address(a)
      assert_not_nil(a)
      assert_equal("123 Foo St.", a[:street1])
      assert_nil(a[:street2])
      assert_equal("Someplace", a[:city])
      assert_equal("FL", a[:state])
      assert_nil(a[:zip])
    end

    # Checks the raw matched text handed to a replacement block.
    def assert_first_address_string(string)
      assert_match(/^123 Foo St\., Someplace FL\s*$/, string)
    end

    # Checks the hash for the multi-line "123 Goob Avenue" address.
    def assert_second_address(a)
      assert_not_nil(a)
      assert_equal("123 Goob Avenue", a[:street1])
      assert_equal("Apt 123", a[:street2])
      assert_equal("Nice Town", a[:city])
      assert_equal("CA", a[:state])
      assert_equal("123456", a[:zip])
    end
  end
  include Helpers

  def test_first_address_extraction
    assert_first_address(AddressExtractor.first_address(DATA1))
  end

  def test_find_addresses
    found = AddressExtractor.find_addresses(DATA1)
    assert_first_address(found[0])
    assert_second_address(found[1])
  end

  def test_replace_first_address
    replaced = AddressExtractor.replace_first_address(DATA1) do |hash, text|
      assert_first_address(hash)
      assert_first_address_string(text)
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, replaced)
  end

  def test_replace_addresses
    replaced = AddressExtractor.replace_addresses(DATA1) { |_hash, _text| "skidoosh" }
    assert_match(/Please send the package to skidoosh/, replaced)
    assert_match(/via mail at:\n skidoosh/, replaced)
  end
end
|
||
# Sample text shared by the tests. The three address lines after
# "via mail at:" are indented by one space: test_replace_addresses expects a
# space between that newline and the replaced address.
DATA1 = <<EOF
Please send the package to 123 Foo St., Someplace FL
My phone number is 123-1234 and St. Marc of Israel can be reached
via mail at:
 123 Goob Avenue
 Apt 123
 Nice Town CA 123456
EOF