Permalink
Browse files

Working on refactoring Normalic::Address.parse

  • Loading branch information...
mkscrg committed Oct 10, 2011
1 parent 6f4c3aa commit 7448f6eecf6e90f2e968fa0e340a7af58248bfc6
Showing with 135 additions and 86 deletions.
  1. +82 −82 lib/constants.rb
  2. +53 −4 lib/normalic.rb
View
@@ -15,88 +15,88 @@ class Address
# Reverse lookup obtained by inverting Directional (defined outside this view;
# presumably a Hash, so keys and values swap -- confirm).
Directional_code = Directional.invert
# Lookup from a lower-case state / territory / province name to its two-letter
# postal code. Several multi-word names also have a run-together alias
# (e.g. "newyork"). In this diff the "-" lines are the previous upper-case
# codes and the "+" lines are the lower-case codes that replace them.
StateCodes = {
- "alabama" => "AL",
- "alaska" => "AK",
- "american samoa" => "AS",
- "arizona" => "AZ",
- "arkansas" => "AR",
- "california" => "CA",
- "colorado" => "CO",
- "connecticut" => "CT",
- "delaware" => "DE",
- "district of columbia" => "DC",
- "districtofcolumbia" => "DC",
- "federated states of micronesia" => "FM",
- "florida" => "FL",
- "georgia" => "GA",
- "guam" => "GU",
- "hawaii" => "HI",
- "idaho" => "ID",
- "illinois" => "IL",
- "indiana" => "IN",
- "iowa" => "IA",
- "kansas" => "KS",
- "kentucky" => "KY",
- "louisiana" => "LA",
- "maine" => "ME",
- "marshall islands" => "MH",
- "maryland" => "MD",
- "massachusetts" => "MA",
- "michigan" => "MI",
- "minnesota" => "MN",
- "mississippi" => "MS",
- "missouri" => "MO",
- "montana" => "MT",
- "nebraska" => "NE",
- "nevada" => "NV",
- "new hampshire" => "NH",
- "newhampshire" => "NH",
- "new jersey" => "NJ",
- "newjersey" => "NJ",
- "new mexico" => "NM",
- "newmexico" => "NM",
- "new york" => "NY",
- "newyork" => "NY",
- "north carolina" => "NC",
- "northcarolina" => "NC",
- "north dakota" => "ND",
- "northdakota" => "ND",
- "northern mariana islands" => "MP",
- "ohio" => "OH",
- "oklahoma" => "OK",
- "oregon" => "OR",
- "palau" => "PW",
- "pennsylvania" => "PA",
- "puerto rico" => "PR",
- "rhode island" => "RI",
- "south carolina" => "SC",
- "southcarolina" => "SC",
- "south dakota" => "SD",
- "southdakota" => "SD",
- "tennessee" => "TN",
- "texas" => "TX",
- "utah" => "UT",
- "vermont" => "VT",
- "virgin islands" => "VI",
- "virginia" => "VA",
- "washington" => "WA",
- "west virginia" => "WV",
- "westvirginia" => "WV",
- "wisconsin" => "WI",
- "wyoming" => "WY",
- "alberta" => "AB",
- "british columbia" => "BC",
- "newfoundland and labrador" => "NL",
- "nova scotia" => "NS",
- "prince edward island" => "PE",
- "new brunswick" => "NB",
- "quebec" => "QC",
- "ontario" => "ON",
- "manitoba" => "MB",
- "saskatchewan" => "SK",
- "nunavut" => "NU",
- "northwest territories" => "NT",
- "yukon territory" => "YT"
+ "alabama" => "al",
+ "alaska" => "ak",
+ "american samoa" => "as",
+ "arizona" => "az",
+ "arkansas" => "ar",
+ "california" => "ca",
+ "colorado" => "co",
+ "connecticut" => "ct",
+ "delaware" => "de",
+ "district of columbia" => "dc",
+ "districtofcolumbia" => "dc",
+ "federated states of micronesia" => "fm",
+ "florida" => "fl",
+ "georgia" => "ga",
+ "guam" => "gu",
+ "hawaii" => "hi",
+ "idaho" => "id",
+ "illinois" => "il",
+ "indiana" => "in",
+ "iowa" => "ia",
+ "kansas" => "ks",
+ "kentucky" => "ky",
+ "louisiana" => "la",
+ "maine" => "me",
+ "marshall islands" => "mh",
+ "maryland" => "md",
+ "massachusetts" => "ma",
+ "michigan" => "mi",
+ "minnesota" => "mn",
+ "mississippi" => "ms",
+ "missouri" => "mo",
+ "montana" => "mt",
+ "nebraska" => "ne",
+ "nevada" => "nv",
+ "new hampshire" => "nh",
+ "newhampshire" => "nh",
+ "new jersey" => "nj",
+ "newjersey" => "nj",
+ "new mexico" => "nm",
+ "newmexico" => "nm",
+ "new york" => "ny",
+ "newyork" => "ny",
+ "north carolina" => "nc",
+ "northcarolina" => "nc",
+ "north dakota" => "nd",
+ "northdakota" => "nd",
+ "northern mariana islands" => "mp",
+ "ohio" => "oh",
+ "oklahoma" => "ok",
+ "oregon" => "or",
+ "palau" => "pw",
+ "pennsylvania" => "pa",
+ "puerto rico" => "pr",
+ "rhode island" => "ri",
+ "south carolina" => "sc",
+ "southcarolina" => "sc",
+ "south dakota" => "sd",
+ "southdakota" => "sd",
+ "tennessee" => "tn",
+ "texas" => "tx",
+ "utah" => "ut",
+ "vermont" => "vt",
+ "virgin islands" => "vi",
+ "virginia" => "va",
+ "washington" => "wa",
+ "west virginia" => "wv",
+ "westvirginia" => "wv",
+ "wisconsin" => "wi",
+ "wyoming" => "wy",
+ "alberta" => "ab",
+ "british columbia" => "bc",
+ "newfoundland and labrador" => "nl",
+ "nova scotia" => "ns",
+ "prince edward island" => "pe",
+ "new brunswick" => "nb",
+ "quebec" => "qc",
+ "ontario" => "on",
+ "manitoba" => "mb",
+ "saskatchewan" => "sk",
+ "nunavut" => "nu",
+ "northwest territories" => "nt",
+ "yukon territory" => "yt"
}
# Hash#map yields an Array, so this is a list of [code, name] pairs (not a Hash).
StateCodes_key = StateCodes.map{|k,v| [v, k]}
View
@@ -50,12 +50,16 @@ def []=(field_name, value)
end
end
- class AddressR
-
- end
-
# only handles U.S. addresses
class Address
# Words and abbreviations that introduce a unit designator: apt/apartment, box,
# building/bldg, dept/department, fl/floor, po/po box, rm/room, ste/suite,
# unt/unit.
UNIT_TYPE_REGEX = /ap(artmen)?t|box|building|bldg|dep(artmen)?t|fl(oor)?|po( box)?|r(oo)?m|s(ui)?te|un(i)?t/

# Per-field patterns keyed by field name.
#   :state - every StateCodes value (the codes) followed by every StateCodes
#            key (the names), as one big alternation.
#   :unit  - a unit-type word with an optional identifier (optionally prefixed
#            by "#") either before it or after it.
REGEXES = {
  :country => /usa/,
  :zipcode => /\d{5}(-\d{4})?/,
  :state   => Regexp.new([StateCodes.values.join('|'),
                          StateCodes.keys.join('|')].join('|')),
  :city    => /\w+(\s\w+)*/,
  :unit    => Regexp.new(format('((#?\w+\W+)?(%s))|((%s)(\W+#?\w+)?)',
                                UNIT_TYPE_REGEX.source,
                                UNIT_TYPE_REGEX.source))
}
attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode
@@ -104,6 +108,31 @@ def line1
"#{number}#{" " + direction if direction}#{" " + street if street}#{" " + type if type}"
end
# Work-in-progress replacement for the iterative parser.
#
# Normalizes a copy of +str+ (trims it, lower-cases it, turns line breaks into
# ", ", collapses runs of whitespace, drops periods) and then peels fields off
# the END of that copy, right to left: country, zipcode, state, city, unit.
#
# str - the raw address String. It is copied first and never mutated.
#
# Returns a new instance of the enclosing class built from the :city, :state
# and :zipcode that were found; any of them may be nil.
def self.parseR(str)
  # Trim BEFORE converting line breaks. Otherwise a trailing newline becomes a
  # trailing "," and every end-anchored detoken! call below fails to match.
  address = String.new(str).strip
  address.downcase!
  address = address.gsub("\n", ', ').gsub(/\s+/, ' ').delete('.')

  # The country is recognised only so that it gets out of the way.
  address.detoken!(REGEXES[:country])

  zipcode = address.detoken!(REGEXES[:zipcode])

  # A full state name is mapped to its code; a code (or nil) is kept as is.
  state = address.detoken!(REGEXES[:state])
  state = StateCodes[state] || state

  # A city known for the zipcode takes precedence over the parsed text.
  city = address.detoken!(REGEXES[:city])
  city_from_zip = zipcode && ZipCityMap[zipcode]
  city = city_from_zip if city_from_zip

  # The unit is stripped off but not kept: it is not passed to the
  # constructor below.
  address.detoken!(REGEXES[:unit])

  self.new(:city => city,
           :state => state,
           :zipcode => zipcode)
end
+
#Iteratively take chunks off of the string.
def self.parse(address)
address.strip!
@@ -185,4 +214,24 @@ def self.parse(address)
end
# Library-specific error type; a plain StandardError subclass with no extra
# behavior. NOTE(review): presumably raised on unparseable input -- the raise
# sites are not visible here.
class ParseError < StandardError
end
+
+ private
+
# Parsing helpers added directly to String.
# NOTE(review): this reopens a core class for every String in the process; a
# refinement or a private helper module would keep the footprint smaller.
String.class_eval do
  # Removes a trailing token from the receiver, in place.
  #
  # The token must match +regex+, sit at the end of the string (or of a line,
  # since the pattern is anchored with "$") and be preceded by at least one
  # non-word character. Those separator characters are removed with it.
  #
  # regex - Regexp describing the token. Its options (e.g. /i) are honoured.
  #
  # Returns the token without its leading separator, or nil (receiver left
  # untouched) when there is no such trailing token.
  def detoken!(regex)
    # The second argument of Regexp.new is the OPTIONS, not a pattern: pass
    # the source regex's own flags rather than its source text.
    tail = Regexp.new('\W+(' + regex.source + ')$', regex.options)
    removed = cut!(tail)
    # Capture 1 of the removed text is the token minus its separator.
    removed && removed[tail, 1]
  end

  # Deletes the first match of +regex+ from the receiver, in place.
  #
  # regex       - Regexp to look for.
  # match_index - which part of the match to delete and return: 0 (default)
  #               for the whole match, N for capture group N.
  #
  # Returns the deleted text, or nil (receiver left untouched) when the regex
  # does not match or the requested group did not take part in the match.
  def cut!(regex, match_index = 0)
    found = match(regex)
    # Read the text before mutating self. A group that did not participate
    # yields nil here and [nil, nil] from #offset, which must not be turned
    # into a range.
    token = found && found[match_index]
    return nil unless token

    first, last = found.offset(match_index)
    self[first...last] = ''
    token
  end
end
end

0 comments on commit 7448f6e

Please sign in to comment.