Permalink
Browse files

initial commit

  • Loading branch information...
0 parents commit 6923549b8547b96cb5558af2dfe0bf9259dcc0ed @ewheeler ewheeler committed Aug 16, 2011
Showing with 190 additions and 0 deletions.
  1. +43 −0 README
  2. +146 −0 get_countries_of_earth.py
  3. +1 −0 requirements.txt
43 README
@@ -0,0 +1,43 @@
+Python script to fetch current ISO 3166 country information
+and output a JSON document of combined country code information.
+Per-country JSON documents may be keyed by any of the fields below.
+
+
+Usage: get_countries_of_earth.py [options]
+
+Options:
+ -h, --help show this help message and exit
+ -o OUTFILE, --output=OUTFILE
+ write data to OUTFILE
+ -l, --list export objects as a list of objects
+ -k KEY, --key=KEY export objects as a dict of objects keyed by KEY
+
+
+Description of fields given per country, as defined by data sources (note that fields may be blank, depending on availability):
+ Entity: Country name as shown in "Administrative Subdivisions of Countries"
+ short_name_en: Country's official English short name
+ short_name_fr: Country's official French short name
+ ISO3166-1-Alpha-2: Alpha-2 codes from ISO 3166-1
+ ISO3166-1-Alpha-3: Alpha-3 codes from ISO 3166-1 (synonymous with World Bank Codes)
+ ISO3166-1-numeric: Numeric codes from ISO 3166-1
+ ITU: Codes assigned by the International Telecommunications Union
+ FIPS: Codes from the U.S. standard FIPS PUB 10-4
+ IOC: Codes assigned by the International Olympics Committee
+ FIFA: Codes assigned by the Fédération Internationale de Football Association
+ DS: Distinguishing signs of vehicles in international traffic (oval bumper sticker codes)
+ WMO: Country abbreviations used in weather reports from the World Meteorological Organization
+ GAUL: Global Administrative Unit Layers from the Food and Agriculture Organization
+ MARC: MAchine-Readable Cataloging codes from the Library of Congress
+ Dial: Country code from ITU-T recommendation E.164 (international dialing code), sometimes followed by area code
+ Independent: Country status, based on the CIA World Factbook
+
+
+Sources:
+ISO 3166 official English and French short names from:
+ http://www.iso.org/iso/country_codes/iso_3166_code_lists.htm
+
+Many country codes from:
+ http://www.statoids.com/wab.html
+
+Special thanks to Gwillim Law for his excellent statoids.com site (some of the field descriptions are excerpted from his site),
+which is more up-to-date than most similar resources and is much easier to scrape than multiple Wikipedia pages.
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim: ai ts=4 sts=4 et sw=4
+import codecs
+import urllib
+from optparse import OptionParser
+from lxml import html
+
+try:
+ import simplejson as json
+except ImportError:
+ import json
+
def _parse_iso_names(path, field, names):
    """Parse a semicolon-delimited iso.org country-name file into `names`.

    Each useful line looks like "COUNTRY NAME;A2", encoded as cp1252.
    `names` maps alpha-2 codes to dicts of name fields; the parsed name is
    stored under `field` (e.g. 'short_name_en').  A country seen here for
    the first time gets a blank 'short_name_en' so that key is always
    present even when only the French file lists it.  Returns `names`.
    """
    with open(path, "rU") as fin:
        for line in fin:
            # decode iso.org's windows (cp1252) encoding and strip
            # line endings, etc
            line = line.decode('cp1252').rstrip()
            # fields are semicolon delineated; split on the first one
            if ';' not in line:
                continue
            name, _, alpha2 = line.partition(';')
            if name and alpha2:
                names.setdefault(alpha2, {'short_name_en': ''})[field] = name
    return names


def _scrape_statoids_rows(doc, column_names, alpha2_key):
    """Extract the country-code rows from the parsed statoids.com page.

    The country code info is in a table whose trs have alternating classes
    of `e` and `o`, so both classes are collected.  Each row is zipped with
    `column_names`; the resulting dicts are keyed by their `alpha2_key`
    column and returned as one dict.
    """
    table_rows = {}
    for row_class in ('e', 'o'):
        for tr in doc.find_class(row_class):
            row = [td.text_content() for td in tr.iterchildren()]
            row_dict = dict(zip(column_names, row))
            table_rows[row_dict[alpha2_key]] = row_dict
    return table_rows


def fetch_and_write(options):
    """Fetch ISO 3166 names and statoids.com codes and dump them as JSON.

    Downloads the official English and French short-name lists from
    iso.org, scrapes the country-code table from statoids.com, merges the
    two per country, and writes the result to options.outfile as either a
    list (options.as_list) or a dict keyed by options.key.
    """
    # fetch ISO short names in English and French;
    # urllib.urlretrieve returns a tuple of (localfile, headers)
    iso_names_en = urllib.urlretrieve('http://www.iso.org/iso/list-en1-semic-3.txt')
    iso_names_fr = urllib.urlretrieve('http://www.iso.org/iso/list-fr1-semic.txt')

    # combined en and fr names:
    # {alpha2: {'short_name_en': en, 'short_name_fr': fr}}
    # english is parsed first so french merges into existing entries
    iso_names = _parse_iso_names(iso_names_en[0], 'short_name_en', {})
    iso_names = _parse_iso_names(iso_names_fr[0], 'short_name_fr', iso_names)

    # fetch and parse content of statoids.com country code page
    statoids_url = "http://www.statoids.com/wab.html"
    content = urllib.urlopen(statoids_url).read()
    doc = html.fromstring(content)

    # i dislike some of statoid's column names, so here i have renamed
    # a few to be more descriptive
    column_names = ["Entity", "ISO3166-1-Alpha-2", "ISO3166-1-Alpha-3",
                    "ISO3166-1-numeric", "ITU", "FIPS", "IOC", "FIFA", "DS",
                    "WMO", "GAUL", "MARC", "Dial", "is_independent"]
    alpha2_key = "ISO3166-1-Alpha-2"

    """
    # comment out the preceding two lines and
    # uncomment these lines to use statoids.com column names
    column_names = []
    alpha2_key = 'A-2'
    for tr in doc.find_class('hd'):
        for th in tr.iterchildren():
            column_names.append(th.text_content())
    """

    # dict of all table rows, keyed by alpha-2 code
    table_rows = _scrape_statoids_rows(doc, column_names, alpha2_key)

    if options.as_list:
        # list to hold combined country info
        country_info = []
    else:
        # dict to hold combined country info
        country_info = {}
        keyed_by = options.key

    # iterate through all the table_rows, merging iso.org names in
    for alpha2, info in table_rows.items():
        # ignore crap that was parsed from other tables on the page
        if alpha2 in ['Codes', 'Codes Codes', 'Codes Codes Codes']:
            continue
        cinfo = info
        # statoids may list entities that iso.org does not, so fall back
        # to an empty dict instead of raising KeyError
        cinfo.update(iso_names.get(alpha2, {}))
        # add combined dict to global (pun intended) data structure
        if options.as_list:
            country_info.append(cinfo)
        else:
            country_info[cinfo[keyed_by]] = cinfo

    # dump as json to the requested output file
    # (previously options.outfile was ignored — the path was hard-coded —
    # and the file handle was never closed)
    with open(options.outfile, mode='w') as f:
        stream = codecs.getwriter('utf8')(f)
        json.dump(country_info, stream, ensure_ascii=False, indent=2,
                  encoding='utf-8')
+
if __name__ == "__main__":
    # command-line interface: declare the three options documented in the
    # README, then hand the parsed options straight to fetch_and_write()
    option_parser = OptionParser()
    option_parser.add_option("-o", "--output",
                             dest="outfile",
                             metavar="OUTFILE",
                             default="countries-of-earth.json",
                             help="write data to OUTFILE")
    option_parser.add_option("-l", "--list",
                             dest="as_list",
                             action="store_true",
                             default=False,
                             help="export objects as a list of objects")
    option_parser.add_option("-k", "--key",
                             dest="key",
                             metavar="KEY",
                             default="ISO3166-1-Alpha-2",
                             help="export objects as a dict of objects keyed by KEY")

    options, _args = option_parser.parse_args()

    fetch_and_write(options)
@@ -0,0 +1 @@
+lxml==2.3

0 comments on commit 6923549

Please sign in to comment.