Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Convert IndexErrors raised during MARC8 decoding into UnicodeDecodeErrors

If a field contains an invalid multi-byte encoding, MARC8ToUnicode.translate
would raise an IndexError that would propagate to the caller of marc8_to_unicode.
The caller most likely expected to catch only a UnicodeDecodeError.
  • Loading branch information...
commit 2928b786306854cb299a4a2f930fa6e21695c3ae 1 parent 04427a9
godmar godmar authored
Showing with 14 additions and 1 deletion.
  1. +5 −1 pymarc/marc8.py
  2. +1 −0  test/bad_eacc_encoding.dat
  3. +8 −0 test/marc8.py
6 pymarc/marc8.py
View
@@ -16,7 +16,11 @@ def marc8_to_unicode(marc8, hide_utf8_warnings=False):
# XXX: might be good to stash away a converter somehow
# instead of always re-creating it
converter = MARC8ToUnicode(quiet=hide_utf8_warnings)
- return converter.translate(marc8)
+ try:
+ return converter.translate(marc8)
+ except IndexError, ie:
+ # convert IndexError into UnicodeDecodeErrors
+ raise UnicodeDecodeError("marc8_to_unicode", marc8, 0, len(marc8), "invalid multibyte character encoding")
class MARC8ToUnicode:
1  test/bad_eacc_encoding.dat
View
@@ -0,0 +1 @@
+01632nam 2200421Ia 4500001001300000003000600013005001700019006001900036007001500055008004100070035001800111040001800129043001200147049000900159066000700168086002200175099002500197100002000222240006400242245006500306260014400371300004800515500005900563546002200622650003000644650003900674650003700713650003800750650002700788651004400815710008300859710002900942856006200971880007801033907003501111998004301146910002101189ocn503001208OCoLC20100201124721.0m d f cr cn|||||||||100201s2004 vaua s f000 0 jpn d a(GPO)99290207 aGPOcGPOdMvI an-us--- aVPII c$10 aS 20.2:AM 3/JAPN. aDocs S20.2:AM3/JAPN.1 aArnold, Paul A.10aAbout America: how the United States is governed.lJapanese106880-01aBeikoku no t�ochi no shikumih[electronic resource]. aHerndon, Va. :bBraddock Communications ;a[Washington, D.C.] :bU.S. Dept. of State, Bureau of International Information Programs,cc2004. a1 online resource (36, [1]) p. :bcol. ill. aTitle from caption (America.gov, viewed Feb. 1, 2010). aText in Japanese. 0aDemocracyzUnited States. 0aFederal governmentzUnited States. 0aLocal governmentzUnited States. 0aState governmentszUnited States. 0aVotingzUnited States. 0aUnited StatesxPolitics and government.1 aUnited States.bDept. of State.bBureau of International Information Programs.2 aBraddock Communications.40uhttp://purl.access.gpo.gov/GPO/LPS119031zOnline resource006245-01/$1a$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}(Bh[electronic resource]. a.b29202231b09-02-11c11-08-10 awwwb11-08-10cmdze-fjpngvauh0i0 asthum 11/29/2010
8 test/marc8.py
View
@@ -23,6 +23,14 @@ def test_marc8_reader_to_unicode(self):
self.assertEquals(type(utitle), unicode)
self.assertEquals(utitle, u'De la solitude \xe0 la communaut\xe9.')
+ def test_marc8_reader_to_unicode_bad_eacc_sequence(self):
+ reader = MARCReader(file('test/bad_eacc_encoding.dat'), to_unicode=True, hide_utf8_warnings=True)
+ try:
+ r = reader.next()
+ self.assertFalse("Was able to decode invalid MARC8")
+ except UnicodeDecodeError:
+ self.assertTrue("Caught UnicodeDecodeError as expected")
+
def test_marc8_reader_to_unicode_bad_escape(self):
reader = MARCReader(file('test/bad_marc8_escape.dat'), to_unicode=True)
r = reader.next()
Please sign in to comment.
Something went wrong with that request. Please try again.