Permalink
Browse files

convert IndexErrors during MARC8 decoding

If a field contains an invalid multi-byte encoding, translate() raises
an IndexError that propagates to the caller, who most likely expects to
catch only a UnicodeDecodeError. Convert the IndexError into a UnicodeDecodeError.
  • Loading branch information...
1 parent 04427a9 commit 2928b786306854cb299a4a2f930fa6e21695c3ae @godmar godmar committed Mar 22, 2012
Showing with 14 additions and 1 deletion.
  1. +5 −1 pymarc/marc8.py
  2. +1 −0 test/bad_eacc_encoding.dat
  3. +8 −0 test/marc8.py
View
@@ -16,7 +16,11 @@ def marc8_to_unicode(marc8, hide_utf8_warnings=False):
# XXX: might be good to stash away a converter somehow
# instead of always re-creating it
converter = MARC8ToUnicode(quiet=hide_utf8_warnings)
- return converter.translate(marc8)
+ try:
+ return converter.translate(marc8)
+ except IndexError, ie:
+ # convert IndexError into UnicodeDecodeErrors
+ raise UnicodeDecodeError("marc8_to_unicode", marc8, 0, len(marc8), "invalid multibyte character encoding")
class MARC8ToUnicode:
@@ -0,0 +1 @@
+01632nam 2200421Ia 4500001001300000003000600013005001700019006001900036007001500055008004100070035001800111040001800129043001200147049000900159066000700168086002200175099002500197100002000222240006400242245006500306260014400371300004800515500005900563546002200622650003000644650003900674650003700713650003800750650002700788651004400815710008300859710002900942856006200971880007801033907003501111998004301146910002101189ocn503001208OCoLC20100201124721.0m d f cr cn|||||||||100201s2004 vaua s f000 0 jpn d a(GPO)99290207 aGPOcGPOdMvI an-us--- aVPII c$10 aS 20.2:AM 3/JAPN. aDocs S20.2:AM3/JAPN.1 aArnold, Paul A.10aAbout America: how the United States is governed.lJapanese106880-01aBeikoku no t�ochi no shikumih[electronic resource]. aHerndon, Va. :bBraddock Communications ;a[Washington, D.C.] :bU.S. Dept. of State, Bureau of International Information Programs,cc2004. a1 online resource (36, [1]) p. :bcol. ill. aTitle from caption (America.gov, viewed Feb. 1, 2010). aText in Japanese. 0aDemocracyzUnited States. 0aFederal governmentzUnited States. 0aLocal governmentzUnited States. 0aState governmentszUnited States. 0aVotingzUnited States. 0aUnited StatesxPolitics and government.1 aUnited States.bDept. of State.bBureau of International Information Programs.2 aBraddock Communications.40uhttp://purl.access.gpo.gov/GPO/LPS119031zOnline resource006245-01/$1a$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}(Bh[electronic resource]. a.b29202231b09-02-11c11-08-10 awwwb11-08-10cmdze-fjpngvauh0i0 asthum 11/29/2010
View
@@ -23,6 +23,14 @@ def test_marc8_reader_to_unicode(self):
self.assertEquals(type(utitle), unicode)
self.assertEquals(utitle, u'De la solitude \xe0 la communaut\xe9.')
+ def test_marc8_reader_to_unicode_bad_eacc_sequence(self):
+ reader = MARCReader(file('test/bad_eacc_encoding.dat'), to_unicode=True, hide_utf8_warnings=True)
+ try:
+ r = reader.next()
+ self.assertFalse("Was able to decode invalid MARC8")
+ except UnicodeDecodeError:
+ self.assertTrue("Caught UnicodeDecodeError as expected")
+
def test_marc8_reader_to_unicode_bad_escape(self):
reader = MARCReader(file('test/bad_marc8_escape.dat'), to_unicode=True)
r = reader.next()

0 comments on commit 2928b78

Please sign in to comment.