edsu · edsu · Jul 8, 2019 · Jul 3, 2019 · Jul 3, 2019
diff --git a/pymarc/exceptions.py b/pymarc/exceptions.py
@@ -38,3 +38,6 @@ def __str__(self):
 class FieldNotFound(PymarcException):
     def __str__(self):
         return "Record does not contain the specified field"
+
+class BadSubfieldCodeWarning(Warning):
+    """Warning issued when a subfield code cannot be decoded as ASCII."""
diff --git a/pymarc/record.py b/pymarc/record.py
@@ -1,12 +1,14 @@
 import re
 import six
 import logging
+import unicodedata
+import warnings
 
 from six import Iterator
 
 from pymarc.exceptions import BaseAddressInvalid, RecordLeaderInvalid, \
         BaseAddressNotFound, RecordDirectoryInvalid, NoFieldsFound, \
-        FieldNotFound
+        FieldNotFound, BadSubfieldCodeWarning
 from pymarc.constants import LEADER_LEN, DIRECTORY_ENTRY_LEN, END_OF_RECORD
 from pymarc.field import Field, SUBFIELD_INDICATOR, END_OF_FIELD, \
         map_marc8_field, RawField
@@ -300,10 +302,15 @@ def decode_marc(self, marc, to_unicode=True, force_utf8=False,
                     second_indicator = subs[0][1]
 
                 for subfield in subs[1:]:
+                    skip_bytes = 1
                     if len(subfield) == 0:
                         continue
-                    code = subfield[0:1].decode('ascii')
-                    data = subfield[1:]
+                    try:
+                        code = subfield[0:1].decode('ascii')
+                    except UnicodeDecodeError:
+                        warnings.warn(BadSubfieldCodeWarning())
+                        code, skip_bytes = normalize_subfield_code(subfield)
+                    data = subfield[skip_bytes:]
 
                     if to_unicode:
                         if self.leader[9] == 'a' or force_utf8:
@@ -551,3 +558,34 @@ def map_marc8_record(r):
     l[9] = 'a' # see http://www.loc.gov/marc/specifications/speccharucs.html
     r.leader = "".join(l)
     return r
+
+def normalize_subfield_code(subfield):
+    """Map a non-ASCII subfield code to an ASCII one.
+
+    Only the leading character of ``subfield`` is examined: it is read as
+    UTF-8 when the first bytes form a valid sequence and as a single
+    latin-1 byte otherwise, then folded to ASCII via NFKD decomposition.
+
+    :param subfield: raw bytes of one subfield, code first, then data
+    :return: ``(code, skip_bytes)`` -- the ASCII code and the number of
+        leading bytes the original code occupied, so the data is
+        ``subfield[skip_bytes:]``; ``code`` is a blank when the character
+        has no ASCII equivalent
+    """
+    # A UTF-8 character is at most 4 bytes long, and the shortest prefix
+    # that decodes cleanly is exactly the leading character, so bytes in
+    # the data can neither break the decode nor be mistaken for the code.
+    for skip_bytes in range(1, 5):
+        try:
+            char = subfield[:skip_bytes].decode('utf-8')
+        except UnicodeDecodeError:
+            continue
+        break
+    else:
+        skip_bytes = 1
+        char = subfield[:1].decode('latin-1')
+    decomposed = unicodedata.normalize('NFKD', char)
+    folded = decomposed.encode('ascii', 'ignore').decode('ascii')
+    # Nothing ASCII left (e.g. a currency sign): use a blank placeholder
+    # rather than raising IndexError or borrowing a byte from the data.
+    return (folded[0] if folded else u' '), skip_bytes
diff --git a/test/bad_subfield_code.dat b/test/bad_subfield_code.dat
@@ -0,0 +1 @@
+00755cam  22002414a 4500001001300000003000600013005001700019008004100036010001700077020004300094040001800137042000800155050002600163082001700189100003100206245005400237260004200291300007200333500003300405650003700438630002500475630001300500fol05731351 IMchF20000613133448.0000107s2000    nyua          001 0 eng    a   00020737   a0471383147 (paper/cd-rom : alk. paper)  aDLCcDLCdDLC  apcc00aQA76.73.P22bM33 200000a005.13/32211 aMartinsson, Tobias,d1976-10áActivePerl with ASP and ADO /cTobias Martinsson.  aNew York :bJohn Wiley & Sons,c2000.  axxi, 289 p. :bill. ;c23 cm. +e1 computer  laser disc (4 3/4 in.)  a"Wiley Computer Publishing." 0aPerl (Computer program language)00aActive server pages.00aActiveX.
diff --git a/test/reader.py b/test/reader.py
@@ -61,6 +61,12 @@ def disabled_test_codecs(self):
             record = next(reader)
             self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
 
+    def test_bad_subfield(self):
+        # fixture's 245 has a non-ASCII character where the $a code belongs
+        with open('test/bad_subfield_code.dat', 'rb') as handle:
+            title = next(pymarc.MARCReader(handle))['245']['a']
+        self.assertEqual(title, u'ActivePerl with ASP and ADO /')
+
     def test_bad_indicator(self):
         with open('test/bad_indicator.dat', 'rb') as fh:
             reader = pymarc.MARCReader(fh)