Skip to content
This repository has been archived by the owner on Feb 4, 2020. It is now read-only.

normalize bad (i.e., non-ASCII) subfield codes #135

Merged
merged 2 commits into from Jul 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions pymarc/exceptions.py
Expand Up @@ -38,3 +38,6 @@ def __str__(self):
class FieldNotFound(PymarcException):
    """Exception whose text says the record lacks the specified field."""

    def __str__(self):
        # Fixed message; no per-instance details are interpolated.
        message = "Record does not contain the specified field"
        return message

class BadSubfieldCodeWarning(Warning):
    """Warning category used when a subfield code is not ASCII."""

    pass
24 changes: 21 additions & 3 deletions pymarc/record.py
@@ -1,12 +1,14 @@
import re
import six
import logging
import unicodedata
import warnings

from six import Iterator

from pymarc.exceptions import BaseAddressInvalid, RecordLeaderInvalid, \
BaseAddressNotFound, RecordDirectoryInvalid, NoFieldsFound, \
FieldNotFound
FieldNotFound, BadSubfieldCodeWarning
from pymarc.constants import LEADER_LEN, DIRECTORY_ENTRY_LEN, END_OF_RECORD
from pymarc.field import Field, SUBFIELD_INDICATOR, END_OF_FIELD, \
map_marc8_field, RawField
Expand Down Expand Up @@ -300,10 +302,15 @@ def decode_marc(self, marc, to_unicode=True, force_utf8=False,
second_indicator = subs[0][1]

for subfield in subs[1:]:
skip_bytes = 1
if len(subfield) == 0:
continue
code = subfield[0:1].decode('ascii')
data = subfield[1:]
try:
code = subfield[0:1].decode('ascii')
except UnicodeDecodeError:
warnings.warn(BadSubfieldCodeWarning())
code, skip_bytes = normalize_subfield_code(subfield)
data = subfield[skip_bytes:]

if to_unicode:
if self.leader[9] == 'a' or force_utf8:
Expand Down Expand Up @@ -551,3 +558,14 @@ def map_marc8_record(r):
l[9] = 'a' # see http://www.loc.gov/marc/specifications/speccharucs.html
r.leader = "".join(l)
return r

def normalize_subfield_code(subfield):
    """Derive a subfield code from a subfield that starts with a non-ASCII byte.

    The subfield is decoded as UTF-8 when the whole byte string is valid
    UTF-8, and as Latin-1 otherwise.  Only the first decoded character (the
    code) is NFKD-decomposed and stripped to ASCII, so ``á`` becomes ``a``.

    If the code character has no ASCII equivalent (for example ``€``), the
    decoded character itself is returned as the code.  This avoids raising
    ``IndexError`` on an empty normalization result, and avoids borrowing
    the first ASCII character of the subfield *data* as the code.

    :param subfield: raw bytes of one subfield (code followed by data)
    :returns: ``(code, skip_bytes)`` where ``code`` is a one-character text
        string and ``skip_bytes`` is the number of leading bytes of
        ``subfield`` occupied by the original code
    :raises IndexError: if ``subfield`` is empty
    """
    try:
        code_char = subfield.decode('utf-8')[0]
        # A UTF-8 code may span several bytes; the caller must skip all of
        # them to reach the start of the data.
        skip_bytes = len(code_char.encode('utf-8'))
    except UnicodeDecodeError:
        # Not valid UTF-8: treat every byte as one Latin-1 character.
        code_char = subfield.decode('latin-1')[0]
        skip_bytes = 1

    # Normalize the code character only, never the data that follows it.
    decomposed = unicodedata.normalize('NFKD', code_char)
    ascii_code = decomposed.encode('ascii', 'ignore').decode('ascii')
    if not ascii_code:
        # No ASCII equivalent: keep the original character as the code.
        return code_char, skip_bytes
    return ascii_code[0], skip_bytes
1 change: 1 addition & 0 deletions test/bad_subfield_code.dat
@@ -0,0 +1 @@
00755cam 22002414a 4500001001300000003000600013005001700019008004100036010001700077020004300094040001800137042000800155050002600163082001700189100003100206245005400237260004200291300007200333500003300405650003700438630002500475630001300500fol05731351 IMchF20000613133448.0000107s2000 nyua 001 0 eng  a 00020737  a0471383147 (paper/cd-rom : alk. paper) aDLCcDLCdDLC apcc00aQA76.73.P22bM33 200000a005.13/32211 aMartinsson, Tobias,d1976-10áActivePerl with ASP and ADO /cTobias Martinsson. aNew York :bJohn Wiley & Sons,c2000. axxi, 289 p. :bill. ;c23 cm. +e1 computer laser disc (4 3/4 in.) a"Wiley Computer Publishing." 0aPerl (Computer program language)00aActive server pages.00aActiveX.
6 changes: 6 additions & 0 deletions test/reader.py
Expand Up @@ -61,6 +61,12 @@ def disabled_test_codecs(self):
record = next(reader)
self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')

def test_bad_subfield(self):
    """Read test/bad_subfield_code.dat and check the 245 $a value."""
    expected_title = u'ActivePerl with ASP and ADO /'
    with open('test/bad_subfield_code.dat', 'rb') as marc_file:
        first_record = next(pymarc.MARCReader(marc_file))
        title_field = first_record['245']
        self.assertEqual(title_field['a'], expected_title)

def test_bad_indicator(self):
with open('test/bad_indicator.dat', 'rb') as fh:
reader = pymarc.MARCReader(fh)
Expand Down