Skip to content

Commit

Permalink
Merge remote-tracking branch 'internetarchive/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
deborahgu committed Sep 20, 2018
2 parents cc5a6a3 + 26fe7ea commit e203411
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
15 changes: 12 additions & 3 deletions abbyy_to_epub3/parse_abbyy.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,23 @@ def parse_metadata(self):
self.metadata[term.tag] = [term.text, ]

# if the language isn't explicitly set, assume English
# convert to the correct ISO standard
# if the language code is invalid, assume English
# language might be ISO 639-6, ISO 639-2/T, ISO 639-2/B, or ISO 639-1
# (in pycountry, called: name, alpha_3, bibliographic, and alpha_2)
if 'language' not in self.metadata:
self.metadata['language'] = ['en']
else:
lang_code = self.metadata['language'][0]
if len(lang_code) == 3:
lang = pycountry.languages.get(alpha_3=lang_code)
try:
lang = pycountry.languages.lookup(lang_code)
self.metadata['language'][0] = lang.alpha_2
except LookupError:
self.logger.debug(
"Invalid language code {}. Setting to English".format(
lang_code
)
)
self.metadata['language'][0] = 'en'

def parse_abbyy(self):
"""
Expand Down
44 changes: 44 additions & 0 deletions abbyy_to_epub3/tests/test_parse_abbyy.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,47 @@ def test_sanitize(self):
result = sanitize_xml(text)

assert result == good

def test_parse_iso639_1(self, finereader10):
    """
    An ISO 639-1 (alpha-2) language entry is expected to survive
    parse_metadata() unchanged: 'wo' in, 'wo' out.
    """
    abbyy_parser = finereader10
    self.metadata['language'] = ['wo']
    abbyy_parser.parse_metadata()

    # Only the first entry of the language list is checked.
    parsed_code = self.metadata['language'][0]
    assert parsed_code == 'wo'

def test_parse_iso639_2T(self, finereader10):
    """
    An ISO 639-2/T (alpha-3 terminological) language entry, 'deu', is
    expected to come back from parse_metadata() as the alpha-2 code 'de'.
    """
    abbyy_parser = finereader10
    self.metadata['language'] = ['deu']
    abbyy_parser.parse_metadata()

    # Only the first entry of the language list is checked.
    parsed_code = self.metadata['language'][0]
    assert parsed_code == 'de'

def test_parse_iso639_2B(self, finereader10):
    """
    An ISO 639-2/B (alpha-3 bibliographic) language entry, 'ger', is
    expected to come back from parse_metadata() as the alpha-2 code 'de'.
    """
    abbyy_parser = finereader10
    self.metadata['language'] = ['ger']
    abbyy_parser.parse_metadata()

    # Only the first entry of the language list is checked.
    parsed_code = self.metadata['language'][0]
    assert parsed_code == 'de'

def test_parse_iso639_6(self, finereader10):
    """
    A language entry given as an English name, 'Cree', is expected to
    come back from parse_metadata() as the alpha-2 code 'cr'.
    """
    abbyy_parser = finereader10
    self.metadata['language'] = ['Cree']
    abbyy_parser.parse_metadata()

    # Only the first entry of the language list is checked.
    parsed_code = self.metadata['language'][0]
    assert parsed_code == 'cr'

def test_parse_bad_lang(self, finereader10):
    """
    A bogus language entry, 'Rikchik', is expected to be replaced by
    the English code 'en' after parse_metadata().
    """
    abbyy_parser = finereader10
    self.metadata['language'] = ['Rikchik']
    abbyy_parser.parse_metadata()

    # Only the first entry of the language list is checked.
    parsed_code = self.metadata['language'][0]
    assert parsed_code == 'en'

0 comments on commit e203411

Please sign in to comment.