Commit

Improve regex bib parsing somewhat, implement full text search crossref API lookup, add minimal bibtex parser
chbrown committed Dec 1, 2013
1 parent 9c30c2f commit c445cbe
Showing 13 changed files with 575 additions and 87 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -25,9 +25,13 @@ It's rough, and still needs a lot of work, but it's better than copy & pasting.
The output presumes that `natbib` and `amssymb` and friends are within reach.


## Dependencies:
## Development

    easy_install -U lxml
### TODO:

* Shrink whitespace left from the right edge (non-greedy)
* Handle styles in footnotes without breaking the footnote due to unstyled whitespace
* Read tabs that are surrounded by text as (at least) single spaces.


## License
6 changes: 5 additions & 1 deletion setup.py
@@ -13,7 +13,11 @@
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        'lxml'
        'lxml',
        'requests',
        'requests-cache',
        'unidecode',
        'viz',
    ],
    entry_points={
        'console_scripts': [
149 changes: 106 additions & 43 deletions xdoc/bibliography.py
@@ -1,67 +1,78 @@
import re
import requests
from xdoc.dom import Author, Reference
from xdoc.lib.regex import named, maybe, anything, some, s, sep, end
from xdoc.formats.tex.bibliography import parse_bibtex
from unidecode import unidecode

import logging
from xdoc.lib.log import logging
logger = logging.getLogger(__name__)


def named(name, pattern):
    return '\s*(?P<%s>%s)' % (name, pattern)

anything = r'.*?'
some = r'.+?'
s = r'\s*'
sep = r'\.'
year = '\d{4}'

# \((\d{4}\w?,?)+\)/
# re_authors = r'(?P<authors>.+?)\s*'
# re_authors_editors = r'(?P<authors>.+?)\s*(?P<editor>\(ed(itor)?s\.?\)\s+)?\s*'
re_editors = r'(?P<editor>.+?)\s*\(ed(itor)?s?\.?\)\s*'
# re_year = r'\((?P<year>\)\s*'
re_year = named('year', '\d{4}') + named('subyear', r'\w?')
re_title = r'(?P<title>[^.]+)\.\s*'
re_title_i = r'(?P<title>.+?)[.,]?\s*'
# re_journal = r'(?P<journal>.+?)\.?\s*'
re_page = r'(?P<page_begin>\d+)-(?P<page_end>\d+)'
re_vol = r'((Volume)?\s*(?P<volume>\d+(\.\d+)?):?\s*' + re_page + ')?'
# \u2013 is the en-dash
re_page = ur'(?P<page_begin>\d+)(-|--|\u2013)(?P<page_end>\d+)'
# :?\s*' + re_page + '
re_vol = r'(Volume\s+)?(?P<volume>\d+(\.\d+)?)'
re_edition = r'\((?P<edition>\d+)\)'
re_pub_address = r'(?P<publisher>[^,]+)([.,]|, (?P<address>.*[^.])[.,]?)\s*'
re_doi = r'(http://dx.doi.org/(?P<doi>\S+))?'
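(Aside: the helpers now imported from `xdoc.lib.regex` are not shown in this commit. Judging from the removed inline definitions above, and from the fact that the new patterns prepend `s` explicitly, they are presumably close to the following sketch; `maybe`, `space`, and `end` are assumptions inferred from usage, and the reference examples below suggest the new `sep` may be more permissive than a bare period:)

    def named(name, pattern):
        # like the removed helper above, but without the leading \s*
        # (callers now prepend `s` themselves)
        return r'(?P<%s>%s)' % (name, pattern)

    def maybe(pattern):
        # assumed: wrap a pattern in an optional non-capturing group
        return r'(?:%s)?' % pattern

    anything = r'.*?'  # lazy: zero or more characters
    some = r'.+?'      # lazy: one or more characters
    s = r'\s*'         # optional whitespace
    space = r'\s+'     # required whitespace (imported by xdoc/dom.py below)
    sep = r'\.'        # field separator, as in the removed code
    end = r'\s*$'      # assumed: anchor at end of string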


media_regex = [{
    # Horn, Larry. 1972. On the semantic properties of logical operators in English: UCLA dissertation.
    'medium': 'phdthesis',
    'pattern':
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + r'[.:]' + s +
        named('school', some) + s + 'dissertation' + sep +
        end
}, {
    # Berman, Steve (1991) \emph{On the Semantics and Logical Form of Wh-Clauses}. Ph.D. dissertation, University of Massachusetts at Amherst.
    'medium': 'phdthesis',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        anything + 'dissertation,?' +
        named('school', some)
        s + named('school', some) + sep +
        end
}, {
    # von Fintel, Kai (1995) A minimal theory of adverbial quantification. Ms., MIT, Cambridge, MA.
    'medium': 'unpublished',
    'defaults': dict(note=u'Manuscript'),
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        '(ms|manuscript|unpublished),' + re_pub_address,
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        '(ms|manuscript|unpublished),' + re_pub_address + sep +
        end
}, {
    # Beckman, Mary E., and Janet Pierrehumbert (1986) Intonational structure in Japanese and English. \emph{Phonology Yearbook} 3:15-70.
    'medium': 'article',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        named('journal', some) + re_vol

        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        s + named('journal', some) + s + re_vol + maybe(s + re_edition) + sep + s + re_page + sep +
        s + re_doi +
        end
}, {
    # Bratman, Michael E. (1987) \emph{Intentions, Plans, and Practical Reason}. Harvard University Press, Cambridge, MA.
    'medium': 'book',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        re_pub_address
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        s + named('publisher', some) + ':' + s + named('address', some) + sep +
        end
}, {
    # B\"uring, Daniel (1994) Topic. In Bosch & van der Sandt (1994), Volume 2: 271-280.
    'medium': '...?',
@@ -71,25 +82,26 @@ def named(name, pattern):
    # Roberts, Craige (1995b) Anaphora in intensional contexts. In Shalom Lappin (ed.) \emph{Handbook of Semantics}. Blackwell, London.
    'medium': 'inbook',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        'In ' +
        named('editors', some) + sep +
        named('booktitle', some) + sep +
        s + named('editors', some) + sep +
        s + named('booktitle', some) + sep +
        re_pub_address + '(' + re_page + ')?',
}, {
    'medium': 'article',
    'defaults': dict(note=u'FIXME'),
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        maybe(s + named('title', some) + sep) +
        anything +
        named('title', some)
        maybe(s + re_doi)
}]
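
(Two of the building blocks above, exercised directly; these demos depend only on re_year and re_page as defined in this diff, given Python 2 unicode literals:)

    import re
    re.match(re_year, u'1995b').groupdict()
    # -> {'year': u'1995', 'subyear': u'b'}
    re.match(re_page, u'15\u201370').groupdict()
    # -> {'page_begin': u'15', 'page_end': u'70'}  (en-dash accepted alongside - and --)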


def parse_string(text):
def regex_parse_string(text):
    # given a raw (unformatted) string that contains some number of bibliographic-like entries
    # parse the fields in each one (and the medium based on the regex that matches)
    # yields Reference objects
@@ -101,10 +113,14 @@ def parse_string(text):
        logger.debug('Using medium_regex "%s" to parse reference "%s"', medium_regex, text)
        groups = match.groupdict()

        authors_strings = groups.pop('authors').split(r',?\s*?(?:\band\b|&)')
        authors_strings = re.split(r'\s*(?:\band\b|&)\s*', groups.pop('authors'))
        authors = [Author.from_string(author) for author in authors_strings]

        last_names = '-'.join(author.last_name.lower() for author in authors)
        last_names = ' '.join(author.last_name.lower() for author in authors)
        last_names = unidecode(last_names).replace(' ', '-')
        name = '%s:%s' % (last_names, groups['year'])
        if 'subyear' in groups:
            name += groups.pop('subyear')

        attrs = medium_regex.get('defaults', {}).copy()
        attrs['author'] = ' and '.join(map(unicode, authors))
@@ -118,4 +134,51 @@ def parse_string(text):
            if value:
                attrs[field] = value

        yield Reference('%s:%s' % (last_names, groups['year']), medium_regex['medium'], **attrs)
        yield Reference(name, medium_regex['medium'], **attrs)
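
(The citation-key construction above, illustrated; unidecode's job is to ASCII-fold accented last names before they go into the key:)

    from unidecode import unidecode
    last_names = ' '.join(name.lower() for name in [u'B\xfcring'])
    '%s:%s' % (unidecode(last_names).replace(' ', '-'), '1994')
    # -> 'buring:1994'
    # and with two authors: 'beckman-pierrehumbert:1986'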


def crossref_lookup(text, minimum_score=0.01):
    import requests_cache
    requests_cache.install_cache('/tmp/crossref_cache')
    # requests_cache.install_cache('/tmp/crossref_cache', backend='redis')
    # See the Crossref labs blog at this link for the general approach:
    # http://labs.crossref.org/resolving-citations-we-dont-need-no-stinkin-parser/
    # and the api help for documentation:
    # http://search.labs.crossref.org/help/api

    dois_response = requests.get('http://search.labs.crossref.org/dois', params={
        'q': text,
        'rows': 1,  # results per page
        'page': 1,
        'sort': 'score',
    })
    search_results = dois_response.json()
    for search_result in search_results[:1]:
        # each search_result looks like this:
        # {
        #   "coins": "ctx_ver=Z39.88-2004&amp;rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.3765%2Fsp.6.2&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;rft.atitle=Strategic+conversation&amp;rft.jtitle=Semantics+and+Pragmatics&amp;rft.date=2013&amp;rft.volume=6&amp;rft.aufirst=Nicholas&amp;rft.aulast=Asher&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.au=Nicholas+Asher&amp;rft.au=+Alex+Lascarides",
        #   "doi": "http://dx.doi.org/10.3765/sp.6.2",
        #   "fullCitation": "Nicholas Asher, Alex Lascarides, 2013, 'Strategic conversation', <i>Semantics and Pragmatics</i>, vol. 6",
        #   "normalizedScore": 100,
        #   "score": 5.295812,
        #   "title": "Strategic conversation",
        #   "year": "2013"
        # },
        if search_result['score'] > minimum_score:
            logger.info('Using top result; score = %.2f.', search_result['score'])
            doi_url = search_result['doi']
            # See http://www.crosscite.org/cn/ for details on content negotiation here
            bibtex_response = requests.get(doi_url, headers={'Accept': 'application/x-bibtex'})
            # for some reason, chardet thinks it's iso-8859-2 = latin2 (european), but just decode with utf-8
            bibtex_response.encoding = 'UTF-8'

            # the returned bibtex looks like this (but without line breaks)
            # @article{Asher_Lascarides_2013, title={Strategic conversation},
            #   volume={6}, url={http://dx.doi.org/10.3765/sp.6.2},
            #   DOI={10.3765/sp.6.2}, journal={Semantics and Pragmatics},
            #   publisher={Semantics and Pragmatics}, year={2013}, month={Aug},
            #   author={Asher, Nicholas and Lascarides, Alex}}
            for reference in parse_bibtex(bibtex_response.text):
                yield reference
        else:
            logger.warn('No result has a score > %.2f', minimum_score)
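
(A minimal sketch of driving crossref_lookup end to end; serialize_reference is the TeX-side serializer imported by cli.py below, and its exact output format is not shown in this diff:)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    for reference in crossref_lookup(u'Asher & Lascarides 2013 Strategic conversation'):
        # each yielded Reference was parsed out of the content-negotiated BibTeX
        print serialize_reference(reference)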
85 changes: 58 additions & 27 deletions xdoc/cli.py
@@ -1,37 +1,13 @@
import sys
import os
import argparse
import logging
logging.addLevelName(5, 'SILLY')

from xdoc.lib.log import logging


class Logger(logging.Logger):
    def silly(self, msg, *args, **kwargs):
        level = logging.getLevelName('SILLY')
        if self.isEnabledFor(level):
            self._log(level, msg, args, **kwargs)

    def notset(self, msg, *args, **kwargs):
        level = logging.getLevelName('NOTSET')
        if self.isEnabledFor(level):
            self._log(level, msg, args, **kwargs)


def main():
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='Input filename')
    parser.add_argument('output', help='Output filename')
    parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output')
def translate(parser):
    opts = parser.parse_args()

    # max(map(len, [logging.getLevelName(level) for level in range(0, 60, 10)])) == 8
    level = logging.DEBUG if opts.verbose else logging.INFO
    logging.basicConfig(format='%(levelname)-8s %(asctime)14s (%(name)s): %(message)s', level=level)
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
    logger.info('Logging with level >= %s (%s)', logging.root.level, logging.getLevelName(logging.root.level))

    # read input
    input_root, input_extension = os.path.splitext(opts.input)
@@ -58,4 +34,59 @@ def main():
    else:
        raise NotImplementedError('File extension "%s" not supported as output' % output_extension)


def parsebib(parser):
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    input = sys.stdin if (opts.input == '-') else open(opts.input)
    output = sys.stdout if (opts.output == '-') else open(opts.output, 'w')

    for line in input:
        line = line.strip().decode('utf8')
        logger.info('Resolving "%s" via CrossRef API', line)

        for bibitem in crossref_lookup(line):
            print >> output, serialize_reference(bibitem)
            break
        else:
            logger.error('FIXME: could not parse bib item: %s', line)

    # import unicodedata
    # crf = train_crf()
    # if line:
    #     strip tex if needed (only simple commands; if there are environments, too bad)
    #     line = tex_command.sub(line, 'a\1z')
    #     line = re.sub(r'\\[a-z]+\{([^\}]+)\}', r'\1', line)

    #     tokens = unidecode(line).split()
    #     features = [list(token_features(token)) for token in tokens]
    #     labels = crf.predict(features)
    #     pairs = zip(tokens, labels)
    #     print gloss.gloss(pairs, prefixes=('', Fore.GREEN), postfixes=(Fore.RESET, Fore.RESET), groupsep='\n')

actions = dict(translate=translate, parsebib=parsebib)


def main():
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('input', help='input filename')
    # parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('output', help='output filename')
    parser.add_argument('-a', '--action', choices=actions, default='translate', help='xdoc action')
    parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output')
    opts = parser.parse_args()

    logging.root.setLevel(logging.DEBUG if opts.verbose else logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Logging with level >= %s (%s)', logging.root.level, logging.getLevelName(logging.root.level))

    actions[opts.action](parser)

    logger.debug('Done')
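
(With the console_scripts entry point elided above, the installed command name is an assumption, but the dispatch here means something like `xdoc original.docx converted.tex` runs the default translate action, while `xdoc --action parsebib citations.txt -` resolves one free-form citation per line via the CrossRef API, with `-` selecting stdin/stdout in parsebib.)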
28 changes: 19 additions & 9 deletions xdoc/dom.py
@@ -3,14 +3,15 @@
import copy

from xdoc.lib.base import object_ustr
from xdoc.lib.regex import named, maybe, anything, some, s, space, sep, end


class Document(object_ustr):
    '''
    The D in DOM
    `metadata` is for things like author, title, date, etc.,
    that might be printed on each page (in headers/footers)
    that might appear on each page (in headers/footers)
    `spans` are the meat of the document
    `bibliography` is a list of References
@@ -111,14 +112,23 @@ def __init__(self, first_name, middle_name, last_name):

    @classmethod
    def from_string(cls, string):
        parts = re.split(r',\s*', string, maxsplit=2)
        # TODO: handle von-type last names
        if len(parts) == 3:
            return cls(parts[2], parts[1], parts[0])
        elif len(parts) == 2:
            return cls(parts[1], None, parts[0])
        else:
            raise Exception('One-part names are not yet handled: "%s"' % string)
        first = named('first', some)
        middle = named('middle', some)
        last = named('last', some)

        patterns = [
            last + ',\s+' + first + '\s+' + middle + end,  # Auden, Wystan Hugh
            last + ',\s+' + first + end,  # Auden, Wystan
            first + '\s+' + middle + '\s+' + last + end,  # Wystan Hugh Auden
            first + '\s+' + last + end,  # Wystan Auden
        ]
        for pattern in patterns:
            match = re.match(pattern, string)
            if match:
                groups = match.groupdict()
                return cls(groups['first'], groups.get('middle'), groups['last'])

        raise Exception('Cannot parse name: "%s"' % string)

    def __unicode__(self):
        return ' '.join(filter(None, [self.first_name, self.middle_name, self.last_name]))
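
(The four name patterns above, exercised; the results assume the regex helpers behave as sketched earlier, with patterns tried in order:)

    Author.from_string(u'Auden, Wystan Hugh')
    # -> first_name=u'Wystan', middle_name=u'Hugh', last_name=u'Auden'
    Author.from_string(u'Wystan Auden')
    # -> first_name=u'Wystan', middle_name=None, last_name=u'Auden'
    unicode(Author.from_string(u'Auden, Wystan'))
    # -> u'Wystan Auden'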
