Commit

Improve regex bib parsing somewhat, implement full text search crossref API lookup, add minimal bibtex parser
chbrown committed Dec 1, 2013
1 parent 9c30c2f commit c445cbe
Showing 13 changed files with 575 additions and 87 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -25,9 +25,13 @@ It's rough, and still needs a lot of work, but it's better than copy & pasting.
The output presumes that `natbib` and `amssymb` and friends are within reach.


## Dependencies:
## Development

    easy_install -U lxml
### TODO:

* Shrink whitespace left from the right edge (non-greedy)
* Handle styles in footnotes without breaking the footnote due to unstyled whitespace
* Read tabs that are surrounded by text as (at least) single spaces.


## License
6 changes: 5 additions & 1 deletion setup.py
@@ -13,7 +13,11 @@
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        'lxml'
        'lxml',
        'requests',
        'requests-cache',
        'unidecode',
        'viz',
    ],
    entry_points={
        'console_scripts': [
149 changes: 106 additions & 43 deletions xdoc/bibliography.py
@@ -1,67 +1,78 @@
import re
import requests
from xdoc.dom import Author, Reference
from xdoc.lib.regex import named, maybe, anything, some, s, sep, end
from xdoc.formats.tex.bibliography import parse_bibtex
from unidecode import unidecode

import logging
from xdoc.lib.log import logging
logger = logging.getLogger(__name__)


def named(name, pattern):
    return '\s*(?P<%s>%s)' % (name, pattern)

anything = r'.*?'
some = r'.+?'
s = r'\s*'
sep = r'\.'
year = '\d{4}'

# \((\d{4}\w?,?)+\)/
# re_authors = r'(?P<authors>.+?)\s*'
# re_authors_editors = r'(?P<authors>.+?)\s*(?P<editor>\(ed(itor)?s\.?\)\s+)?\s*'
re_editors = r'(?P<editor>.+?)\s*\(ed(itor)?s?\.?\)\s*'
# re_year = r'\((?P<year>\)\s*'
re_year = named('year', '\d{4}') + named('subyear', r'\w?')
re_title = r'(?P<title>[^.]+)\.\s*'
re_title_i = r'(?P<title>.+?)[.,]?\s*'
# re_journal = r'(?P<journal>.+?)\.?\s*'
re_page = r'(?P<page_begin>\d+)-(?P<page_end>\d+)'
re_vol = r'((Volume)?\s*(?P<volume>\d+(\.\d+)?):?\s*' + re_page + ')?'
# \u2013 is the en-dash
re_page = ur'(?P<page_begin>\d+)(-|--|\u2013)(?P<page_end>\d+)'
# :?\s*' + re_page + '
re_vol = r'(Volume\s+)?(?P<volume>\d+(\.\d+)?)'
re_edition = r'\((?P<edition>\d+)\)'
re_pub_address = r'(?P<publisher>[^,]+)([.,]|, (?P<address>.*[^.])[.,]?)\s*'
re_doi = r'(http://dx.doi.org/(?P<doi>\S+))?'
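(Aside: the helpers now imported from `xdoc.lib.regex` are not shown in this commit. Judging from the removed inline definitions above, and from the fact that the new patterns prepend `s` explicitly, they are presumably close to the following sketch; `maybe`, `space`, and `end` are assumptions inferred from usage, and the reference examples below suggest the new `sep` may be more permissive than a bare period:)

    def named(name, pattern):
        # like the removed helper above, but without the leading \s*
        # (callers now prepend `s` themselves)
        return r'(?P<%s>%s)' % (name, pattern)

    def maybe(pattern):
        # assumed: wrap a pattern in an optional non-capturing group
        return r'(?:%s)?' % pattern

    anything = r'.*?'  # lazy: zero or more characters
    some = r'.+?'      # lazy: one or more characters
    s = r'\s*'         # optional whitespace
    space = r'\s+'     # required whitespace (imported by xdoc/dom.py below)
    sep = r'\.'        # field separator, as in the removed code
    end = r'\s*$'      # assumed: anchor at end of string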


media_regex = [{
    # Horn, Larry. 1972. On the semantic properties of logical operators in English: UCLA dissertation.
    'medium': 'phdthesis',
    'pattern':
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + r'[.:]' + s +
        named('school', some) + s + 'dissertation' + sep +
        end
}, {
    # Berman, Steve (1991) \emph{On the Semantics and Logical Form of Wh-Clauses}. Ph.D. dissertation, University of Massachusetts at Amherst.
    'medium': 'phdthesis',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        anything + 'dissertation,?' +
        named('school', some)
        s + named('school', some) + sep +
        end
}, {
    # von Fintel, Kai (1995) A minimal theory of adverbial quantification. Ms., MIT, Cambridge, MA.
    'medium': 'unpublished',
    'defaults': dict(note=u'Manuscript'),
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        '(ms|manuscript|unpublished),' + re_pub_address,
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        '(ms|manuscript|unpublished),' + re_pub_address + sep +
        end
}, {
    # Beckman, Mary E., and Janet Pierrehumbert (1986) Intonational structure in Japanese and English. \emph{Phonology Yearbook} 3:15-70.
    'medium': 'article',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        named('journal', some) + re_vol

        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        s + named('journal', some) + s + re_vol + maybe(s + re_edition) + sep + s + re_page + sep +
        s + re_doi +
        end
}, {
    # Bratman, Michael E. (1987) \emph{Intentions, Plans, and Practical Reason}. Harvard University Press, Cambridge, MA.
    'medium': 'book',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        re_pub_address
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        s + named('publisher', some) + ':' + s + named('address', some) + sep +
        end
}, {
    # B\"uring, Daniel (1994) Topic. In Bosch & van der Sandt (1994), Volume 2: 271-280.
    'medium': '...?',
@@ -71,25 +82,26 @@ def named(name, pattern):
    # Roberts, Craige (1995b) Anaphora in intensional contexts. In Shalom Lappin (ed.) \emph{Handbook of Semantics}. Blackwell, London.
    'medium': 'inbook',
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        named('title', some) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        s + named('title', some) + sep +
        'In ' +
        named('editors', some) + sep +
        named('booktitle', some) + sep +
        s + named('editors', some) + sep +
        s + named('booktitle', some) + sep +
        re_pub_address + '(' + re_page + ')?',
}, {
    'medium': 'article',
    'defaults': dict(note=u'FIXME'),
    'pattern':
        named('authors', some) + sep +
        named('year', year) + sep +
        s + named('authors', some) + sep +
        s + re_year + sep +
        maybe(s + named('title', some) + sep) +
        anything +
        named('title', some)
        maybe(s + re_doi)
}]
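
(Two of the building blocks above, exercised directly; these demos depend only on re_year and re_page as defined in this diff, given Python 2 unicode literals:)

    import re
    re.match(re_year, u'1995b').groupdict()
    # -> {'year': u'1995', 'subyear': u'b'}
    re.match(re_page, u'15\u201370').groupdict()
    # -> {'page_begin': u'15', 'page_end': u'70'}  (en-dash accepted alongside - and --)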


def parse_string(text):
def regex_parse_string(text):
    # given a raw (unformatted) string that contains some number of bibliographic-like entries
    # parse the fields in each one (and the medium based on the regex that matches)
    # yields Reference objects
@@ -101,10 +113,14 @@ def parse_string(text):
        logger.debug('Using medium_regex "%s" to parse reference "%s"', medium_regex, text)
        groups = match.groupdict()

        authors_strings = groups.pop('authors').split(r',?\s*?(?:\band\b|&)')
        authors_strings = re.split(r'\s*(?:\band\b|&)\s*', groups.pop('authors'))
        authors = [Author.from_string(author) for author in authors_strings]

        last_names = '-'.join(author.last_name.lower() for author in authors)
        last_names = ' '.join(author.last_name.lower() for author in authors)
        last_names = unidecode(last_names).replace(' ', '-')
        name = '%s:%s' % (last_names, groups['year'])
        if 'subyear' in groups:
            name += groups.pop('subyear')

        attrs = medium_regex.get('defaults', {}).copy()
        attrs['author'] = ' and '.join(map(unicode, authors))
@@ -118,4 +134,51 @@ def parse_string(text):
            if value:
                attrs[field] = value

        yield Reference('%s:%s' % (last_names, groups['year']), medium_regex['medium'], **attrs)
        yield Reference(name, medium_regex['medium'], **attrs)
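
(The citation-key construction above, illustrated; unidecode's job is to ASCII-fold accented last names before they go into the key:)

    from unidecode import unidecode
    last_names = ' '.join(name.lower() for name in [u'B\xfcring'])
    '%s:%s' % (unidecode(last_names).replace(' ', '-'), '1994')
    # -> 'buring:1994'
    # and with two authors: 'beckman-pierrehumbert:1986'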


def crossref_lookup(text, minimum_score=0.01):
    import requests_cache
    requests_cache.install_cache('/tmp/crossref_cache')
    # requests_cache.install_cache('/tmp/crossref_cache', backend='redis')
    # See the Crossref labs blog at this link for the general approach:
    # http://labs.crossref.org/resolving-citations-we-dont-need-no-stinkin-parser/
    # and the api help for documentation:
    # http://search.labs.crossref.org/help/api

    dois_response = requests.get('http://search.labs.crossref.org/dois', params={
        'q': text,
        'rows': 1,  # results per page
        'page': 1,
        'sort': 'score',
    })
    search_results = dois_response.json()
    for search_result in search_results[:1]:
        # each search_result looks like this:
        # {
        #   "coins": "ctx_ver=Z39.88-2004&amp;rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.3765%2Fsp.6.2&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;rft.atitle=Strategic+conversation&amp;rft.jtitle=Semantics+and+Pragmatics&amp;rft.date=2013&amp;rft.volume=6&amp;rft.aufirst=Nicholas&amp;rft.aulast=Asher&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.au=Nicholas+Asher&amp;rft.au=+Alex+Lascarides",
        #   "doi": "http://dx.doi.org/10.3765/sp.6.2",
        #   "fullCitation": "Nicholas Asher, Alex Lascarides, 2013, 'Strategic conversation', <i>Semantics and Pragmatics</i>, vol. 6",
        #   "normalizedScore": 100,
        #   "score": 5.295812,
        #   "title": "Strategic conversation",
        #   "year": "2013"
        # },
        if search_result['score'] > minimum_score:
            logger.info('Using top result; score = %.2f.', search_result['score'])
            doi_url = search_result['doi']
            # See http://www.crosscite.org/cn/ for details on content negotiation here
            bibtex_response = requests.get(doi_url, headers={'Accept': 'application/x-bibtex'})
            # for some reason, chardet thinks it's iso-8859-2 = latin2 (european), but just decode with utf-8
            bibtex_response.encoding = 'UTF-8'

            # the returned bibtex looks like this (but without line breaks)
            # @article{Asher_Lascarides_2013, title={Strategic conversation},
            #   volume={6}, url={http://dx.doi.org/10.3765/sp.6.2},
            #   DOI={10.3765/sp.6.2}, journal={Semantics and Pragmatics},
            #   publisher={Semantics and Pragmatics}, year={2013}, month={Aug},
            #   author={Asher, Nicholas and Lascarides, Alex}}
            for reference in parse_bibtex(bibtex_response.text):
                yield reference
        else:
            logger.warn('No result has a score > %.2f', minimum_score)
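
(A minimal sketch of driving crossref_lookup end to end; serialize_reference is the TeX-side serializer imported by cli.py below, and its exact output format is not shown in this diff:)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    for reference in crossref_lookup(u'Asher & Lascarides 2013 Strategic conversation'):
        # each yielded Reference was parsed out of the content-negotiated BibTeX
        print serialize_reference(reference)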
85 changes: 58 additions & 27 deletions xdoc/cli.py
@@ -1,37 +1,13 @@
import sys
import os
import argparse
import logging
logging.addLevelName(5, 'SILLY')

from xdoc.lib.log import logging


class Logger(logging.Logger):
    def silly(self, msg, *args, **kwargs):
        level = logging.getLevelName('SILLY')
        if self.isEnabledFor(level):
            self._log(level, msg, args, **kwargs)

    def notset(self, msg, *args, **kwargs):
        level = logging.getLevelName('NOTSET')
        if self.isEnabledFor(level):
            self._log(level, msg, args, **kwargs)


def main():
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('input', help='Input filename')
    parser.add_argument('output', help='Output filename')
    parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output')
def translate(parser):
    opts = parser.parse_args()

    # max(map(len, [logging.getLevelName(level) for level in range(0, 60, 10)])) == 8
    level = logging.DEBUG if opts.verbose else logging.INFO
    logging.basicConfig(format='%(levelname)-8s %(asctime)14s (%(name)s): %(message)s', level=level)
    logging.setLoggerClass(Logger)
    logger = logging.getLogger(__name__)
    logger.info('Logging with level >= %s (%s)', logging.root.level, logging.getLevelName(logging.root.level))

    # read input
    input_root, input_extension = os.path.splitext(opts.input)
@@ -58,4 +34,59 @@ def main():
    else:
        raise NotImplementedError('File extension "%s" not supported as output' % output_extension)


def parsebib(parser):
    opts = parser.parse_args()
    logger = logging.getLogger(__name__)

    from xdoc.bibliography import crossref_lookup
    from xdoc.formats.tex import serialize_reference

    input = sys.stdin if (opts.input == '-') else open(opts.input)
    output = sys.stdout if (opts.output == '-') else open(opts.output, 'w')

    for line in input:
        line = line.strip().decode('utf8')
        logger.info('Resolving "%s" via CrossRef API', line)

        for bibitem in crossref_lookup(line):
            print >> output, serialize_reference(bibitem)
            break
        else:
            logger.error('FIXME: could not parse bib item: %s', line)

    # import unicodedata
    # crf = train_crf()
    # if line:
    #     strip tex if needed (only simple commands; if there are environments, too bad)
    #     line = tex_command.sub(line, 'a\1z')
    #     line = re.sub(r'\\[a-z]+\{([^\}]+)\}', r'\1', line)

    #     tokens = unidecode(line).split()
    #     features = [list(token_features(token)) for token in tokens]
    #     labels = crf.predict(features)
    #     pairs = zip(tokens, labels)
    #     print gloss.gloss(pairs, prefixes=('', Fore.GREEN), postfixes=(Fore.RESET, Fore.RESET), groupsep='\n')

actions = dict(translate=translate, parsebib=parsebib)


def main():
    parser = argparse.ArgumentParser(
        description='Usage: xdoc original.docx converted.tex',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('input', help='input filename')
    # parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('output', help='output filename')
    parser.add_argument('-a', '--action', choices=actions, default='translate', help='xdoc action')
    parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output')
    opts = parser.parse_args()

    logging.root.setLevel(logging.DEBUG if opts.verbose else logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Logging with level >= %s (%s)', logging.root.level, logging.getLevelName(logging.root.level))

    actions[opts.action](parser)

    logger.debug('Done')
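
(With the console_scripts entry point elided above, the installed command name is an assumption, but the dispatch here means something like `xdoc original.docx converted.tex` runs the default translate action, while `xdoc --action parsebib citations.txt -` resolves one free-form citation per line via the CrossRef API, with `-` selecting stdin/stdout in parsebib.)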
28 changes: 19 additions & 9 deletions xdoc/dom.py
@@ -3,14 +3,15 @@
import copy

from xdoc.lib.base import object_ustr
from xdoc.lib.regex import named, maybe, anything, some, s, space, sep, end


class Document(object_ustr):
    '''
    The D in DOM
    `metadata` is for things like author, title, date, etc.,
    that might be printed on each page (in headers/footers)
    that might appear on each page (in headers/footers)
    `spans` are the meat of the document
    `bibliography` is a list of References
@@ -111,14 +112,23 @@ def __init__(self, first_name, middle_name, last_name):

    @classmethod
    def from_string(cls, string):
        parts = re.split(r',\s*', string, maxsplit=2)
        # TODO: handle von-type last names
        if len(parts) == 3:
            return cls(parts[2], parts[1], parts[0])
        elif len(parts) == 2:
            return cls(parts[1], None, parts[0])
        else:
            raise Exception('One-part names are not yet handled: "%s"' % string)
        first = named('first', some)
        middle = named('middle', some)
        last = named('last', some)

        patterns = [
            last + ',\s+' + first + '\s+' + middle + end,  # Auden, Wystan Hugh
            last + ',\s+' + first + end,  # Auden, Wystan
            first + '\s+' + middle + '\s+' + last + end,  # Wystan Hugh Auden
            first + '\s+' + last + end,  # Wystan Auden
        ]
        for pattern in patterns:
            match = re.match(pattern, string)
            if match:
                groups = match.groupdict()
                return cls(groups['first'], groups.get('middle'), groups['last'])

        raise Exception('Cannot parse name: "%s"' % string)

    def __unicode__(self):
        return ' '.join(filter(None, [self.first_name, self.middle_name, self.last_name]))
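
(The four name patterns above, exercised; the results assume the regex helpers behave as sketched earlier, with patterns tried in order:)

    Author.from_string(u'Auden, Wystan Hugh')
    # -> first_name=u'Wystan', middle_name=u'Hugh', last_name=u'Auden'
    Author.from_string(u'Wystan Auden')
    # -> first_name=u'Wystan', middle_name=None, last_name=u'Auden'
    unicode(Author.from_string(u'Auden, Wystan'))
    # -> u'Wystan Auden'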
