Permalink
Browse files

Build a mentions-based recommendation engine

See latexrec and mentionsrec for implementation and the
scripts/astrorecs.py script for an example.

The idea is to list the papers cited by your article and count the number of
times each one is mentioned (these are the primary references).

Of these primary references, find the set of all secondary references
(papers referred to by primary references, but that are not also
primary references).

For each secondary reference, find all of its own (tertiary) references that
are also primary references, and count the number of mentions of each.
Then score the secondary reference by the cosine similarity between its
mention-count vector and the origin document's mention-count vector over the
primary references.
  • Loading branch information...
jonathansick committed Dec 10, 2014
1 parent a4dac08 commit 3b2afbda6232f0d21690596b17d7f61ae5444d1e
Showing with 128 additions and 7 deletions.
  1. +31 −4 astrorec/latexrec.py
  2. +87 −0 astrorec/mentionsrec.py
  3. +9 −2 scripts/astrorecs.py
  4. +1 −1 setup.py
View
@@ -6,16 +6,43 @@
from paperweight.document import FilesystemTexDocument
from starlit.bib.bibtexdb import BibTexDB
from .mentionsrec import MentionsRecs
class LaTeXRecommender(object):
    """Recommend papers to be cited in your LaTeX file.

    Parameters
    ----------
    tex_filepath : str
        Path of the LaTeX document to analyze.
    ads_cache : optional
        Cache object forwarded to ``BibTexDB`` and ``MentionsRecs``
        (defaults to ``None``).

    Raises
    ------
    AssertionError
        If the document does not expose a BibTeX path.
    """

    def __init__(self, tex_filepath, ads_cache=None):
        super(LaTeXRecommender, self).__init__()
        self._filepath = tex_filepath
        self._ads_cache = ads_cache

        self._doc = FilesystemTexDocument(tex_filepath)
        self._doc.inline_inputs()
        self._doc.remove_comments()
        rich_cites = self._doc.rich_bib_keys

        bib_path = self._doc.bib_path  # we're using bibtex, right?
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if bib_path is None:
            raise AssertionError("You need to use BibTeX")
        # FIXME could also parse those bibitems
        bibdb = BibTexDB(bib_path, ads_cache=ads_cache)

        # Kept on the instance so the analysis outlives the constructor.
        self._mention_recs = MentionsRecs(self._ads_cache)

        # Initialize with the set of ADS bibcodes we've already cited
        for bib_key in rich_cites:
            # Best effort: skip keys that cannot be resolved, but do not
            # swallow KeyboardInterrupt/SystemExit like a bare except would.
            try:
                ref_pub = bibdb[bib_key]
                arxiv_id = ref_pub.arxiv_id
            except Exception:
                continue
            if arxiv_id is None:
                continue
            # NOTE(review): this is the size of the whole citation
            # collection, so every publication gets the same count.
            # Presumably the per-key mention count was intended (e.g.
            # ``len(rich_cites[bib_key])`` if ``rich_bib_keys`` maps each key
            # to its mentions) -- confirm against paperweight before changing.
            n_mentions = len(rich_cites)
            self._mention_recs.add_cited_pub(ref_pub, n_mentions)

        self._mention_recs.analyze_secondary()
        # TODO get top *N* recommendations by score
View
@@ -0,0 +1,87 @@
#!/usr/bin/env python
# encoding: utf-8
"""
2014-12-10 - Created by Jonathan Sick
"""
import numpy as np
from starlit.bib.adsdb import ADSBibDB
class MentionsRecs(object):
    """Citation recommendations based on mention frequency analysis.

    Directly cited publications are registered with ``add_cited_pub``;
    ``analyze_secondary`` then builds and scores the publications that those
    cite but that are not directly cited themselves.
    """

    def __init__(self, ads_cache):
        super(MentionsRecs, self).__init__()
        self._adsdb = ADSBibDB(cache=ads_cache)
        # Parallel lists describing the directly cited (B-level) publications
        self._cited_pubs = []
        self._cited_bibcodes = []
        self._cited_mention_counts = []

    def add_cited_pub(self, pub, n_mentions):
        """Register a directly cited publication and its mention count."""
        self._cited_pubs.append(pub)
        self._cited_bibcodes.append(pub.bibcode)
        self._cited_mention_counts.append(n_mentions)

    def analyze_secondary(self):
        """Build a secondary set of references to recommend from."""
        # Everything referenced by a directly cited publication...
        all_references = []
        for primary in self._cited_pubs:
            all_references.extend(primary.reference_bibcodes)
        # ...de-duplicated, minus what is already cited directly.
        already_cited = set(self._cited_bibcodes)
        candidates = list(set(all_references) - already_cited)

        self._secondary_pubs = []
        mention_vector = np.array(self._cited_mention_counts)
        self._secondary_pubs.extend(
            SecondaryPub(code, self._adsdb, self._cited_bibcodes,
                         mention_vector)
            for code in candidates)

        self._secondary_scores = []
        self._secondary_scores.extend(
            candidate.score for candidate in self._secondary_pubs)
        # TODO way to return top *n* publications
class SecondaryPub(object):
    """A publication at the secondary level that will be scored for relevance
    to the original paper via mentions to the tertiary papers.

    Parameters
    ----------
    bibcode
        Key used to look this publication up in ``adsdb``.
    adsdb
        Mapping-like object; ``adsdb[bibcode]`` must return an object with a
        ``reference_bibcodes`` iterable.
    cited_bibcodes
        Bibcodes of the primary (directly cited) references.
    cited_mentions : numpy.ndarray
        Mention counts of the primary references in the original paper.
    """

    def __init__(self, bibcode, adsdb, cited_bibcodes, cited_mentions):
        super(SecondaryPub, self).__init__()
        self._bibcode = bibcode
        self._adsdb = adsdb
        self._cited_bibcodes = cited_bibcodes
        # Mentions vector for primary references
        self._cited_mentions = cited_mentions
        # Mentions vector for tertiary references; stays all zeros until the
        # TODO in the loop below is implemented.
        self._tertiary_mentions = np.zeros(self._cited_mentions.shape)

        # TODO read and build the rich citations for this publication

        # query ADS for this paper
        pub = adsdb[bibcode]

        # Analyze only tertiary references that appear in the original
        # paper too (and thus are likely to be relevant).
        primary_lookup = set(self._cited_bibcodes)
        for ref_bibcode in pub.reference_bibcodes:
            if ref_bibcode not in primary_lookup:
                continue
            # TODO combine bibcode to number of mentions to fill in
            # self._tertiary_mentions

    @property
    def score(self):
        """Cosine similarity of the primary and tertiary mention vectors.

        Returns ``0.0`` when either vector has zero length, since the
        similarity is undefined there.

        http://en.wikipedia.org/wiki/Cosine_similarity
        """
        # np.hypot is a two-argument ufunc; the vector norm is what is needed.
        norm_product = (np.linalg.norm(self._cited_mentions)
                        * np.linalg.norm(self._tertiary_mentions))
        if norm_product == 0:
            return 0.0
        dot = np.sum(self._cited_mentions * self._tertiary_mentions)
        return float(dot / norm_product)
View
@@ -10,16 +10,23 @@
from astrorec.latexrec import LaTeXRecommender
from astrorec.arxivrec import ArXivRecommender
from starlit.bib.adscache import ADSCacheDB
from starlit.bib.adsdb import ADSBibDB
def main():
    """Build a recommender for the input token given on the command line.

    An input token that is an existing filesystem path is treated as a LaTeX
    file; anything else is treated as an arXiv ID.
    """
    args = parse_args()

    cachedb = ADSCacheDB(host='localhost',
                         port=27017,
                         ads_db=ADSBibDB())

    # Only the post-commit construction of each branch is kept; leaving the
    # superseded lines in would build the recommender twice.
    if os.path.exists(args.input_token):
        # assume it's a latex file
        rec = LaTeXRecommender(args.input_token, ads_cache=cachedb)
    else:
        # assume it's an arXiv ID. Could also be a ADS bibcode eventually
        rec = ArXivRecommender(args.input_token)
def parse_args():
View
@@ -28,7 +28,7 @@ def get_version():
setup(
name='astrorec',
version=get_version(),
author="Adam Becker, Matthew Sofftie, Jonathan Sick, Bekkie Smethurst, "
author="Adam Becker, Matthew Sottile, Jonathan Sick, Bekkie Smethurst, "
"Chris Lintott and .Astronomy",
# author_email='jonathansick@mac.com',
license='MIT',

0 comments on commit 3b2afbd

Please sign in to comment.