Generate the term-frequency matrix (reference venues) from a set of paper information

In [1]:
# Extensions: load IPython's autoreload extension and switch it to mode 2,
# which reloads imported modules before each cell executes, so edits to the
# local packages are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import scipy.sparse    as sparse
import numpy           as np

from core.search.query_paper_mag import paper_mag_multiquery
from core.search.query_info      import paper_info_mag_check_multiquery
from core.utils.entity_type      import Entity_type

from bag_of_venues import BagOfWords

In [3]:
# Id of the conference whose papers are analysed (labelled PLDI by the inline comment)
conf_id = 1127352206 #PLDI

# Query the papers attached to this conference entity
papers = paper_mag_multiquery(Entity_type.CONF, [conf_id])

print("Papers for conference:", len(papers))

# Look up the information record for each paper. The cache statistics shown
# after the paper count in the cell output are emitted during this call.
# NOTE(review): the structure of the returned records is not visible here —
# confirm it matches what BagOfWords.fit expects.
paper_informations = paper_info_mag_check_multiquery(papers)

Papers for conference: 1386
Complete cache entries found: 1386
Partial cache entries found: 0
No cache entries found: 0
Total ids to query: 1386


In [70]:
# Create the bag-of-words model and fit it on the paper information loaded above.
# Presumably fit() is what populates tf_matrix / title_dim / venue_dim, which the
# following cells inspect — confirm against bag_of_venues.
bov = BagOfWords()
bov.fit(paper_informations)

In [71]:
# Densify the term-frequency matrix once and reuse it for both the display and
# the total: every .toarray() call allocates a full dense copy of the matrix.
tf_dense = bov.tf_matrix.toarray()
print(tf_dense)
# Title and venue dimensions as reported by the fitted model
print(bov.title_dim, bov.venue_dim)
# Sum over every entry of the matrix
print(np.sum(tf_dense))

[[1 2 1 ... 0 0 0]
 [2 2 2 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [1 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
1186 1024
28209


In [72]:
# Two example paper titles (lower-case, no punctuation) used as queries in the cells below
title1 = 'memsat checking axiomatic specifications of memory models'
title2 = 'mechanized verification of fine grained concurrent programs'

In [73]:
# Convert title1 to its vector under the fitted model; the vector is shown as the cell output
bov.title_to_vec(title1)

array([0, 1, 1, ..., 0, 0, 0])

In [74]:
# Similarity score between the two example titles.
# NOTE(review): the similarity measure itself is defined in BagOfWords.sim_titles,
# which is not visible in this notebook.
bov.sim_titles(title1, title2)

0.36583339522585784

In [75]:
# Vectorise title1 first, then ask the model for the papers most similar to that vector
title1_vec = bov.title_to_vec(title1)
bov.most_sim(title1_vec)

[{'idx': 371,
  'title': 'memsat checking axiomatic specifications of memory models',
  'sim': 1.0000000000000002},
 {'idx': 126,
  'title': 'synthesizing software verifiers from proof rules',
  'sim': 0.7089306854261541},
 {'idx': 348,
  'title': 'dynamic partial order reduction for relaxed memory models',
  'sim': 0.7056422850727971},
 {'idx': 216,
  'title': 'discovering properties about arrays in simple programs',
  'sim': 0.6857254813237417},
 {'idx': 467,
  'title': 'herding cats modelling simulation testing and data mining for weak memory',
  'sim': 0.6754836711711946}]

In [76]:
# Vectorise title2 first, then ask the model for the papers most similar to that vector
title2_vec = bov.title_to_vec(title2)
bov.most_sim(title2_vec)

[{'idx': 501,
  'title': 'mechanized verification of fine grained concurrent programs',
  'sim': 1.0},
 {'idx': 987,
  'title': 'atomicity refinement for verified compilation',
  'sim': 0.86350607762354},
 {'idx': 677,
  'title': 'blame and coercion together again for the first time',
  'sim': 0.8617274844321392},
 {'idx': 135,
  'title': 'adoption and focus practical linear types for imperative programming',
  'sim': 0.8251126821451},
 {'idx': 792,
  'title': 'monadic abstract interpreters',
  'sim': 0.8124444637023874}]

In [77]:
# Build a vector for an author, looked up by lower-cased name.
# NOTE(review): the array shown under this cell is printed during the call (an
# assignment displays nothing by itself) — presumably a leftover debug print
# inside author_to_vec; confirm and remove it there if unintended.
auth_vec = bov.author_to_vec('stephen m blackburn')

[ 4 16  1 ...  0  0  0]


In [79]:
# Papers most similar to the author's vector, using the same query as for the title vectors
bov.most_sim(auth_vec)

[{'idx': 382,
  'title': 'beltway getting around garbage collection gridlock',
  'sim': 0.9458490602925756},
 {'idx': 149,
  'title': 'immix a mark region garbage collector with space efficiency fast collection and mutator performance',
  'sim': 0.93887134688203},
 {'idx': 447,
  'title': 'the compressor concurrent incremental and parallel compaction',
  'sim': 0.9235948757944468},
 {'idx': 444,
  'title': 'free me a static analysis for automatic individual object reclamation',
  'sim': 0.9039360825220187},
 {'idx': 683,
  'title': 'z rays divide arrays and conquer speed and flexibility',
  'sim': 0.896380999278975}]