In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, LabeledSentence
from nltk.tokenize import MWETokenizer
import multiprocessing
import pandas as pd
import re
import sqlite3

# Number of CPUs reported by the OS; passed as `workers` when the Doc2Vec model is built.
cores = multiprocessing.cpu_count()

In [23]:
# Splits text on runs of non-word characters; used to tokenise lower-cased abstracts.
spltr = re.compile( r'\W+' )
# Shared list that the two generator functions below append source paper ids to
# (the CSV INDEX value for DBLP rows, the SQLite `id` column for arXiv rows).
tag_map = []
# Presumably the position in tag_map at which arXiv entries begin (initially 0) -- TODO confirm usage.
arXiv_start = 0

def get_abstracts_indices_from_DBLP():
    """Yield one TaggedDocument per DBLP paper that has an abstract.

    Reads '../data-kw-216432.csv'. For every row whose ABSTRACT is a string,
    the text is lower-cased, split on runs of non-word characters, and the
    non-empty tokens are yielded as a TaggedDocument.

    The tag of each document is its position in the module-level `tag_map`,
    and `tag_map[tag]` is that paper's INDEX value.  Rows without an abstract
    are skipped *without* consuming a tag, so tags are contiguous and match
    the document's position in the list built from this generator.

    Side effects:
        Clears `tag_map` when iteration starts (DBLP tags begin at 0, so a
        re-run must not stack on top of stale entries), then appends one
        entry per yielded document.
    """
    papers = pd.read_csv( '../data-kw-216432.csv' )
    # TODO: Get from whole dataset (too)
    tag_map.clear()
    # Iterate the two columns in lockstep instead of label-indexing INDEX by row number.
    for index, abstract in zip( papers['INDEX'], papers['ABSTRACT'] ):
        # Missing CSV cells are read as float NaN; anything that is not text has no abstract.
        if not isinstance( abstract, str ):
            continue
        ab_words = [ w for w in spltr.split( abstract.lower() ) if w != '' ]
        # Dense tag: the slot this paper is about to occupy in tag_map.
        tag = len( tag_map )
        tag_map.append( index )
        yield TaggedDocument( words=ab_words, tags=[tag] )
        # TODO: Consider having the list of references as multiple tags/labels?
        
def get_abstracts_indices_from_arXiv():
    """Yield one TaggedDocument per arXiv abstract stored in the SQLite database.

    Opens '../arxiv-server/paper_trail.db' and reads `id` and `abstract` from
    the `abstracts` table.  Each abstract is lower-cased, split on runs of
    non-word characters, and yielded with a tag equal to the position its id
    is stored at in the module-level `tag_map`.  Rows whose abstract is NULL
    are skipped and do not consume a tag.

    Side effects:
        Sets the module-level `arXiv_start` to the length of `tag_map` at the
        moment iteration starts, and appends one id to `tag_map` per yielded
        document.  The database connection is closed when the generator
        finishes, is closed early, or raises.
    """
    # Without `global` the assignment would only create a local and the
    # module-level arXiv_start would stay at its initial value.
    global arXiv_start
    arXiv_start = len( tag_map )
    conn = sqlite3.connect( '../arxiv-server/paper_trail.db' )
    try:
        curs = conn.cursor()
        for index, abstract in curs.execute( 'SELECT id, abstract FROM abstracts' ):
            # A NULL abstract comes back as None and cannot be tokenised.
            if abstract is None:
                continue
            ab_words = [ w for w in spltr.split( abstract.lower() ) if w != '' ]
            # Tag by tag_map position so skipped rows never misalign tags and ids.
            tag = len( tag_map )
            tag_map.append( index )
            yield TaggedDocument( words=ab_words, tags=[tag] )
    finally:
        conn.close()

In [24]:
# Run the DBLP generator to completion and keep its documents in a list.
DBLP_docs = list( get_abstracts_indices_from_DBLP() )

In [25]:
# Run the arXiv generator to completion and keep its documents in a list.
arXiv_docs = list( get_abstracts_indices_from_arXiv() )

In [26]:
# Combined corpus: all DBLP documents first, followed by all arXiv documents.
docs = DBLP_docs + arXiv_docs

In [11]:
def get_paper_abstract( idx ):
    """Return the tokenised abstract stored at position `idx` of `docs`.

    The result is the document's word list re-joined with single spaces, so
    it is the lower-cased, punctuation-free form rather than the raw text.
    """
    tagged_doc = docs[idx]
    tokens = tagged_doc.words
    return ' '.join( tokens )

In [28]:
# Train Doc2Vec on the combined corpus (400-dimensional vectors, window of 8,
# min_count of 2, one worker per CPU core).
# NOTE(review): `size=` is the older gensim spelling of the vector dimension -- confirm the pinned gensim version.
model = Doc2Vec( docs, size=400, window=8, min_count=2, workers=cores )

In [29]:
# Documents closest to the document tagged 397, shown as (tag, similarity) pairs.
model.docvecs.most_similar( 397 )

[(92, 0.9642688632011414),
 (257, 0.9634057879447937),
 (264, 0.9507011771202087),
 (97, 0.9483972787857056),
 (186, 0.9463803172111511),
 (330, 0.9451543092727661),
 (272, 0.9448680877685547),
 (280, 0.9366728067398071),
 (273, 0.9333893060684204),
 (230, 0.9290798902511597)]

In [10]:
import numpy as np
# Code you'll find on nycdatascience.com:
def cossim( v1, v2 ):
    """Return the cosine similarity of the vectors `v1` and `v2`.

    Computed as dot(v1, v2) divided by the Euclidean norm of each vector in turn.
    """
    inner = np.dot( v1, v2 )
    norm_1 = np.sqrt( np.dot( v1, v1 ) )
    norm_2 = np.sqrt( np.dot( v2, v2 ) )
    return inner / norm_1 / norm_2
def argmaxn( l, n ):
    """Return the indices of the `n` largest values of `l`, largest first.

    Args:
        l: Iterable of numeric scores.  It is copied and never modified.
        n: Number of indices wanted.

    Returns:
        A list of at most `n` indices into `l`.  Ties go to the lower index
        (np.argmax reports the first occurrence).  When `n` exceeds the
        number of scores only that many indices are returned, so an index is
        never repeated just because every score has already been taken; an
        empty `l` gives an empty list.
    """
    # Convert once: calling np.argmax on a Python list would rebuild an array on every pass.
    scores = np.array( list(l), dtype=float )
    args = []
    for _ in range( min( n, scores.size ) ):
        arg = np.argmax( scores )
        args.append( arg )
        # Knock the winner out so the next pass finds the runner-up.
        scores[arg] = -np.inf
    return args

In [30]:
# Document to query and how many of its neighbours to print.
test_idx = 397
compare = 2

# Never ask for more than 10 neighbours from the slice below.
compare = min( compare, 10 )
print( 'TEST ABSTRACT:\n' + get_paper_abstract(test_idx) )
print( '\n  SIMILAR TO:\n' )
neighbours = model.docvecs.most_similar( test_idx )
for doc_tag, score in neighbours[:compare]:
    print( get_paper_abstract(doc_tag) )
    print( '  (id = '+ str(doc_tag) + ', with a score of '+ str(score) +')\n' )

TEST ABSTRACT:
in this paper we present our research on social interaction in co located handheld augmented reality ar games these games are characterized by shared physical spaces that promote physical awareness among players and individual gaming devices that support both public and private information one result of our exploration of the design and evaluation of such games is a prototype called bragfish through bragfish we aim to investigate the connections between the observed game experience focusing on social and physical interaction and the designed affordances of our ar handheld game our evaluation of bragfish shows that most of our participants form strategies for social play by leveraging visual aural and physical cues from the shared space moreover we use this as an example to motivate discussions on how to improve social play experiences for co located handheld games by designing for shared spaces

  SIMILAR TO:

emergent game formats such as machinima that use game worlds 

In [13]:
# NOTE: THIS IS WHERE YOU PUT USER INPUT
vec = model.infer_vector( ['handheld', 'augmented', 'reality'] )
most_sim = 5
# Code you'll find on nycdatascience.com:
# Score the inferred vector against every stored document vector, then print the best matches.
cossims = [ cossim(vec, doc_vec) for doc_vec in model.docvecs ]
sim_ids = argmaxn( cossims, most_sim )
for doc_id in sim_ids:
    print( doc_id, cossims[doc_id] )

172349 0.355598
198584 0.337821
151634 0.33091
169540 0.311499
197432 0.308624


In [31]:
# Code you'll find on that other website:
import lda
from sklearn.feature_extraction.text import CountVectorizer

# TODO: max_features was initially 10000
cvectorizer = CountVectorizer( min_df=4, max_features=5000, stop_words='english' )
# One space-joined string per document: DBLP documents first, then arXiv.
abstracts = [ ' '.join( doc.words ) for doc in DBLP_docs + arXiv_docs ]
cvz = cvectorizer.fit_transform( abstracts )

In [None]:
# LDA settings handed to lda.LDA below.
n_topics = 20
n_iter = 2000
# NOTE(review): no seed is passed here, so results may differ between runs -- confirm whether that matters.
lda_model = lda.LDA( n_topics=n_topics, n_iter=n_iter )
# Fit on the count matrix built by the CountVectorizer cell and keep the transformed output.
X_topics = lda_model.fit_transform( cvz )

In [None]:
# Number of highest-weighted words printed per topic.
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_
# get_feature_names() was deprecated and later removed in newer scikit-learn
# releases; prefer its replacement when present and fall back otherwise.
if hasattr( cvectorizer, 'get_feature_names_out' ):
    vocab = list( cvectorizer.get_feature_names_out() )
else:
    vocab = cvectorizer.get_feature_names()
# Build the vocabulary array once instead of once per topic.
vocab_arr = np.array( vocab )
for i, topic_dist in enumerate( topic_word ):
    # argsort is ascending, so walk the tail backwards to get the top words first.
    topic_words = vocab_arr[ np.argsort(topic_dist) ][ : -(n_top_words+1):-1 ]
    summary = ' '.join( topic_words )
    topic_summaries.append( summary )
    print( 'Topic {}: {}'.format(i, summary ) )

In [62]:
# Source paper id stored at position 92 of tag_map.
tag_map[92]

4990

In [16]:
# Show the stored (tokenised, re-joined) abstract at position 151634 of the corpus.
get_paper_abstract( 151634 )

'due to the increase of interest in augmented reality ar the potential uses of ar are increasing also it can benefit the user in various fields such as education business medicine and other augmented reality supports the real environment with synthetic environment to give more details and meaning to the objects in the real word ar refers to a situation in which the goal is to supplement a user s perception of the real world through the addition of virtual objects this paper is an attempt to make a survey of web based augmented reality applications and make a comparison among them'

In [48]:
# Word-level query: terms scored as close to 'classification' while 'recognition' is given as a negative example.
model.most_similar( positive=['classification'], negative=['recognition'] )

[('clustering', 0.3813590407371521),
 ('ranking', 0.37842971086502075),
 ('ensemble', 0.3661080598831177),
 ('fusion', 0.3224477767944336),
 ('classifiers', 0.32240602374076843),
 ('inference', 0.30264925956726074),
 ('clusterings', 0.2996658682823181),
 ('pruning', 0.2980048656463623),
 ('boosting', 0.2952820360660553),
 ('encoding', 0.293839693069458)]

In [10]:
# Similarity score between the word set ['human', 'chess'] and the word set ['deep', 'blue'].
model.n_similarity( ['human', 'chess'], ['deep', 'blue'] )

0.15382886432060081

In [None]:
# When checking user's input, (if necessary) see if the keywords are present in the model or not