Requires: NLTK, Gensim and Networkx

Dataset Used: DBLP Citation Network V6 (http://arnetminer.org/citation)

To train the whole system from the ground up, use the following code.

This, however, will take at least 6 hours and use more than 6 GB of RAM.

In [1]:
from citemachine.corpus.dblp import DBLP
from citemachine import topic_model
from citemachine.text_process import CorpusPreprocessor
from citemachine.evaluation import precision, recall
from citemachine.graph import CommunityRank, adj_lists_to_directed_graph
from citemachine.recommender import LDARecommender, CiteMachine

# Location of the DBLP citation dump on disk -- change this to your local copy.
path_to_dblp = '/home/hex/acm_output.txt'
# Passed to the parser as max_docs; set to a small value (e.g. 5000) to test on
# a subset of the data.
num_docs = 1000

# Parse the dataset.
dblp = DBLP(path_to_dblp, max_docs=num_docs)

# Preprocess the data and train an LDA model with 100 topics (the preprocessing
# and training happen inside LDARecommender because train_at_init=True --
# NOTE(review): confirm against citemachine.recommender).
recommender = LDARecommender(corpus=dblp, num_topics=100, train_at_init=True)

# Build a directed reference graph from the parsed reference lists, then find
# communities and rank the documents in each one using PageRank (author's
# description of CommunityRank; its implementation is not visible here).
references_graph = adj_lists_to_directed_graph(dblp.references)
comrank = CommunityRank(references_graph)

# Combine the LDA model with the community graphs to create the final
# recommendation system.
citem = CiteMachine(recommender, comrank)



In [2]:
# Smoke test: ask the combined recommender for documents matching a free-text
# query. The cell output below is a list of 2-tuples, presumably
# (document id, score) -- confirm against CiteMachine.
citem.get_recommended_docs_for_text( "This better work, or I'm gonna smack you upside the head." )

[(5583, 0.14246331098208675),
 (5533, 0),
 (5522, 0),
 (5267, 0),
 (5302, 0),
 (5591, 0),
 (5014, 0),
 (5078, 0),
 (5377, 0),
 (5553, 0),
 (5036, 0),
 (5725, 0),
 (5558, 0),
 (5676, 0),
 (5614, 0),
 (5634, 0),
 (5598, 0),
 (5555, 0),
 (5795, 0),
 (5809, 0),
 (5677, 0),
 (5581, 0),
 (5666, 0),
 (5523, 0),
 (5665, 0),
 (5486, 0),
 (5599, 0),
 (5372, 0),
 (5540, 0),
 (5762, 0),
 (5061, 0),
 (5354, 0),
 (5107, 0),
 (5081, 0),
 (5259, 0),
 (5394, 0),
 (5187, 0),
 (5129, 0),
 (5454, 0),
 (5299, 0),
 (5508, 0),
 (5503, 0),
 (5641, 0),
 (5589, 0),
 (5487, 0),
 (5610, 0),
 (5772, 0),
 (5229, 0),
 (5373, 0),
 (5717, 0)]

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, LabeledSentence
from nltk.tokenize import MWETokenizer
import pandas as pd
import random

# The dataXXX.csv file must begin with the following header row (add it as the first line if it is missing):
#INDEX,1,2,3,4,CITATIONS,10,ABSTRACT,f

In [8]:
def get_abstracts_indices(csv_path='data-kw-216432.csv'):
    """Yield one ``TaggedDocument`` per paper that has an abstract.

    Reads the CSV at ``csv_path`` and, for every row whose ``ABSTRACT`` cell
    holds text, yields a ``TaggedDocument`` whose words are the lower-cased,
    whitespace-split abstract and whose single tag is the row's ``INDEX``
    converted to ``int``. Rows without an abstract are skipped silently.

    Args:
        csv_path: Path of the CSV file to read; it must contain ``INDEX`` and
            ``ABSTRACT`` columns. Defaults to the file name this notebook was
            written against, so existing zero-argument calls keep working.

    Yields:
        TaggedDocument: one per row that has a textual abstract.
    """
    papers = pd.read_csv(csv_path)
    # TODO: Get from whole dataset (too)
    # Walk the two columns in lockstep instead of looking INDEX up by label for
    # each enumerate() position; this does not depend on the frame's row labels.
    for index, abstract in zip(papers['INDEX'], papers['ABSTRACT']):
        # pandas reads an empty cell as float NaN; skip anything that is not
        # text so that .lower() below cannot fail.
        if not isinstance(abstract, str):
            continue
        ab_words = abstract.lower().split()  # TODO: CHANGE TO SOMETHING ACTUALLY MEANINGFUL
        yield TaggedDocument(words=ab_words, tags=[int(index)])
        # TODO: Consider having the list of references as multiple tags/labels?

In [9]:
# Materialise every tagged abstract up front so Doc2Vec receives a list rather
# than a one-shot generator.
tdocs = list( get_abstracts_indices() )
# Build a Doc2Vec model from the tagged abstracts (500-dimensional vectors,
# context window of 10, words seen fewer than 4 times ignored, 4 worker threads).
# NOTE(review): `size` is the keyword of older gensim releases; newer releases
# name it `vector_size` -- confirm against the installed version.
model = Doc2Vec( tdocs, size=500, window=10, min_count=4, workers=4 )

In [None]:
# Read the CSV file.
# TODO: Improve the structure -- the file is read twice (here and inside
# get_abstracts_indices).
papers = pd.read_csv('data-kw-216432.csv' )

# Select 2000 documents at random, without replacement. random.sample raises
# ValueError if the file has fewer than 2000 rows.
ids_sel = random.sample(list(papers['INDEX']), 2000)

#define function to compare citation
def compare_citations(ID):
    """Score the Doc2Vec neighbours of paper ``ID`` against its recorded citations.

    The predicted citations are the document tags returned by
    ``model.docvecs.most_similar(ID)``. The actual citations are the
    ``;``-separated integers stored in the ``CITATIONS`` column of the
    module-level ``papers`` frame for the row(s) whose ``INDEX`` equals ``ID``
    (if several rows match, the last one holding text is used). When at least
    one prediction is an actual citation, ``ID`` and the number of hits are
    printed.

    Args:
        ID: Paper index; used both as the ``INDEX`` value looked up in
            ``papers`` and as the document tag queried in ``model``.

    Returns:
        The number of predicted citations that are actual citations, divided
        by the smaller of the two list lengths. ``0.0`` is returned when the
        paper has no recorded citations (no matching row, an empty cell, or a
        cell without any ids) or when there are no predictions.

    Raises:
        ValueError: If a non-empty citation token is not an integer.
    """
    # TODO: evaluation of this
    # NOTE(review): most_similar presumably fails for an ID that was never
    # trained (e.g. a paper skipped for having no abstract) -- confirm, and
    # decide how such IDs should be scored.
    pred_citations = [doc_id for doc_id, _score in model.docvecs.most_similar(ID)]

    actual_citations = []
    for raw in papers[papers['INDEX'] == ID]['CITATIONS']:
        # An empty CITATIONS cell is read by pandas as float NaN, which has no
        # .split(); only textual cells carry citation ids.
        if isinstance(raw, str):
            # Skip empty tokens (e.g. from a trailing ';') so int() cannot
            # fail on ''.
            actual_citations = [int(tok) for tok in raw.split(";") if tok.strip()]

    denominator = min(len(pred_citations), len(actual_citations))
    if denominator == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0

    hits = len(set(pred_citations) & set(actual_citations))
    if hits > 0:
        print(ID, hits)
    return hits / denominator
    

# Score every sampled paper, then report the mean score over the sample.
accuracy = {}
for sampled_id in ids_sel:
    accuracy[sampled_id] = compare_citations(int(sampled_id))
mean_accuracy = sum(accuracy.values()) / len(accuracy)
print(mean_accuracy)

942619 1
419120