Requires: NLTK, Gensim and NetworkX

Dataset Used: DBLP Citation Network V6 (http://arnetminer.org/citation)

To train the whole system from the ground up, use the following code.

Note, however, that this will take at least 6 hours and use more than 6 GB of RAM.

In [None]:
from citemachine.corpus.dblp import DBLP
from citemachine import topic_model
from citemachine.text_process import CorpusPreprocessor
from citemachine.evaluation import precision, recall
from citemachine.graph import CommunityRank, adj_lists_to_directed_graph
from citemachine.recommender import LDARecommender, CiteMachine

# Location of the DBLP citation-network dump on disk.
path_to_dblp = 'dblp.txt'
# Maximum number of documents to read from the dump. A small value (2000 here)
# keeps the run short; raise it to train on more of the data.
num_docs = 2000

# Parse the dataset (at most `num_docs` documents are requested via max_docs).
dblp = DBLP(path_to_dblp, max_docs=num_docs)

# Preprocess the corpus and train an LDA topic model with 100 topics;
# train_at_init=True asks for training to happen inside the constructor.
recommender = LDARecommender(corpus=dblp, num_topics=100, train_at_init=True)

# Build a directed reference graph from the per-document reference lists, then
# find communities and rank the documents inside each one using PageRank.
references_graph = adj_lists_to_directed_graph(dblp.references)
comrank = CommunityRank(references_graph)

# Combine the LDA recommender with the community ranking to create the final
# recommendation system. `citem` is what the evaluation cell queries.
citem = CiteMachine(recommender, comrank)

In [None]:
#citemachine evaluation code.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, LabeledSentence
from nltk.tokenize import MWETokenizer
import pandas as pd
import random

# Load the paper table; the evaluation code in this cell reads its INDEX,
# ABSTRACT and CITATIONS columns.
papers = pd.read_csv('data-kw-216432.csv')

# Fixed seed so the same evaluation sample is drawn on every run.
random.seed(1)
# Select 2000 documents at random (without replacement) by their INDEX value.
ids_sel = random.sample(list(papers['INDEX']), 2000)
#define function to compare citation
def compare_citations(ID):
    """Score CiteMachine's recommendations for one paper against its citations.

    The paper's abstract is passed to ``citem.get_recommended_docs_for_text``
    and the recommended document IDs are compared with the IDs stored in the
    paper's ``CITATIONS`` cell (a ``;``-separated string of integers).

    Args:
        ID: integer ``INDEX`` value of the paper to evaluate.

    Returns:
        ``len(predicted & actual) / min(len(predicted), len(actual))`` as a
        float, or ``0.0`` when the paper is not in ``papers``, has no usable
        abstract, has no citations, or receives no recommendations.
    """
    rows = papers[papers['INDEX'] == ID]
    if rows.empty:
        return 0.0

    abstract = rows['ABSTRACT'].iloc[0]
    # A missing CSV cell is read back as NaN (a float), not as a string.
    if not isinstance(abstract, str):
        return 0.0

    # Each recommendation is a (document id, score) pair; only the id is needed.
    pred_citations = [
        doc_id for doc_id, _score in citem.get_recommended_docs_for_text(abstract)
    ]
    print('pred', pred_citations)

    # When several rows share the ID, the citations of the last row are used.
    raw_citations = rows['CITATIONS'].iloc[-1]
    if not isinstance(raw_citations, str):
        return 0.0
    # Skip empty tokens so a stray or trailing ';' does not break int().
    actual_citations = [
        int(token) for token in raw_citations.split(";") if token.strip()
    ]

    # Normalise by the shorter list; guard against dividing by zero.
    smaller = min(len(pred_citations), len(actual_citations))
    if smaller == 0:
        return 0.0

    common = set(pred_citations) & set(actual_citations)
    return len(common) / smaller
    
# Compute the per-paper accuracy of the CiteMachine recommendations, keyed by
# the sampled paper's INDEX value.
# The dict has to be created here: without it the loop either raises NameError
# or silently writes into a dict left over from another cell.
accuracy = {}

for ns in ids_sel:
    accuracy[ns] = compare_citations(int(ns))
# To report the mean accuracy, uncomment:
# print('ACCURACY=', sum(accuracy.values()) / len(accuracy))

In [8]:
# Doc2Vec evaluation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, LabeledSentence
from nltk.tokenize import MWETokenizer
import pandas as pd
import random

# Load the paper table; this cell reads its INDEX, ABSTRACT and CITATIONS
# columns.
papers = pd.read_csv('data-kw-216432.csv')

# Fixed seed so the same evaluation sample is drawn on every run.
random.seed(1)
# Select 2000 documents at random (without replacement) by their INDEX value.
ids_sel = random.sample(list(papers['INDEX']), 2000)

def get_abstracts_indices():
    """Yield one ``TaggedDocument`` per paper in ``papers`` that has an abstract.

    Each document's words are the lower-cased, whitespace-split abstract and
    its single tag is the paper's ``INDEX`` value converted to ``int``.
    Rows whose abstract is not a string are skipped.
    """
    # TODO: Get from whole dataset (too)
    # Pair the two columns positionally, so the pairing stays correct even if
    # `papers` does not carry the default 0..n-1 row index, and no per-row
    # Series lookup is needed.
    for index, abstract in zip(papers['INDEX'], papers['ABSTRACT']):
        # A missing abstract is read from the CSV as NaN (a float), not a str.
        if not isinstance(abstract, str):
            continue
        ab_words = abstract.lower().split()  # TODO: CHANGE TO SOMETHING ACTUALLY MEANINGFUL
        yield TaggedDocument(words=ab_words, tags=[int(index)])
        # TODO: Consider having the list of references as multiple tags/labels?

# Materialise the generator into a list so the corpus can be iterated more
# than once.
tdocs = list( get_abstracts_indices() )
# Train a Doc2Vec model on the tagged abstracts.
# NOTE(review): `size=` looks like the pre-4.0 gensim spelling of what was
# later renamed `vector_size` — confirm against the installed gensim version.
model = Doc2Vec(tdocs, size=500, window=15, min_count=4, workers=4 )
        
#define function to compare citation
def compare_citations(ID):
    """Score Doc2Vec's nearest neighbours for one paper against its citations.

    The documents returned by ``model.docvecs.most_similar(ID)`` are treated
    as the predicted citations and compared with the IDs stored in the paper's
    ``CITATIONS`` cell (a ``;``-separated string of integers).  When at least
    one prediction is correct, the ID and the number of hits are printed.

    Args:
        ID: integer ``INDEX`` value of the paper to evaluate; it is also used
            as the document tag looked up in the model.

    Returns:
        ``len(predicted & actual) / min(len(predicted), len(actual))`` as a
        float, or ``0.0`` when the paper is not in ``papers``, has no
        citations, or has no neighbours.
    """
    rows = papers[papers['INDEX'] == ID]
    if rows.empty:
        return 0.0

    # When several rows share the ID, the citations of the last row are used.
    raw_citations = rows['CITATIONS'].iloc[-1]
    # A missing CSV cell is read back as NaN (a float), not as a string.
    if not isinstance(raw_citations, str):
        return 0.0
    # Skip empty tokens so a stray or trailing ';' does not break int().
    actual_citations = [
        int(token) for token in raw_citations.split(";") if token.strip()
    ]

    # Each neighbour is a (document tag, score) pair; only the tag is needed.
    pred_citations = [doc_id for doc_id, _score in model.docvecs.most_similar(ID)]

    # Normalise by the shorter list; guard against dividing by zero.
    smaller = min(len(pred_citations), len(actual_citations))
    if smaller == 0:
        return 0.0

    common = set(pred_citations) & set(actual_citations)
    if common:
        print(ID, len(common))
    return len(common) / smaller
    
# Score every sampled paper, then report the mean score over the sample.
accuracy = {}
for ns in ids_sel:
    accuracy[ns] = compare_citations(int(ns))
print(sum(accuracy.values()) / len(accuracy))

30045 1
385564 1
1117826 1
606014 1
593853 1
233202 1
1062285 1
1111729 1
336171 1
670535 2
873324 1
491551 1
227857 1
512848 1
1076061 1
446559 1
143888 1
842453 1
1077879 1
695506 1
479157 1
1080371 1
545580 1
283717 1
1119229 1
865464 1
1022624 1
1121705 1
542846 1
449117 1
500488 1
783922 1
1125238 1
1124661 1
1118428 1
142709 1
925300 1
779657 1
1112565 1
833247 1
449299 1
1023412 1
131480 1
1113320 1
0.0035805555555555547


In [9]:
# Load the full paper table from CSV into a DataFrame named `file`.
file = pd.read_csv('data-kw copy.csv')


In [80]:
# Final evaluation: separate the data into testing and training sets

import pandas as pd
import numpy as np 
import random

#print "data dimension", ts
#print "product attributes \n", train.columns.values 

# `file` is the DataFrame loaded earlier with pd.read_csv('data-kw copy.csv').
df = pd.DataFrame(file)

# Randomly hold out 10% of the rows as the test set (frac=0.1); the rows that
# were not sampled form the remaining set. No random_state is passed, so the
# split differs on every run.
df_test = df.sample(frac=0.1)
df_rest = df.loc[~df.index.isin(df_test.index)]

# Both files are written without a header row (header=False); the row index
# is still written as the first column.
df_test.to_csv('data-kw-test.csv',header=False)
df_rest.to_csv('data-kw.csv',header=False)    

26596