In [2]:
import os, json, requests, sys, string, requests, csv
import http.client, urllib.request, urllib.parse, urllib.error, base64
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from string import punctuation
from multiprocessing import Pool

from paper_filtering import filter_by_header
from paper_filtering import filter_by_page_number_keep_missing
from paper_filtering import filter_by_page_number_remove_missing
from paper_filtering import filter_by_header_and_page_number_keep_missing
from paper_filtering import filter_by_header_and_page_number_remove_missing
from paper_filtering import filter_journals
from paper_filtering import apply_filter_to_papers

In [6]:
# Elasticsearch host (host:port) used by every Search in this notebook.
eshost = "130.56.248.215:9200"

# Worker count: handed to the Elasticsearch client here and read by run_pools().
threads = 40

client = Elasticsearch(eshost, timeout=200, maxthreads=threads)


# Years to process. range() is end-exclusive, i.e. range(1, 3) covers [1, 2].
yearrange = range(2007, 2020)

# Path to the file containing venue names and their categories (conf or journal).
venue_category_filename = os.path.join(os.pardir, "app", "data", "venue_list.csv")


def dblp_raw_filename(name, year):
    """Return the path of the raw DBLP paper dump for a venue/year pair."""
    basename = "{}_{}_raw_dblp_papers.json".format(name, year)
    return os.path.join("DBLP_raw_data", basename)


def filtered_papers_filename(name, year):
    """Return the path of the filtered-papers file for a venue/year pair."""
    basename = "{}_{}_filtered_papers.json".format(name, year)
    return os.path.join("filtered_papers", basename)

In [3]:
def query_academic_search(type, url, query):
    """Send a request to the academic search API and return the decoded JSON.

    Args:
        type: HTTP verb, either "get" or "post". (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        url: Full endpoint URL.
        query: Request payload. URL-encoded into the query string for "get",
            sent as the JSON body for "post".

    Returns:
        The response body, decoded as UTF-8 and parsed as JSON.

    Raises:
        ValueError: if ``type`` is neither "get" nor "post".
    """
    # Validate first: an unknown verb used to fall through both branches and
    # fail on the return line with an UnboundLocalError for ``response``.
    if type not in ("get", "post"):
        raise ValueError("unsupported request type: {!r}".format(type))

    headers = {
        # Request headers. The key can be supplied through the environment so it
        # need not live in the notebook; the literal is kept only as a
        # backward-compatible fallback.
        'Ocp-Apim-Subscription-Key': os.environ.get(
            'MAS_SUBSCRIPTION_KEY', '4698d5e7b0244e828d1dc21134238650'),    # bens
    }

    if type == "get":
        response = requests.get(url, params=urllib.parse.urlencode(query), headers=headers)
    else:
        response = requests.post(url, json=query, headers=headers)

    # The HTTP status code is not inspected; callers receive whatever JSON the
    # service returned, including error payloads.
    return json.loads((response.content).decode("utf-8"))


def interpret_title(title):
    """Ask the ``interpret`` endpoint for its top interpretation of a title.

    Args:
        title: Free-text paper title to interpret.

    Returns:
        The ``output.value`` of the first rule of the first interpretation in
        the response.

    Raises:
        KeyError, IndexError: if the response carries no interpretation in the
            expected shape.
    """
    MAS_URL_PREFIX = "https://api.labs.cognitive.microsoft.com"
    # URLs always use "/"; os.path.join would insert the OS path separator
    # (a backslash on Windows) and produce an invalid address.
    url = MAS_URL_PREFIX + "/academic/v1.0/interpret"
    query = {
      "query": title,
      "count": 1,
      "offset": 0,
      "attributes": "Ti"
    }

    data = query_academic_search("get", url, query)
    expr = data["interpretations"][0]["rules"][0]["output"]["value"]

    return expr


def evaluate_expr(query):
    """Run a title-equality expression against the ``evaluate`` endpoint.

    Args:
        query: Title text; it is embedded verbatim in the expression
            ``Ti='<query>'``.

    Returns:
        The parsed JSON response, requesting up to 20 results with the
        attributes Id, Ti, Y, AA.AuId, AA.AfId, CC and ECC.
    """
    MAS_URL_PREFIX = "https://api.labs.cognitive.microsoft.com"
    # URLs always use "/"; os.path.join would insert the OS path separator
    # (a backslash on Windows) and produce an invalid address.
    url = MAS_URL_PREFIX + "/academic/v1.0/evaluate"

    # Built under its own name instead of rebinding the ``query`` parameter.
    params = {
      "expr": "Ti='{}'".format(query),
      "count": 20,
      "offset": 0,
      "attributes": "Id,Ti,Y,AA.AuId,AA.AfId,CC,ECC"
    }

    data = query_academic_search("get", url, params)

    return data
    
    
def get_title_from_MAG(title):
    """Best-effort lookup of the title string the API associates with ``title``.

    Args:
        title: Paper title to interpret.

    Returns:
        The interpreted expression with its first four characters and last
        character removed, or None if the lookup failed for any reason.
    """
    try:
        expr = interpret_title(title)
        # Presumably strips a leading ``Ti='`` and the closing quote from the
        # returned expression -- TODO confirm against the API response format.
        mag_title = expr[4:-1]

    # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
    # the way the previous bare ``except:`` did.
    except Exception:
        mag_title = None

    return mag_title


def canonical(title):
    """Normalise a title: lower-case, punctuation to spaces, whitespace collapsed.

    Every character in ``string.punctuation`` is replaced by a space, then runs
    of whitespace are squeezed to single spaces and the ends are trimmed.
    """
    punctuation_to_space = str.maketrans({mark: " " for mark in punctuation})
    words = title.lower().translate(punctuation_to_space).split()
    return " ".join(words)


In [7]:
def get_info_es_canonical_title(papers):
    """Match papers against the ``papers*`` indices by canonical title.

    Titles are canonicalised, queried in batches of 20 with ``match_phrase``
    on ``PaperTitle``, and a hit is accepted only when its ``Year`` equals the
    year of the first paper and its ``PaperTitle`` is exactly one of the
    canonical titles in the batch. Accepted hits are appended to the paper's
    "MAG papers" list and the paper's "source" is set to "ES".

    Args:
        papers: List of paper dicts with "DBLP title", "year" and "MAG papers".

    Returns:
        The paper dicts as a list. Papers whose titles canonicalise to the same
        string collapse into a single entry (the last one wins).
    """
    # Nothing to look up; indexing papers[0] below used to raise IndexError.
    if not papers:
        return []

    # The year of the first paper is used for every paper in the call.
    year = papers[0]["year"]

    papers = {canonical(paper["DBLP title"]): paper for paper in papers}

    source_fields = ['PaperId', "PaperTitle", "Year", "CitationCount", "EstimatedCitation"]

    for batch in divide_batches(set(papers.keys()), 20):

        # The batch holds dict keys, which are already canonical. Using them
        # as-is guarantees every accepted title is a valid key of ``papers``.
        canonical_titles = set(batch)

        search = Search(index="papers*", using=client)
        query = {"query": {
            "bool": {
                "should": [{"match_phrase": {"PaperTitle": ct}} for ct in canonical_titles]
            }
        }}

        search.update_from_dict(query)
        search = search.source(source_fields)

        for res in search.scan():
            if "Year" in res and res["Year"] == year and res["PaperTitle"] in canonical_titles:
                matched = papers[res["PaperTitle"]]
                matched["MAG papers"].append({field: res[field] for field in source_fields})
                matched["source"] = "ES"

    return list(papers.values())

    
def get_info_es_display_title(paper):
    """Match one paper against the ``papers*`` indices by its display title.

    Runs a ``match_phrase`` query on ``OriginalTitle`` and accepts a hit only
    when it has a ``Year`` equal to the paper's year and an ``OriginalTitle``
    identical to the DBLP title. Each accepted hit is appended to the paper's
    "MAG papers" list and the paper's "source" is set to "ES".

    Returns:
        The same ``paper`` dict, updated in place.
    """
    wanted_fields = ['PaperId', "PaperTitle", "OriginalTitle", "Year", "CitationCount", "EstimatedCitation"]
    dblp_title = paper["DBLP title"]

    phrase_query = {"query": {
        "bool": {
            "must": {"match_phrase": {"OriginalTitle": dblp_title}}
        }
    }}

    lookup = Search(index="papers*", using=client)
    lookup.update_from_dict(phrase_query)
    lookup = lookup.source(wanted_fields)

    for hit in lookup.scan():
        # Guard clauses: skip anything that is not an exact year + title match.
        if "Year" not in hit:
            continue
        if hit["Year"] != paper["year"]:
            continue
        if hit["OriginalTitle"] != dblp_title:
            continue

        paper["MAG papers"].append({name: hit[name] for name in wanted_fields})
        paper["source"] = "ES"

    return paper
    

def get_info_MAG(paper):
    """Look a paper up through the academic search API when it has no source yet.

    Papers that already carry a "source" key are returned untouched. Otherwise
    the DBLP title is interpreted, the resulting title is evaluated, and every
    returned entity is appended to "MAG papers" with its title, id, year,
    citation counts and per-author affiliation/author ids (None where an
    author entry lacks the id). "source" is set to "MAG" when at least one
    record is present afterwards.

    Returns:
        The same ``paper`` dict, updated in place.
    """
    if "source" in paper:
        return paper

    mag_title = get_title_from_MAG(paper["DBLP title"])
    if mag_title is None:
        return paper

    response = evaluate_expr(mag_title)
    if "entities" not in response:
        return paper

    # (output key, API attribute) pairs copied from each entity.
    field_map = (
        ("PaperTitle", "Ti"),
        ("PaperId", "Id"),
        ("Year", "Y"),
        ("CitationCount", "CC"),
        ("EstimatedCitation", "ECC"),
    )

    for entity in response["entities"]:
        record = {target: entity[origin] for target, origin in field_map}
        record["Affiliations"] = [author.get("AfId") for author in entity["AA"]]
        record["Authors"] = [author.get("AuId") for author in entity["AA"]]
        paper["MAG papers"].append(record)

    if paper["MAG papers"]:
        paper["source"] = "MAG"

    return paper

    
def get_paper_info(papers):
    """Resolve each paper through three lookups, stopping at the first success.

    Order: batched canonical-title search, then per-paper display-title
    search, then the academic search API. A paper that no lookup could match
    ends up with "source" set to None.

    Returns:
        The list produced by the canonical-title lookup, with every paper
        carrying a "source" key.
    """
    resolved = get_info_es_canonical_title(papers)

    for entry in resolved:
        if "source" not in entry:
            entry = get_info_es_display_title(entry)

        if "source" not in entry:
            entry = get_info_MAG(entry)

        # Mark papers that no lookup could match.
        entry.setdefault("source", None)

    return resolved


def get_paper_affiliations(papers):
    """Attach affiliation ids to MAG records that do not have them yet.

    Records lacking an "Affiliations" key get an empty list, which is then
    filled from the ``paperauthoraffiliations*`` indices (queried by PaperId in
    batches of 200). One entry is appended per returned row: the row's
    ``AffiliationId``, or None when the row has none.

    Args:
        papers: List of paper dicts, each with "DBLP title" and "MAG papers".

    Returns:
        A ``(papers, affiliations)`` tuple. ``papers`` is the input with
        duplicates by canonical DBLP title collapsed (last one wins).
        ``affiliations`` is the set of all non-None affiliation ids found,
        including ids that were already present on records.
    """
    affiliations = set()

    # PaperId -> MAG records still waiting for affiliations. A direct index
    # replaces the previous title lookup, which fell back (via a bare except)
    # to scanning every paper whenever the titles did not line up.
    pending = dict()

    for paper in papers:
        for mag_paper in paper["MAG papers"]:
            if "Affiliations" in mag_paper:
                # Already populated (get_info_MAG stores AfId values). These
                # ids must be returned too, otherwise the later name lookup
                # never sees them and maps them to None.
                affiliations.update(aff for aff in mag_paper["Affiliations"]
                                    if aff is not None)
            else:
                mag_paper["Affiliations"] = list()
                pending.setdefault(mag_paper["PaperId"], []).append(mag_paper)

    # Keep the historical de-duplication by canonical title.
    papers = list({canonical(paper["DBLP title"]): paper for paper in papers}.values())

    batches = divide_batches(set(pending.keys()), 200) if pending else []

    for batch in batches:

        search = Search(index="paperauthoraffiliations*", using=client)
        query = {"query": {
            "bool": {
                "should": [{"term": {"PaperId": pid}} for pid in batch]
            }
        }}
        search.update_from_dict(query)
        search = search.source(['PaperId', 'AuthorId', 'AffiliationId'])

        for res in search.scan():
            affiliation_id = res["AffiliationId"] if "AffiliationId" in res else None
            targets = pending.get(res["PaperId"], ())

            if affiliation_id is not None and targets:
                affiliations.add(affiliation_id)

            for mag_paper in targets:
                mag_paper["Affiliations"].append(affiliation_id)

    return papers, affiliations


def divide_batches(list_like, n):
    """Split an iterable into consecutive lists of at most ``n`` items.

    The last batch holds whatever remains; an empty input yields no batches.
    """
    items = list(list_like)
    total = len(items)

    # Integer ceiling division: number of batches needed to hold all items.
    batch_count = -(-total // n)

    return [items[k * n:(k + 1) * n] for k in range(batch_count)]
    

def link_papers_with_affiliation_names(papers, affiliationids):
    """Replace affiliation ids on every MAG record with normalised names.

    Names are fetched from the ``affiliations`` index in batches of 100 ids.
    Each entry of a record's "Affiliations" list is replaced by the matching
    ``NormalizedName``, or by None when no name was found for it.

    Args:
        papers: List of paper dicts; each "MAG papers" record must have an
            "Affiliations" list.
        affiliationids: Collection of affiliation ids to resolve.

    Returns:
        The same ``papers`` list, updated in place.

    Raises:
        Whatever the search raises, after the ids and the failing query have
        been printed.
    """
    # Skip the batching entirely when there is nothing to resolve.
    aff_id_batches = divide_batches(affiliationids, 100) if affiliationids else []

    affiliations = dict()
    source_fields = ['AffiliationId', "NormalizedName"]

    # get affiliation names
    for batch in aff_id_batches:

        search = Search(index="affiliations", using=client)
        query = {"query": {
            "bool": {
                "should": [{"match": {"AffiliationId": aid}} for aid in batch]
            }
        }}

        search.update_from_dict(query)
        search = search.source(source_fields)

        try:
            for res in search.scan():
                affiliations[res["AffiliationId"]] = res["NormalizedName"]
        except Exception:
            print(affiliationids)
            print(query)
            # Re-raise the real error. The previous code forced a crash with
            # ``1[2]``, which replaced it with an unrelated TypeError.
            raise

    for paper in papers:
        for mag_paper in paper["MAG papers"]:
            mag_paper["Affiliations"] = [affiliations.get(affiliation)
                                         for affiliation in mag_paper["Affiliations"]]

    return papers
    

def get_information_for_venue_papers(venue, venuetype, yearrange=yearrange, force=False):
    """Build the filtered, enriched paper file for each requested year of a venue.

    For every year the raw DBLP file is loaded and filtered, the surviving
    papers are matched and given affiliation names, and the result is written
    as JSON. A year with no surviving papers gets an empty JSON list.

    Args:
        venue: Venue key used in the input and output file names.
        venuetype: "journal" selects the journal filter; anything else selects
            the header-and-page-number filter that keeps missing page numbers.
        yearrange: Years to process (defaults to the module-level range).
        force: When False, years whose output file already exists are skipped.

    Returns:
        None. The venue name is printed once all years are done.
    """
    if venuetype == "journal":
        filter_f = filter_journals
    else:
        filter_f = filter_by_header_and_page_number_keep_missing

    for year in yearrange:
        source_path = dblp_raw_filename(venue, year)
        target_path = filtered_papers_filename(venue, year)

        # Skip years that were already processed unless a rebuild is forced.
        if not force and os.path.exists(target_path):
            continue

        with open(source_path, "r") as fh:
            raw_papers = json.load(fh)

        kept, _ = apply_filter_to_papers(filter_f, raw_papers, venue, year)

        if len(kept) == 0:
            with open(target_path, "w") as fh:
                json.dump([], fh)
            continue

        records = [
            {
                "DBLP title": row["title"],
                "DBLP authors": row["authors"],
                "year": row["year"],
                "MAG papers": list(),
            }
            for row in kept
        ]

        records = get_paper_info(records)
        records, paper_affiliations = get_paper_affiliations(records)

        affiliation_ids = set()
        affiliation_ids.update(paper_affiliations)

        records = link_papers_with_affiliation_names(records, affiliation_ids)

        with open(target_path, "w") as fh:
            json.dump(records, fh)

    print(venue)
    return None




In [8]:
def run_pools(task, lists, agg_f=None):
    """Run ``task`` on every element of ``lists`` in a process pool.

    The pool size is the module-level ``threads`` value at call time.

    Args:
        task: Callable taking one element of ``lists``.
        lists: Iterable of work items, one ``apply_async`` call per item.
        agg_f: Optional callable invoked with each task's result, in
            submission order. Results are discarded when it is None.

    Raises:
        Any exception raised inside a worker, re-raised when its result is
        collected.
    """
    # The context manager tears the pool down even when a worker raises.
    # Previously the pool was only close()d on the success path, so a failing
    # task left the worker processes behind.
    with Pool(processes=threads) as pool:
        handles = [pool.apply_async(task, (chunk,)) for chunk in lists]

        for handle in handles:
            # get() blocks until the task finishes and re-raises its exception.
            outcome = handle.get()
            if agg_f is not None:
                agg_f(outcome)
    
    
def popn(xs, n):
    """Pop up to ``n`` items off the end of ``xs`` and return them.

    Items come back in pop order (last element first). ``xs`` is mutated, and
    fewer than ``n`` items are returned when it runs out.
    """
    taken = []
    remaining = n
    while remaining > 0 and len(xs) > 0:
        taken.append(xs.pop())
        remaining -= 1
    return taken
            
def get_pool_lists(ls, threads):
    """Split ``ls`` into ``threads`` work lists of near-equal size.

    Items are dealt round-robin, so list lengths differ by at most one. The
    previous chunking gave each of the first ``threads - 1`` lists
    ``len(ls) // threads`` items and put the whole remainder in the last one.
    With fewer items than threads, every item landed in a single list and the
    work ran serially.

    Args:
        ls: Iterable of work items; it is not modified.
        threads: Number of lists to produce; must be at least 1.

    Returns:
        A list of exactly ``threads`` lists that together contain every item
        of ``ls`` exactly once.

    Raises:
        ValueError: if ``threads`` is smaller than 1.
    """
    if threads < 1:
        raise ValueError("threads must be a positive integer")

    items = list(ls)
    return [items[offset::threads] for offset in range(threads)]

def task(venues):
    """Pool worker: process each ``(venue, venuetype)`` pair in turn.

    Returns None.
    """
    for pair in venues:
        venue, venuetype = pair
        get_information_for_venue_papers(venue, venuetype)



In [9]:
# Wall-clock start for the elapsed-seconds report printed at the end.
start = datetime.now()

venues = list()

completed = list()

with open(venue_category_filename, "r") as fh:

    reader = csv.reader(fh, delimiter=",")

    # skip header row
    next(reader)

    # Column 0 holds the venue name, column 4 its type.
    for row in reader:
        name, venue_type = row[0], row[4]
        venues.append((name, venue_type))


# One work list per worker, then process them all in the pool.
pool_lists = get_pool_lists(venues, threads)

run_pools(task, pool_lists,)

print((datetime.now()-start).total_seconds())

trets
wine
stacs
uist
sigcomm
tist
talip
tocs
tomccap
ssdbm
software
siamcomp
wcci
tissec
rtas
tpds
talg
tomacs
siamsc
ubicomp
ssd
toit
tosn
taco
siamrev
popl
tocl
wads
soda
sea
tecs
siamco
pact
naacl
rss
lats
jea
tois
micro
uai
taccess
iticse
tosem
sigsoft-fse
sdm
pods
ispa
vr
tochi
icws
interact
srds
iros
siamam
siamnum
icnp
tdsc
icebe
osdi
kr
recomb
mobisys
tog
icc
jdiq
tweb
tacas
miccai
tos
itc
eurosys
sc
tmm
edcc
sigmod
hpcc
vldb
podc
isorc
gecco
infocom
siammax
ddecs
cse
cga
icmr
colt
siamads
icst
cal
ipsn
3dim
icdt
asiacrypt
spin
oopsla
tcc
re
kdd
tods
mobisec
icassp
tvlsi
jcdl
taas
mdm
iswc
toplas
sbac-pad
tmc
eurocrypt
sigmetrics
edbt
hpca
vee
pldi
siamma
fse
dcoss
ccs
imc
ismvl
icml
cscw
coling
icsm
si3d
cacm
ipmi
icdm
ase
nsdi
aaai
spaa
tcbb
raid
jocch
todaes
mobihoc
tvcg
icaps
sysose
saint
mass
issta
jair
ton
tkde
esorics
sigir
ecoop
uss
hotos
pervasive
focs
date
ccgrid
siamjo
ijcar
crypto
iclp
ismm
cluster
icse
sensys
bpm
ipdps
icde
sp
noms
tc
aamas
pvldb
asap
mobicom
toct

NameError: name 'canonical' is not defined

The function below adds newly filtered papers to an existing (venue, year) output file. Use it when a change to the scraped raw papers or to the filtering rules has affected that pair, so that the information for the existing papers does not have to be gathered again.

In [18]:
def get_information_for_venue_papers_add_additional_papers(venue, venuetype, year):
    """Append papers missing from an existing output file for one venue/year.

    The raw DBLP file is re-filtered. Any paper whose (title, year) pair is not
    already in the output file is matched, given affiliation names and
    appended, and the file is then rewritten. The existing entries are left
    untouched.

    Args:
        venue: Venue key used in the input and output file names.
        venuetype: "journal" selects the journal filter; anything else selects
            the header-and-page-number filter that keeps missing page numbers.
        year: Year to process.

    Returns:
        None. Prints either "nothing to add" or the output size before and
        after the update.
    """
    filter_f = filter_journals if venuetype == "journal" else filter_by_header_and_page_number_keep_missing

    in_filename = dblp_raw_filename(venue, year)
    out_filename = filtered_papers_filename(venue, year)

    with open(in_filename, "r") as fh:
        papers = json.load(fh)

    papers, _ = apply_filter_to_papers(filter_f, papers, venue, year)

    with open(out_filename, "r") as fh:
        output = json.load(fh)

    original_output_size = len(output)

    # A set gives constant-time membership tests. The previous list made the
    # comprehension below quadratic in the number of papers.
    existing_papers = {(paper["DBLP title"], paper["year"]) for paper in output}

    additional_papers = [paper for paper in papers
                         if (paper["title"], paper["year"]) not in existing_papers]

    if len(additional_papers) == 0:
        print(venue,year,"nothing to add")
        return

    affiliation_ids = set()

    additional_output = [
        {
            "DBLP title": row["title"],
            "DBLP authors": row["authors"],
            "year": row["year"],
            "MAG papers": list(),
        }
        for row in additional_papers
    ]

    additional_output = get_paper_info(additional_output)

    additional_output, paper_affiliations = get_paper_affiliations(additional_output)

    affiliation_ids.update(paper_affiliations)

    additional_output = link_papers_with_affiliation_names(additional_output, affiliation_ids)

    output.extend(additional_output)

    final_output_size = len(output)

    with open(out_filename,"w") as fh:
        json.dump(output,fh)

    print(venue,year,"from",original_output_size,"to",final_output_size)

    return None

In [25]:
# Wall-clock start for the elapsed-seconds report printed at the end of this cell.
start = datetime.now()

# Rebinds the module-level worker count (40 in the configuration cell); run_pools()
# reads this global when it creates its pool.
threads = 20

def additional_papers_task(venues):
    """Pool worker: top up the output file of each ``(venue, venuetype, year)`` triple.

    Returns None.
    """
    for triple in venues:
        venue, venuetype, year = triple
        get_information_for_venue_papers_add_additional_papers(venue, venuetype, year)

venues = list()

with open("single_page_number_exclusions.csv","r") as fh:
    reader = csv.reader(fh,delimiter=",")
    # Skip the header row.
    next(reader)
    # First column: venue key; second: year; last: venue type.
    for row in reader:
        key, year, venue_type = row[0], int(row[1]), row[-1]
        venues.append((key,venue_type,year))

# One work list per worker, then process them all in the pool.
pool_lists = get_pool_lists(venues, threads)

run_pools(additional_papers_task, pool_lists,)

print((datetime.now()-start).total_seconds())

wimob 2007 nothing to add
sigsoft-fse 2019 nothing to add
iticse 2009 nothing to add
podc 2008 nothing to add
sigcse 2015 nothing to add
mobisys 2014 nothing to add
mobisys 2017 nothing to add
mobisys 2015 nothing to add
tocs 2007 from 1 to 11
tap 2007 from 0 to 17
jeric 2007 from 0 to 6
tosn 2007 from 0 to 22
toit 2007 from 0 to 26
toms 2007 from 1 to 28
sigmetrics 2017 from 0 to 29
sigcse 2018 nothing to add
securecomm 2008 from 0 to 36
mobihoc 2011 from 0 to 25
talip 2007 from 3 to 14
tois 2013 from 11 to 22
tosem 2007 from 7 to 19
jacm 2007 from 4 to 32
iticse 2011 nothing to add
tomccap 2007 from 8 to 26
jetc 2007 from 0 to 13
tomacs 2007 from 4 to 22
ismvl 2007 from 0 to 58
tois 2007 from 0 to 25
tocl 2007 from 0 to 31
toplas 2007 from 1 to 43
sc 2008 from 0 to 65
talg 2007 from 9 to 50
tochi 2007 from 0 to 14
tog 2011 from 40 to 190
sc 2012 from 0 to 105
icpp 2007 from 0 to 77
tog 2007 from 0 to 128
sc 2007 from 0 to 58
saint 2007 from 0 to 18
tods 2007 from 0 to 29
todaes 2007 