In [1]:
import os, json, requests, sys, string, requests, csv
import http.client, urllib.request, urllib.parse, urllib.error, base64
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from graph.config import conf
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from string import punctuation
from multiprocessing import Pool

from paper_filtering import filter_by_header
from paper_filtering import filter_by_page_number_keep_missing
from paper_filtering import filter_by_page_number_remove_missing
from paper_filtering import filter_by_header_and_page_number_keep_missing
from paper_filtering import filter_by_header_and_page_number_remove_missing
from paper_filtering import filter_journals
from paper_filtering import apply_filter_to_papers

In [2]:
# Elasticsearch endpoint (host:port) used by every index lookup in this notebook.
eshost = "130.56.248.215:9200"

# Number of worker processes for the multiprocessing pool; the same value is
# also handed to the Elasticsearch client below.
threads = 40

# NOTE(review): `maxthreads` is not an obviously documented Elasticsearch()
# option - confirm it has any effect on the client.
client = Elasticsearch(eshost, timeout=200, maxthreads = threads)


# Years to include. range() is start-inclusive and stop-exclusive
# (e.g. range(1, 3) covers 1 and 2), so this spans 2007 through 2019.
yearrange = range(2007,2020)

# filepath to the file containing venue names and their categories (conf or journal)
venue_category_filename = os.path.join(os.pardir,"app","data","venue_category.csv")


def dblp_raw_filename(name, year):
    """Return the relative path of the raw DBLP paper dump for a venue and year.

    Args:
        name: venue identifier used in the file name (e.g. "icsm").
        year: publication year used in the file name.

    Returns:
        Path of the form DBLP_raw_data/<name>_<year>_raw_dblp_papers.json.
    """
    return os.path.join("DBLP_raw_data",
                        "{}_{}_raw_dblp_papers.json".format(name, year))


def filtered_papers_filename(name, year):
    """Return the relative path of the filtered/annotated paper file for a venue and year.

    Args:
        name: venue identifier used in the file name (e.g. "www").
        year: publication year used in the file name.

    Returns:
        Path of the form filtered_papers/<name>_<year>_filtered_papers.json.
    """
    return os.path.join("filtered_papers",
                        "{}_{}_filtered_papers.json".format(name, year))

In [4]:
def query_academic_search(type, url, query):
    """Send one request to the Microsoft Academic API and return the decoded JSON.

    Args:
        type: HTTP verb to use, either "get" or "post".
        url: full endpoint URL.
        query: dict of request parameters; URL-encoded into the query string
            for "get", sent as the JSON body for "post".

    Returns:
        The response body decoded as UTF-8 and parsed as JSON. The HTTP status
        code is not inspected, so error payloads are returned like any other.

    Raises:
        ValueError: if ``type`` is neither "get" nor "post".
    """
    headers = {
        # Request headers
        # The key can be supplied through the MAS_SUBSCRIPTION_KEY environment
        # variable; the literal is only a fallback so existing runs keep working.
        # NOTE(review): a key committed to a notebook should be rotated and removed.
        'Ocp-Apim-Subscription-Key': os.environ.get(
            "MAS_SUBSCRIPTION_KEY", '4698d5e7b0244e828d1dc21134238650'),    # bens
    }
    if type == "get":
        response = requests.get(url, params=urllib.parse.urlencode(query), headers=headers)
    elif type == "post":
        response = requests.post(url, json=query, headers=headers)
    else:
        # Previously this fell through and failed later on an unbound `response`.
        raise ValueError("unsupported request type: {!r}".format(type))
    return json.loads((response.content).decode("utf-8"))


def interpret_title(title):
    """Ask the Academic API "interpret" endpoint for the best reading of a title.

    Args:
        title: free-text paper title.

    Returns:
        The output value of the first rule of the first interpretation, i.e.
        data["interpretations"][0]["rules"][0]["output"]["value"].

    Raises:
        KeyError / IndexError: if the response contains no interpretation
            (for example when the API returns an error payload).
    """
    MAS_URL_PREFIX = "https://api.labs.cognitive.microsoft.com"
    # URLs always use "/"; os.path.join would insert "\" on Windows.
    url = MAS_URL_PREFIX + "/academic/v1.0/interpret"
    query = {
      "query": title,
      "count": 1,
      "offset": 0,
      "attributes": "Ti"
    }

    data = query_academic_search("get", url, query)
    expr = data["interpretations"][0]["rules"][0]["output"]["value"]

    return expr


def evaluate_expr(query):
    """Run a title-equality expression against the Academic API "evaluate" endpoint.

    Args:
        query: title string; it is embedded verbatim as Ti='<query>'.
            NOTE(review): a title containing a single quote would produce a
            malformed expression - confirm whether callers can pass one.

    Returns:
        The decoded JSON response, requesting up to 20 results with the
        attributes Id, Ti, Y, AA.AuId, AA.AfId, CC and ECC.
    """
    MAS_URL_PREFIX = "https://api.labs.cognitive.microsoft.com"
    # URLs always use "/"; os.path.join would insert "\" on Windows.
    url = MAS_URL_PREFIX + "/academic/v1.0/evaluate"
    # Kept separate from the `query` argument so the input is not shadowed.
    params = {
      "expr": "Ti='{}'".format(query),
      "count": 20,
      "offset": 0,
      "attributes": "Id,Ti,Y,AA.AuId,AA.AfId,CC,ECC"
    }

    return query_academic_search("get", url, params)
    
    
def get_title_from_MAG(title):
    """Return the title as interpreted by the Academic API, or None on failure.

    The interpreted expression is expected to look like Ti='<title>'; the
    first four characters and the last one are stripped to recover <title>.

    Args:
        title: free-text paper title.

    Returns:
        The interpreted title, or None if the lookup or parsing raised.
    """
    try:
        expr = interpret_title(title)
        mag_title = expr[4:-1]

    except Exception:
        # Best-effort lookup: any API/parsing error means "no title found".
        # Only Exception is caught so Ctrl-C and SystemExit still propagate.
        mag_title = None

    return mag_title


def canonical(title):
    """Normalise a title: lower-case, punctuation to spaces, whitespace collapsed.

    Every character listed in string.punctuation is replaced by a single
    space, after which runs of whitespace are squeezed to one space and
    leading/trailing whitespace is removed.
    """
    lowered = title.lower()
    spaced = "".join(" " if ch in punctuation else ch for ch in lowered)
    return " ".join(spaced.split())


In [5]:
# def get_info_es_canonical_title(paper):

#     canonical_title = canonical(paper["DBLP title"])
    
#     search = Search(index = "papers*", using = client)
#     query = {"query":{
#         "bool":{
#             "must": {"match_phrase": {"PaperTitle": canonical_title}}
#         }
#     }}

#     search.update_from_dict(query)

#     source_fields = ['PaperId',"PaperTitle", "Year", "CitationCount", "EstimatedCitation"]

#     search = search.source(source_fields)

#     for res in search.scan():
#         if "Year" in res and res["Year"]==paper["year"] and res["PaperTitle"] == canonical_title:
#             paper["MAG papers"].append({field: res[field] for field in source_fields})
            
#             paper["source"] = "ES"

#     return paper

    
# def get_info_es_display_title(paper):
#     search = Search(index = "papers*", using = client)
#     query = {"query":{
#         "bool":{
#             "must": {"match_phrase": {"OriginalTitle": paper["DBLP title"]}}
#         }
#     }}

#     search.update_from_dict(query)

#     source_fields = ['PaperId',"PaperTitle", "OriginalTitle", "Year", "CitationCount", "EstimatedCitation"]

#     search = search.source(source_fields)

#     for res in search.scan():
#         if "Year" in res and res["Year"]==paper["year"] and res["OriginalTitle"] == paper["DBLP title"]:
#             paper["MAG papers"].append({field: res[field] for field in source_fields})
            
#             paper["source"] = "ES"

#     return paper
    

# def get_info_MAG(paper):
    
#     if "source" in paper:
#         return paper
    
#     title = get_title_from_MAG(paper["DBLP title"])
    
#     if title == None:
#         return paper

#     paper_data = evaluate_expr(title)
    
#     key_pairs = [
#         ("PaperTitle","Ti"),
#         ("PaperId","Id"),
#         ("Year","Y"),
#         ("CitationCount","CC"),
#         ("EstimatedCitation", "ECC")
#     ]
    
#     if "entities" not in paper_data:
#         return paper
    
#     for entity in paper_data["entities"]:
#         info = {tokey: entity[fromkey] for tokey, fromkey in key_pairs}
#         info["Affiliations"] = [None if "AfId" not in author else author["AfId"] for author in entity["AA"]]
#         info["Authors"] = [None if "AuId" not in author else author["AuId"] for author in entity["AA"]]
#         paper["MAG papers"].append(info)
    
#     if len(paper["MAG papers"]) > 0:
#         paper["source"] = "MAG"
#         return paper
    
#     return paper

    
# def get_paper_info(paper):    
    
#     paper = get_info_es_canonical_title(paper)
    
#     if "source" not in paper:
#         paper = get_info_es_display_title(paper)
    
#     if "source" not in paper:
#         paper = get_info_MAG(paper)

#     if "source" not in paper:
#         paper["source"] = None

#     return paper


# def get_paper_affiliations(paper):
    
#     affiliations = set()
    
#     for mag_paper in paper["MAG papers"]:
#         if "Affiliations" in mag_paper:
#             affiliations.update(mag_paper["Affiliations"])
#             continue
        
#         mag_paper["Authors"] = list()
#         mag_paper["Affiliations"] = list()
        
#         search = Search(index = "paperauthoraffiliations*", using = client)
#         query = { "query": { "term": {"PaperId": mag_paper["PaperId"] }  }  }
#         search.update_from_dict(query)
#         search = search.source(['PaperId','AuthorId','AffiliationId'])

#         for res in search.scan():
#             if "AuthorId" in res:
#                 mag_paper["Authors"].append(res["AuthorId"])
#             else:
#                 mag_paper["Authors"].append(None)

#             if "AffiliationId" in res:
#                 mag_paper["Affiliations"].append(res["AffiliationId"])
                
#             else:
#                 mag_paper["Affiliations"].append(None)
        
#         affiliations.update(mag_paper["Affiliations"])
    
#     return paper, affiliations


# def divide_batches(list_like,n):
    
#     list_like = list(list_like)
#     size = len(list_like)
    
#     return [list_like[0+(n*x):min(n*(x+1),size)] for x in range(int(np.ceil(size/n)))]
    

# def link_papers_with_affiliation_names(papers,affiliationids):
    
#     affiliationids.remove(None)
    
#     aff_id_batches = divide_batches(affiliationids,100)
    
#     affiliations = dict()
    
#     # get affiliation names
#     for batch in aff_id_batches:
    
#         search = Search(index = "affiliations", using = client)
#         query = {"query":{
#             "bool":{
#                 "should": [{"match": {"AffiliationId": aid}} for aid in batch]
#             }
#         }}

#         search.update_from_dict(query)

#         source_fields = ['AffiliationId',"NormalizedName"]

#         search = search.source(source_fields)

#         try:
#             for res in search.scan():
#                 affiliations[res["AffiliationId"]] = res["NormalizedName"]
#         except:
#             print(query)
#             print(1[2])
        
#     for paper in papers:
#         for mag_paper in paper["MAG papers"]:
#             mag_paper["Affiliations"] = [None if affiliation not in affiliations 
#                                          else affiliations[affiliation] 
#                                          for affiliation in mag_paper["Affiliations"]]

    
#     return papers
    

# def get_information_for_venue_papers(venue, venuetype):
    
#     filter_f = filter_journals if venuetype == "journal" else filter_by_header_and_page_number_keep_missing
    
    
#     for year in [2010]:#yearrange:

#         in_filename = dblp_raw_filename(venue,year)
#         out_filename = filtered_papers_filename(venue,year)

#         with open(in_filename, "r") as fh:
#             papers = json.load(fh)

#         papers, _ = apply_filter_to_papers(filter_f, papers, venue, year)

#         affiliation_ids = set()

#         output = []

#         for row in papers:

#             paper = dict()

#             paper["DBLP title"] = row["title"]
#             paper["DBLP authors"] = row["authors"]
#             paper["year"] = row["year"]
#             paper["MAG papers"] = list()

#             paper = get_paper_info(paper)

#             paper, paper_affiliations = get_paper_affiliations(paper)
            
#             affiliation_ids.update(paper_affiliations)
            

#             output.append(paper)

#         output = link_papers_with_affiliation_names(output, affiliation_ids)
        
        
#         with open(out_filename,"w") as fh:
#             json.dump(output,fh)
    
#     return output

In [6]:
# start = datetime.now()


# %lprun -f get_information_for_venue_papers papers = get_information_for_venue_papers("icsm","conference")
# sources = dict.fromkeys(set([paper["source"] for paper in papers]),0)
# for paper in papers:
#     sources[paper["source"]] += 1

# print(sources, (datetime.now()-start).total_seconds())

In [7]:
#  main("icsm",2010, "conference")

In [8]:
# with open(venue_category_filename, "r") as fh:
    
#     reader = csv.reader(fh, delimiter=",")
    
#     # skip header row
#     next(reader)
    
#     for row in reader:
        
#         venue_type = row[4]   
#         name = row[0]
    
#         # 2007-2019
#         for year in range(2007,2020):
        
        
#             if name=="icra" and year==2014:
#                 force_conference = True

#             papers = main(name,year,venue_type)
#             print(name,year,venue_type, len(papers))

In [9]:
# with open(dblp_raw_filename("icsm",2008), "r") as fh:
#     papers = json.load(fh)

In [10]:
# for paper in papers["noheader"]:
#     print(paper["title"])
# len(papers["noheader"])

In [11]:
def get_info_es_canonical_title(papers):
    """Attach MAG records to papers by exact canonical-title match in Elasticsearch.

    Titles are normalised with canonical() and looked up in the "papers*"
    indices in batches of 20 using match_phrase on PaperTitle. A hit is
    accepted for a paper when its PaperTitle equals the paper's canonical
    title and its Year equals the paper's "year". Accepted hits are appended
    to paper["MAG papers"] and paper["source"] is set to "ES".

    Args:
        papers: list of dicts with "DBLP title", "year" and "MAG papers" keys.
            The dicts are updated in place.

    Returns:
        A list with the same paper dicts in their original order. Papers that
        share a canonical title are all kept (each gets its own copy of the
        matching records); an empty input yields an empty list.
    """
    if not papers:
        return []

    # canonical title -> every paper carrying it. A plain title -> paper dict
    # would silently drop all but the last paper of a duplicated title.
    papers_by_title = dict()
    for paper in papers:
        papers_by_title.setdefault(canonical(paper["DBLP title"]), []).append(paper)

    source_fields = ['PaperId',"PaperTitle", "Year", "CitationCount", "EstimatedCitation"]

    for batch in divide_batches(papers_by_title.keys(), 20):

        # The batch already holds canonical titles; only a set is needed for lookups.
        canonical_titles = set(batch)

        search = Search(index = "papers*", using = client)
        query = {"query":{
            "bool":{
                "should": [{"match_phrase": {"PaperTitle": ct}} for ct in canonical_titles]
            }
        }}

        search.update_from_dict(query)
        search = search.source(source_fields)

        for res in search.scan():
            # match_phrase also returns longer titles containing the phrase,
            # so insist on an exact title match.
            if "Year" not in res or res["PaperTitle"] not in canonical_titles:
                continue

            for paper in papers_by_title[res["PaperTitle"]]:
                if res["Year"] == paper["year"]:
                    paper["MAG papers"].append({field: res[field] for field in source_fields})
                    paper["source"] = "ES"

    return list(papers)

    
def get_info_es_display_title(paper):
    """Look a single paper up in Elasticsearch by its unmodified DBLP title.

    Runs a match_phrase query on OriginalTitle against the "papers*" indices
    and keeps only hits that have a Year equal to paper["year"] and an
    OriginalTitle identical to the DBLP title. Each kept hit is appended to
    paper["MAG papers"] and paper["source"] is set to "ES".

    Returns the same (mutated) paper dict.
    """
    dblp_title = paper["DBLP title"]
    wanted = ['PaperId',"PaperTitle", "OriginalTitle", "Year", "CitationCount", "EstimatedCitation"]

    es_search = Search(index = "papers*", using = client)
    es_search.update_from_dict({
        "query": {
            "bool": {
                "must": {"match_phrase": {"OriginalTitle": dblp_title}}
            }
        }
    })
    es_search = es_search.source(wanted)

    for hit in es_search.scan():
        if "Year" not in hit:
            continue
        if hit["Year"] != paper["year"] or hit["OriginalTitle"] != dblp_title:
            continue

        paper["MAG papers"].append({name: hit[name] for name in wanted})
        paper["source"] = "ES"

    return paper
    

def get_info_MAG(paper):
    """Fallback lookup of a paper through the Academic API.

    Does nothing when the paper already has a "source". Otherwise the DBLP
    title is interpreted via get_title_from_MAG and evaluated via
    evaluate_expr; every returned entity is converted to a record (title, id,
    year, citation counts, plus per-author affiliation and author ids, None
    where an id is absent) and appended to paper["MAG papers"]. If the paper
    ends up with at least one record its "source" becomes "MAG".

    Returns the same (possibly mutated) paper dict.
    """
    if "source" in paper:
        return paper

    mag_title = get_title_from_MAG(paper["DBLP title"])
    if mag_title is None:
        return paper

    response = evaluate_expr(mag_title)
    if "entities" not in response:
        return paper

    # output field name -> attribute code used in the API response
    field_codes = {
        "PaperTitle": "Ti",
        "PaperId": "Id",
        "Year": "Y",
        "CitationCount": "CC",
        "EstimatedCitation": "ECC",
    }

    for entity in response["entities"]:
        record = dict()
        for out_key, code in field_codes.items():
            record[out_key] = entity[code]

        authors = entity["AA"]
        record["Affiliations"] = [a["AfId"] if "AfId" in a else None for a in authors]
        record["Authors"] = [a["AuId"] if "AuId" in a else None for a in authors]
        paper["MAG papers"].append(record)

    if paper["MAG papers"]:
        paper["source"] = "MAG"

    return paper

    
def get_paper_info(papers):
    """Resolve MAG information for a list of papers using progressively looser lookups.

    All papers first go through the batched canonical-title search. Any paper
    still lacking a "source" is then tried against the display-title search
    and, failing that, the Academic API. Papers that remain unmatched get
    "source" set to None.

    Returns the list produced by the canonical-title search.
    """
    resolved = get_info_es_canonical_title(papers)

    for entry in resolved:
        # Fallbacks in order of preference; stop as soon as one sets "source".
        for lookup in (get_info_es_display_title, get_info_MAG):
            if "source" in entry:
                break
            entry = lookup(entry)

        if "source" not in entry:
            entry["source"] = None

    return resolved


def get_paper_affiliations(papers):
    """Fill in affiliation ids for every MAG record and collect the ids seen.

    MAG records that already carry an "Affiliations" list (those built from
    the Academic API) are left as they are, but their non-None ids are added
    to the returned set so their names can be resolved later. All other
    records get an empty "Affiliations" list which is then filled from the
    "paperauthoraffiliations*" indices, queried in batches of 200 paper ids:
    one entry per returned row, None when the row has no AffiliationId.
    Author ids are not collected here.

    Args:
        papers: list of paper dicts with a "MAG papers" list; the MAG records
            are updated in place.

    Returns:
        (papers, affiliations): a list with the same paper dicts in their
        original order (papers with identical titles are all kept), and the
        set of non-None affiliation ids found on any record.
    """
    affiliations = set()

    # PaperId -> MAG records still waiting for affiliation data. Indexing the
    # records directly avoids matching on titles, which differ between DBLP
    # and MAG for papers found through the display-title or API lookups.
    pending = dict()

    for paper in papers:
        for mag_paper in paper["MAG papers"]:
            if "Affiliations" in mag_paper:
                affiliations.update(aid for aid in mag_paper["Affiliations"] if aid is not None)
                continue

            mag_paper["Affiliations"] = list()
            pending.setdefault(mag_paper["PaperId"], []).append(mag_paper)

    for batch in divide_batches(pending.keys(), 200):

        search = Search(index = "paperauthoraffiliations*", using = client)
        query = { "query": {
            "bool": {
                "should": [{"term": {"PaperId": pid}} for pid in batch]
            }
        }  }
        search.update_from_dict(query)
        search = search.source(['PaperId','AuthorId','AffiliationId'])

        for res in search.scan():
            affiliation_id = res["AffiliationId"] if "AffiliationId" in res else None

            if affiliation_id is not None:
                affiliations.add(affiliation_id)

            for mag_paper in pending.get(res["PaperId"], ()):
                mag_paper["Affiliations"].append(affiliation_id)

    return list(papers), affiliations


def divide_batches(list_like,n):
    """Split an iterable into consecutive lists of at most n items each.

    The input is materialised with list() first, so sets and other iterables
    are accepted. The last batch holds the remainder; an empty input gives
    an empty list of batches.
    """
    items = list(list_like)
    total = len(items)
    n_batches = int(np.ceil(total / n))

    batches = []
    for k in range(n_batches):
        lo = n * k
        hi = min(n * (k + 1), total)
        batches.append(items[lo:hi])

    return batches
    

def link_papers_with_affiliation_names(papers,affiliationids):
    """Replace affiliation ids on every MAG record with normalized affiliation names.

    The ids are looked up in the "affiliations" index in batches of 100 and
    each MAG record's "Affiliations" list is rewritten in place: an id is
    replaced by its NormalizedName, and anything without a match (including
    None) becomes None.

    Args:
        papers: list of paper dicts with a "MAG papers" list.
        affiliationids: iterable of affiliation ids to resolve; None entries
            are ignored.

    Returns:
        The same ``papers`` list.

    Raises:
        Whatever the Elasticsearch scan raises; the failing query is printed
        first to aid debugging.
    """
    # None is a placeholder for "no affiliation", not something to query for.
    known_ids = [aid for aid in affiliationids if aid is not None]

    # affiliation id -> normalized name
    names_by_id = dict()

    # get affiliation names
    for batch in divide_batches(known_ids,100):

        search = Search(index = "affiliations", using = client)
        query = {"query":{
            "bool":{
                "should": [{"match": {"AffiliationId": aid}} for aid in batch]
            }
        }}

        search.update_from_dict(query)
        search = search.source(['AffiliationId',"NormalizedName"])

        try:
            for res in search.scan():
                names_by_id[res["AffiliationId"]] = res["NormalizedName"]
        except Exception:
            # Show the request that failed, then let the real error surface
            # instead of masking it with an unrelated one.
            print(query)
            raise

    for paper in papers:
        for mag_paper in paper["MAG papers"]:
            mag_paper["Affiliations"] = [names_by_id.get(affiliation)
                                         for affiliation in mag_paper["Affiliations"]]

    return papers
    

def _dump_json_atomically(data, filename):
    """Write data as JSON to filename, creating its directory and replacing the file in one step."""
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write to a sibling file first: an interrupted run must not leave a
    # truncated output behind, because existing outputs are skipped on re-runs.
    tmp_filename = filename + ".tmp"
    with open(tmp_filename,"w") as fh:
        json.dump(data,fh)
    os.replace(tmp_filename, filename)


def get_information_for_venue_papers(venue, venuetype):
    """Build the filtered, MAG-annotated paper file for every year of one venue.

    For each year in ``yearrange`` whose output file does not exist yet, the
    raw DBLP file is loaded, filtered (journal filter for "journal" venues,
    header-and-page-number filter otherwise), matched against MAG, annotated
    with affiliation names and written as JSON. Years that filter down to no
    papers get an empty JSON list. The venue name is printed when all years
    are done.

    Args:
        venue: venue identifier used in the input/output file names.
        venuetype: "journal" selects the journal filter; any other value
            selects the conference filter.

    Returns:
        None.

    Raises:
        OSError: if a raw input file for a pending year cannot be opened.
    """
    filter_f = filter_journals if venuetype == "journal" else filter_by_header_and_page_number_keep_missing

    for year in yearrange:

        in_filename = dblp_raw_filename(venue,year)
        out_filename = filtered_papers_filename(venue,year)

        # check whether the file already exists
        if os.path.exists(out_filename):
            continue

        with open(in_filename, "r") as fh:
            papers = json.load(fh)

        papers, _ = apply_filter_to_papers(filter_f, papers)

        if len(papers) == 0:
            _dump_json_atomically([], out_filename)
            continue

        # Start every paper with its DBLP data and an empty list of MAG matches.
        output = [
            {
                "DBLP title": row["title"],
                "DBLP authors": row["authors"],
                "year": row["year"],
                "MAG papers": list(),
            }
            for row in papers
        ]

        output = get_paper_info(output)

        output, affiliation_ids = get_paper_affiliations(output)

        output = link_papers_with_affiliation_names(output, affiliation_ids)

        _dump_json_atomically(output, out_filename)

    print(venue)
    return None

In [12]:
# start = datetime.now()
# %lprun -f get_information_for_venue_papers papers = get_information_for_venue_papers("icws","conference")
# sources = dict.fromkeys(set([paper["source"] for paper in papers]),0)
# for paper in papers:
#     sources[paper["source"]] += 1

# print(sources, (datetime.now()-start).total_seconds())

In [13]:
def run_pools(task, lists, agg_f=None):
    """Run ``task`` once per element of ``lists`` in a process pool.

    A pool with ``threads`` worker processes is created, one asynchronous job
    is submitted per element, and the results are collected in submission
    order. Each result is passed to ``agg_f`` when one is given.

    Args:
        task: callable taking a single argument (one element of ``lists``).
        lists: iterable of arguments, one job each.
        agg_f: optional callable invoked with every job result.

    Returns:
        None.

    Raises:
        Any exception raised inside a job is re-raised here by ``get()``.
        The pool is still closed and joined first, so the remaining jobs run
        to completion and no worker processes are left behind.
    """
    pool = Pool(processes = threads)
    try:
        pending = [pool.apply_async(task,(x,)) for x in lists]

        for rs in pending:
            outcome = rs.get()
            if agg_f is not None:
                agg_f(outcome)
    finally:
        # close() stops new submissions; join() waits for the workers to exit.
        pool.close()
        pool.join()
    
    
def popn(xs,n):
    """Remove up to n items from the end of xs and return them in pop order.

    xs is modified in place; fewer than n items are returned when xs runs out.
    """
    count = min(n, len(xs))
    return [xs.pop() for _ in range(count)]
            
def get_pool_lists(ls, threads):
    """Split ``ls`` into ``threads`` shards of near-equal size.

    Items are dealt out round-robin, so shard sizes differ by at most one.
    (Chunking by ``len(ls) // threads`` put the whole remainder - and every
    item when there were fewer items than shards - into the final shard.)

    Args:
        ls: any iterable of work items; it is not modified.
        threads: number of shards to produce, at least 1.

    Returns:
        A list of exactly ``threads`` lists that together contain every item
        of ``ls`` once; some shards are empty when there are fewer items
        than shards.

    Raises:
        ValueError: if ``threads`` is smaller than 1.
    """
    if threads < 1:
        raise ValueError("threads must be at least 1, got {!r}".format(threads))

    items = list(ls)
    return [items[offset::threads] for offset in range(threads)]

def task(venues):
    """Pool worker: process every (venue, venue type) pair in the given shard, in order."""
    for pair in venues:
        name, kind = pair
        get_information_for_venue_papers(name, kind)



In [14]:
# Process every venue listed in the category CSV across the worker pool and
# report the elapsed wall-clock time in seconds.
start = datetime.now()

completed = list()

with open(venue_category_filename, "r") as fh:
    rows = csv.reader(fh, delimiter=",")

    # skip header row
    next(rows)

    # column 0 holds the venue name, column 4 its type
    venues = [(record[0], record[4]) for record in rows]

pool_lists = get_pool_lists(venues, threads)

run_pools(task, pool_lists)

print((datetime.now()-start).total_seconds())

vldb
www
tocs
tomacs
titb
tweb
wsdm
toit
tc
vee
tosn
tocl
wowmom
tois
tap
tochi
tosem
tvlsi
talip
tmm
tos
talg
toplas
wimob
tog
tvcg
tmc
taco
ton
wcci
tods
taccess
tslp
sigmetrics
siamnum
siamam
spaa
taas
sdm
toms
wads
todaes
tacas
siammax
siamads
rtas
tomccap
si3d
sp
vr
sosp
sc
software
sigir
siamma
sbac-pad
pami
toct
nfm
rss
ispa
itc
jocch
sysose
mmas
ppopp
jcdl
mascots
jair
icis
sensys
isca
sigecom
lics
icde
isorc
pact
recomb
saint
popl
ics
iiswc
securecomm
jacm
hpca
icra
icgse
ieeemm
lctes
icdcs
pods
icaps
iswc
gecco
stoc
osdi
icpr
icfp
sec
ismvl
re
iui
sacmat
lcn
iccv
podc
hotos
issta
fse
expert
sea
ismm
itpro
icer
oopsla
dt
dac
focs
icpp
emsoft
siamjo
eurosys
itng
ismb
icebe
icnp
raid
lats
pldi
icccn
emnlp
hoti
issre
conext
computer
pervasive
ismar
iticse
3dim
kr
pvldb
civr
icdt
cvpr
nsdi
fmcad
iccad
kdd
compsac
icdm
noms
edcc
percom
csur
fccm
cikm
edbt
fast
eurocrypt
hipeac
chi
esop
ecoop
fase
esorics
islped
cgo
hipc
grid
gis
uss
mm
disc
digitel
soda
esa
isscc
icwsm
cgf
tse
ical

NameError: name 'output' is not defined

In [18]:
# Spot-check one generated file: reload the annotated papers written for
# venue "www", year 2007.
with open(filtered_papers_filename("www",2007),"r") as fh:
    papers = json.load(fh)

In [19]:
# Display the reloaded records as the cell output for visual inspection.
papers

[{'DBLP title': 'Homepage live: automatic block tracing for web personalization.',
  'DBLP authors': ['Jie Han',
   'Dingyi Han',
   'Chenxi Lin',
   'Hua-Jun Zeng',
   'Zheng Chen',
   'Yong Yu'],
  'year': 2007,
  'MAG papers': [{'PaperId': 2125047283,
    'PaperTitle': 'homepage live automatic block tracing for web personalization',
    'Year': 2007,
    'CitationCount': 6,
    'EstimatedCitation': 6,
    'Affiliations': ['shanghai jiao tong university',
     'shanghai jiao tong university',
     'microsoft',
     'microsoft',
     'shanghai jiao tong university',
     'microsoft']}],
  'source': 'ES'},
 {'DBLP title': 'Open user profiles for adaptive news systems: help or harm?',
  'DBLP authors': ['Jae-wook Ahn',
   'Peter Brusilovsky',
   'Jonathan Grady',
   'Daqing He',
   'Sue Yeon Syn'],
  'year': 2007,
  'MAG papers': [{'PaperId': 2116578632,
    'PaperTitle': 'open user profiles for adaptive news systems help or harm',
    'Year': 2007,
    'CitationCount': 150,
    'Estima