In [24]:
import os, json, requests, sys, string, requests, csv
import http.client, urllib.request, urllib.parse, urllib.error, base64
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from string import punctuation
from multiprocessing import Pool

from paper_filtering import filter_by_header
from paper_filtering import filter_by_page_number_keep_missing
from paper_filtering import filter_by_page_number_remove_missing
from paper_filtering import filter_by_header_and_page_number_keep_missing
from paper_filtering import filter_by_header_and_page_number_remove_missing
from paper_filtering import filter_journals
from paper_filtering import apply_filter_to_papers

In [25]:
threads = 8

# years to include note: range(inclusive,exclusive, i.e. range(1,3) = [1,2]
yearrange = range(2007,2023)

# filepath to the file containing venue names and their categories (conf or journal)
venue_category_filename = os.path.join(os.pardir,"app","data","venue_list.csv")


dblp_raw_filename = lambda name,year: os.path.join( "DBLP_raw_data" , 
                                              "{}_{}_raw_dblp_papers.json".format(name,year) )

filtered_papers_filename = lambda name,year: os.path.join( "filtered_papers" , 
                                              "{}_{}_filtered_papers.json".format(name,year) )

In [32]:
OPENALEX_WORK_API = "https://api.openalex.org/works/"
def get_openalex_paper_info(paper):
    query_url = f'{OPENALEX_WORK_API}{paper["doi"]}?select=id,doi,display_name,publication_year,authorships,cited_by_count'
#     print(query_url)
    response = requests.get(query_url)

    if response.status_code == 200:
        data = response.json()
        institutions = {}
        for authorship in data["authorships"]:
            num_insts = len(authorship["institutions"])
            for inst in authorship["institutions"]:
                inst_name = inst["display_name"] if "display_name" in inst else ""
                if inst_name not in institutions:
                    institutions[inst_name] = 0
                institutions[inst_name] += 1/num_insts
                
        return [{
            "PaperId": data["id"],
            "PaperTitle": data["display_name"],
            "Year": data["publication_year"],
            "CitationCount": data["cited_by_count"],
            "EstimatedCitation": data["cited_by_count"],
            "Affiliations": institutions,
            "Authors": [a["author"]["display_name"] for a in data["authorships"]]
        }]
    else:
        return []

        
def get_information_for_venue_papers(venue, venuetype, yearrange=yearrange, force=False):
    
    filter_f = filter_journals if venuetype == "journal" else filter_by_header_and_page_number_keep_missing
    
    for year in yearrange:

        in_filename = dblp_raw_filename(venue,year)
        out_filename = filtered_papers_filename(venue,year)
        
        if not os.path.exists(in_filename):
            print(in_filename, "does not exist!")
            continue
        
        # check whether the file already exists
        if os.path.exists(out_filename) and not force:
            continue

        with open(in_filename, "r") as fh:
            papers = json.load(fh)

        papers, _ = apply_filter_to_papers(filter_f, papers, venue, year)

        if len(papers) == 0:
            with open(out_filename,"w") as fh:
                json.dump([],fh)
            continue
        

        output = []
        for row in papers:

            paper = dict()

            paper["DBLP title"] = row["title"]
            paper["DBLP authors"] = row["authors"]
            paper["year"] = row["year"]
            paper["doi"] = row["doi"]
            paper["OA papers"] = get_openalex_paper_info(row)

#             print(paper)
            output.append(paper)

        with open(out_filename,"w") as fh:
            json.dump(output,fh)
            
    
    print(venue)
    return None




In [33]:
# for test
# get_information_for_venue_papers("3dim", "journal")

In [34]:
def run_pools(task, lists, agg_f=None):
    pool = Pool(processes = threads)
    result = []
    for x in lists:
        result.append(pool.apply_async(task,(x,)))
    
    if agg_f is None:
        def agg_f(x):
            pass
    for rs in result:
        agg_f(rs.get())
    pool.close()
    
    
def popn(xs,n):
    popped = list()
    for i in range(n):
        if len(xs) == 0:
            break
        popped.append(xs.pop())
    return popped
            
def get_pool_lists(ls, threads):
    ls_ = ls.copy()
    if type(ls_) != list:
        ls_ = list(ls_)
    pool_lists = list()
    list_size = len(ls) // threads
    for i in range(threads-1):
        pool_lists.append(popn(ls_,list_size))
    pool_lists.append(ls_)
    return pool_lists

def task(venues):
    
    for venue, venuetype in venues:
        get_information_for_venue_papers(venue, venuetype)
    
    return None

In [29]:
start = datetime.now()

venues = list()

completed = list()

with open(venue_category_filename, "r") as fh:
    
    reader = csv.reader(fh, delimiter=",")
    
    # skip header row
    next(reader)
    
    for row in reader:
        
        venue_type = row[4]   
        name = row[0]
    
        venues.append((name, venue_type))

        
pool_lists = get_pool_lists(venues, threads)

run_pools(task, pool_lists,)

print((datetime.now()-start).total_seconds())

Process SpawnPoolWorker-150:
Traceback (most recent call last):
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'task' on <module '__main__' (built-in)>
Process SpawnPoolWorker-151:
Traceback (most recent call last):
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetri

KeyboardInterrupt: 

/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
Process SpawnPoolWorker-164:
Traceback (most recent call last):
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/Users/minjeong.shin/opt/miniconda3/envs/csmetrics/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt


In [36]:
# for single thread running

venues = list()
with open(venue_category_filename, "r") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row
    next(reader)
    venues = [(row[0], row[4]) for row in reader]
        
for venue, venue_type in venues:
    start = datetime.now()
    get_information_for_venue_papers(venue, venue_type)
    print(venue, (datetime.now()-start).total_seconds())

3dim
3dim 0.015481
aaai
aaai 0.010549
aamas
aamas 0.007908
acl
acl 0.005104
ai
ai 0.003239
aiccsa
aiccsa 0.003664
aim
aim 0.002465
aina
aina 0.003161
amai
amai 0.001692
ancs
ancs 0.001287
annals
annals 0.001295
apscc
apscc 0.001355
arith
arith 0.001719
asap
asap 0.001494
ase
ase 0.001638
asiacrypt
asiacrypt 0.001659
asplos
asplos 0.001302
ats
ats 0.00145
avss
avss 0.001476
bibe
bibe 0.001804
bioinformatics
bioinformatics 0.002504
bpm
bpm 0.002483
cacm
cacm 0.002415
cal
cal 0.001204
cases
cases 0.00114
cav
cav 0.001454
cc
cc 0.001287
ccc
ccc 0.001051
cccg
cccg 0.001537
ccgrid
ccgrid 0.001827
ccs
ccs 0.001877
cga
cga 0.001646
cgf
cgf 0.002576
cgo
cgo 0.001043
chi
chi 0.003414
cikm
cikm 0.003076
civr
civr 0.000999
cluster
cluster 0.001525
coling
coling 0.001998
colt
colt 0.001589
compgeom
compgeom 0.001554
compsac
compsac 0.002849
computer
computer 0.00199
conext
conext 0.001498
cp
cp 0.002467
crypto
crypto 0.001252
cscw
cscw 0.001257
cse
cse 0.001545
csfw
csfw 0.001143
csur
csur 0.001526

The function below can be used to add additional filtered papers to a venue,year pair that has been affected by a change in the raw papers scraped or the filtering system without having to regather the information for the exisitng papers.

In [37]:
def get_information_for_venue_papers_add_additional_papers(venue, venuetype, year):
    
    filter_f = filter_journals if venuetype == "journal" else filter_by_header_and_page_number_keep_missing

    in_filename = dblp_raw_filename(venue,year)
    out_filename = filtered_papers_filename(venue,year)
    
    
    with open(in_filename, "r") as fh:
        papers = json.load(fh)


    papers, _ = apply_filter_to_papers(filter_f, papers, venue, year)
    
    
    with open(out_filename, "r") as fh:
        output = json.load(fh)

    original_output_size = len(output)
    
    existing_papers = [(paper["DBLP title"],paper["year"]) for paper in output]
    
    additional_papers = [paper for paper in papers if (paper["title"],paper["year"]) not in existing_papers]
    
    if len(additional_papers) == 0:
        print(venue,year,"nothing to add")
        return

    affiliation_ids = set()
    
    additional_output = list()
    
    for row in additional_papers:

        paper = dict()

        paper["DBLP title"] = row["title"]
        paper["DBLP authors"] = row["authors"]
        paper["year"] = row["year"]
        paper["doi"] = row["doi"]
        paper["OA papers"] = get_openalex_paper_info(row)
        
        additional_output.append(paper)

    output.extend(additional_output)
    
    final_output_size = len(output)

    with open(out_filename,"w") as fh:
        json.dump(output,fh)
        
    print(venue,year,"from",original_output_size,"to",final_output_size)

    return None

In [39]:
start = datetime.now()

threads = 20

def additional_papers_task(venues):
    for venue, venuetype, year in venues:
        print(venue)
        get_information_for_venue_papers_add_additional_papers(venue, venuetype, year)
    return None


venues = list()

with open("single_page_number_exclusions.csv","r") as fh:
    reader = csv.reader(fh,delimiter=",")
    next(reader)
    for row in reader:
        key = row[0]
        year = int(row[1])
        venue_type = row[-1]
        venues.append((key,venue_type,year))
        
# pool_lists = get_pool_lists(venues, threads)

# run_pools(additional_papers_task, pool_lists,)
additional_papers_task(venues)

print((datetime.now()-start).total_seconds())

aamas
aamas 2007 nothing to add
ancs
ancs 2010 nothing to add
cgo
cgo 2014 nothing to add
chi
chi 2018 nothing to add
chi
chi 2019 nothing to add
compgeom
compgeom 2014 nothing to add
conext
conext 2007 nothing to add
conext
conext 2008 nothing to add
conext
conext 2010 nothing to add
conext
conext 2011 nothing to add
csur
csur 2007 nothing to add
emsoft
emsoft 2018 nothing to add
fast
fast 2012 nothing to add
gis
gis 2007 nothing to add
gis
gis 2008 nothing to add
icac
icac 2007 nothing to add
iccad
iccad 2016 nothing to add
iccad
iccad 2018 nothing to add
icdcs
icdcs 2007 nothing to add
icis
icis 2007 nothing to add
icis
icis 2008 nothing to add
icis
icis 2009 nothing to add
icis
icis 2010 nothing to add
icmr
icmr 2011 nothing to add
icmr
icmr 2012 nothing to add
icmr
icmr 2014 nothing to add
icpp
icpp 2007 nothing to add
ismvl
ismvl 2007 nothing to add
iticse
iticse 2009 nothing to add
iticse
iticse 2011 nothing to add
jacm
jacm 2007 nothing to add
jeric
jeric 2007 nothing to add
je