In [1]:
import json, os, csv
import numpy as np
import pandas as pd
from scipy.stats import gmean
from IPython.display import display, HTML

In [2]:
# Paths and the year window used by the cells below.

# CSV file containing venue names and their categories (conf or journal)
venue_category_filename = os.path.join(os.pardir, "app", "data", "venue_list.csv")
# CSV file containing the previously computed venue weights
previous_weight_filename = os.path.join(os.pardir, "app", "data", "venue_weight.csv")
# Years covered by the analysis: 2007 through 2019 (the end of range() is exclusive)
yearranges = range(2007, 2020)


def filtered_papers_filename(name, year):
    """Return the relative path of the filtered-papers JSON file for a venue and year.

    Args:
        name: Venue identifier used as the file-name prefix.
        year: Publication year used as the second file-name component.

    Returns:
        A path of the form ``filtered_papers/<name>_<year>_filtered_papers.json``,
        joined with the platform's path separator.
    """
    return os.path.join(
        "filtered_papers",
        "{}_{}_filtered_papers.json".format(name, year),
    )

# Count paper numbers

In [3]:
# Per-venue statistics, keyed by venue name; filled in by the functions below.
paper_data = {}


def count_paper_numbers(venue):
    """Record how many filtered papers a venue has per year, plus the overall total.

    For each year in ``yearranges`` the venue's filtered-papers JSON file is
    loaded and the number of entries is stored in ``paper_data[venue][year]``.
    The sum over all years is stored under the ``"Total"`` key.

    Any existing ``paper_data[venue]`` entry is replaced.
    """
    yearly_counts = dict.fromkeys(yearranges, 0)
    # Register the dict before reading so the entry exists even if a read fails.
    paper_data[venue] = yearly_counts
    for yr in yearranges:
        with open(filtered_papers_filename(venue, yr), "r") as json_file:
            yearly_counts[yr] = len(json.load(json_file))
    # At this point the dict only holds year counts, so summing its values
    # yields the total number of papers.
    yearly_counts["Total"] = sum(yearly_counts.values())

In [4]:
# Count the papers of every venue listed in the venue-category CSV.
# newline="" is the mode the csv module documents for files passed to csv.reader,
# so that newlines embedded in quoted fields are parsed correctly.
with open(venue_category_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row (the default avoids StopIteration on an empty file)
    next(reader, None)
    for row in reader:
        # csv.reader yields an empty list for blank lines; row[0] would fail on those
        if not row:
            continue
        venue_name = row[0]
        count_paper_numbers(venue_name)

# Compute Geometric mean

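One difficulty with the geometric mean is that it breaks down when a paper has zero citations: the product of the counts becomes zero, and the logarithm used to compute the mean is undefined. Some papers do indeed have zero citations. To handle this case, we add 1 to each paper's citation count before computing the geometric mean, and then subtract 1 from the resulting mean, i.e. we report $\exp\left(\frac{1}{n}\sum_{i=1}^{n}\ln(1 + c_i)\right) - 1$, where $c_i$ is the citation count of paper $i$.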

In [5]:
def compute_venue_weight(venue):
    """Compute a venue's shifted geometric-mean citation count and no-match statistics.

    For each year in ``yearranges`` the venue's filtered-papers JSON file is read.
    Every paper carries a list under the ``"MAG papers"`` key:

    * an empty list means no MAG paper matched, and the paper is only counted
      as a no-match;
    * otherwise the largest ``"CitationCount"`` among the matches is used.

    The geometric mean is taken over ``1 + citation_count`` and 1 is subtracted
    from the result, so papers with zero citations can be included.

    Results are written into the existing ``paper_data[venue]`` entry:

    * ``"NoMatch"``: number of papers without a MAG match,
    * ``"p_NoMatch(%)"``: ``100 * NoMatch / Total`` (NaN when ``Total`` is 0),
    * ``"GeoMean"``: the shifted geometric mean (NaN when no paper matched).

    Raises:
        KeyError: if ``paper_data[venue]`` or its ``"Total"`` entry does not
            exist yet, i.e. ``count_paper_numbers(venue)`` has not been run.
    """
    shifted_citations = []
    no_matching_paper_count = 0
    for year in yearranges:
        with open(filtered_papers_filename(venue, year), "r") as fh:
            papers = json.load(fh)
        for p in papers:
            mag_matching_papers = p["MAG papers"]
            if not mag_matching_papers:  # no matching paper from MAG
                no_matching_paper_count += 1
                continue
            # With several matching papers take the max citation count;
            # max() also covers the single-match case.
            citation_count = max(m["CitationCount"] for m in mag_matching_papers)
            shifted_citations.append(1 + citation_count)

    venue_stats = paper_data[venue]
    total_papers = venue_stats["Total"]
    venue_stats["NoMatch"] = no_matching_paper_count
    # A venue without any paper has no meaningful percentage; report NaN
    # instead of failing with ZeroDivisionError.
    if total_papers:
        venue_stats["p_NoMatch(%)"] = (
            float(no_matching_paper_count) / float(total_papers) * 100.0
        )
    else:
        venue_stats["p_NoMatch(%)"] = float("nan")
    # gmean of an empty sample is NaN; return it directly rather than calling
    # gmean on an empty list.
    if shifted_citations:
        venue_stats["GeoMean"] = gmean(shifted_citations) - 1
    else:
        venue_stats["GeoMean"] = float("nan")

In [6]:
# Compute the citation statistics of every venue listed in the venue-category CSV.
# newline="" is the mode the csv module documents for files passed to csv.reader,
# so that newlines embedded in quoted fields are parsed correctly.
with open(venue_category_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row (the default avoids StopIteration on an empty file)
    next(reader, None)
    for row in reader:
        # csv.reader yields an empty list for blank lines; row[0] would fail on those
        if not row:
            continue
        venue_name = row[0]
        compute_venue_weight(venue_name)

In [7]:
# Read the previous weight of each venue and store it as "old_GeoMean".
# The venue name is taken from the first column and the weight from the third.
# newline="" is the mode the csv module documents for files passed to csv.reader.
with open(previous_weight_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row (the default avoids StopIteration on an empty file)
    next(reader, None)
    for row in reader:
        # csv.reader yields an empty list for blank lines; indexing would fail on those
        if not row:
            continue
        venue_name = row[0]
        pre_gmean = row[2]
        # The weight file may list venues that are not in the current venue
        # list; skip them rather than failing with KeyError.
        if venue_name not in paper_data:
            continue
        paper_data[venue_name]["old_GeoMean"] = float(pre_gmean)

In [8]:
# paper_data maps venue -> {statistic: value}. Building a frame from it puts
# venues in the columns, so transpose to get one row per venue.
df = pd.DataFrame(paper_data).transpose()

In [12]:
# Cast the per-year counts and the two derived statistics to float.
df = df.astype(dict.fromkeys(list(yearranges) + ["GeoMean", "p_NoMatch(%)"], float))
# Allow the whole table to be displayed without truncation.
pd.set_option("display.max_rows", len(df))

# Legend for the table columns.
print("NoMatch: No matching paper from MAG")
print("p_NoMatch(%): 100*NoMatch/Total")

# Despite its name, `coverage` is the overall share (in percent) of papers
# WITHOUT a MAG match, aggregated over all venues.
coverage = (
    100 * sum(stats["NoMatch"] for stats in paper_data.values())
    / sum(stats["Total"] for stats in paper_data.values())
)
print("{}% of the papers did not match from title search".format(coverage))

NoMatch: No matching paper from MAG
p_NoMatch(%): 100*NoMatch/Total
4.740630948574434% of the papers did not match from title search


In [13]:
df

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Total,NoMatch,p_NoMatch(%),GeoMean,old_GeoMean
3dim,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,8.232344,2.141
aaai,290.0,277.0,0.0,270.0,260.0,311.0,210.0,422.0,557.0,572.0,667.0,985.0,0.0,4821.0,195.0,4.044804,4.828385,4.296
aamas,0.0,0.0,0.0,168.0,125.0,136.0,135.0,163.0,166.0,135.0,155.0,193.0,193.0,1569.0,199.0,12.683238,4.757523,6.107
acl,131.0,119.0,121.0,160.0,164.0,187.0,327.0,286.0,318.0,328.0,302.0,430.0,0.0,2873.0,28.0,0.974591,8.595709,10.358
acsac,43.0,44.0,46.0,41.0,41.0,45.0,35.0,47.0,47.0,48.0,48.0,60.0,0.0,545.0,13.0,2.385321,8.167373,1.382
ai,61.0,73.0,63.0,67.0,87.0,46.0,71.0,62.0,67.0,82.0,92.0,57.0,41.0,869.0,85.0,9.781358,12.762963,
aiccsa,140.0,159.0,151.0,110.0,42.0,0.0,87.0,113.0,168.0,212.0,197.0,107.0,0.0,1486.0,111.0,7.469717,1.413562,0.794
aim,33.0,36.0,36.0,30.0,35.0,42.0,39.0,37.0,41.0,46.0,36.0,19.0,0.0,430.0,48.0,11.162791,4.936979,
aina,134.0,146.0,129.0,171.0,105.0,126.0,153.0,154.0,128.0,163.0,157.0,153.0,112.0,1831.0,125.0,6.826871,2.599631,1.908
amai,39.0,37.0,42.0,39.0,46.0,34.0,34.0,37.0,46.0,43.0,48.0,35.0,11.0,491.0,34.0,6.924644,4.298235,
