In [21]:
# Install SciPy through pip using the interpreter that runs this kernel
# (sys.executable), rather than whichever `pip` happens to be first on PATH.
import sys
!{sys.executable} -m pip install scipy

Looking in indexes: https://mirror.kakao.com/pypi/simple


In [32]:
import json, os, csv
import numpy as np
import pandas as pd
from scipy.stats import gmean
from IPython.display import display, HTML

In [33]:
# Venue list: venue names and their categories (conf or journal).
venue_category_filename = os.path.join(os.pardir, "app", "data", "venue_list.csv")
# Destination file for the weights computed in this notebook.
venue_weight_filename = os.path.join(os.pardir, "app", "data", "venue_weight.csv")
# Previously computed weights, loaded for comparison with the new ones.
previous_weight_filename = os.path.join(os.pardir, "app", "data", "venue_weight_2022.csv")


def filtered_papers_filename(name, year):
    """Return the path of the filtered-papers JSON file for one venue and year."""
    basename = "{}_{}_filtered_papers.json".format(name, year)
    return os.path.join("filtered_papers", basename)


# Publication years covered by the analysis: 2007 through 2022 inclusive.
yearranges = range(2007, 2023)

# Count paper numbers

In [34]:
# Per-venue statistics keyed by venue name; each value is a dict of fields.
paper_data = {}


def count_paper_numbers(venue):
    """Record the number of filtered papers of ``venue`` per year and in total.

    For every year in ``yearranges`` the venue's filtered-papers JSON file is
    loaded and its length stored under that year in ``paper_data[venue]``.
    A ``"Total"`` entry holding the sum over all years is added at the end.
    Any existing ``paper_data[venue]`` entry is replaced.
    """
    # Register the entry up front with a zero for every year, then fill it in.
    counts = paper_data[venue] = dict.fromkeys(yearranges, 0)
    for year in yearranges:
        with open(filtered_papers_filename(venue, year), "r") as json_file:
            counts[year] = len(json.load(json_file))
    counts["Total"] = sum(counts.values())

In [35]:
# Count the papers of every venue named in the venue list.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(venue_category_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row
    next(reader)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing one);
            # indexing it would raise IndexError.
            continue
        venue_name = row[0]  # first column holds the venue name
        count_paper_numbers(venue_name)

# Compute Geometric mean

One difficulty with the geometric mean is that it breaks down when a citation count is zero: a single zero makes the product (and therefore the mean) zero, and its logarithm is undefined. Some papers do have zero citations. To handle this case, we add 1 to every citation count before computing the geometric mean, and then subtract 1 from the mean obtained.

In [36]:
def compute_venue_weight(venue):
    """Compute citation statistics for ``venue`` and store them in ``paper_data``.

    For every year in ``yearranges`` the venue's filtered-papers JSON file is
    read. Each paper contributes one citation count taken from its
    ``"OA papers"`` matches (the largest ``"CitationCount"`` when there are
    several). Papers without any match are counted separately and are left
    out of the means.

    Both means are computed on ``1 + citations`` and then shifted back by 1,
    so that zero-citation papers do not break the geometric mean.

    Writes these keys to ``paper_data[venue]``:
        ``"NoMatch"``       number of papers without a matching record
        ``"p_NoMatch(%)"``  ``100 * NoMatch / Total``
        ``"GeoMean"``       shifted geometric mean of the citation counts
        ``"AriMean"``       shifted arithmetic mean of the citation counts

    ``paper_data[venue]["Total"]`` must already exist (it is set by
    ``count_paper_numbers``). If the total is 0 the percentage is NaN, and if
    no paper has a match both means are NaN, instead of raising
    ``ZeroDivisionError`` or averaging an empty list.
    """
    shifted_citations = []  # 1 + citation count of every matched paper
    no_matching_paper_count = 0
    for year in yearranges:
        with open(filtered_papers_filename(venue, year), "r") as fh:
            papers = json.load(fh)
        for paper in papers:
            oa_matching_papers = paper["OA papers"]
            if not oa_matching_papers:  # no matching paper from OpenAlex
                no_matching_paper_count += 1
                continue
            # More than one record may match a title --> take the max count.
            citation_count = max(m["CitationCount"] for m in oa_matching_papers)
            shifted_citations.append(1 + citation_count)

    stats = paper_data[venue]
    total_papers = stats["Total"]
    stats["NoMatch"] = no_matching_paper_count
    if total_papers:
        stats["p_NoMatch(%)"] = float(no_matching_paper_count) / float(total_papers) * 100.0
    else:
        # A venue without any paper has no defined no-match ratio.
        stats["p_NoMatch(%)"] = float("nan")

    if shifted_citations:
        stats["GeoMean"] = gmean(shifted_citations) - 1
        stats["AriMean"] = np.mean(shifted_citations) - 1
    else:
        # Nothing to average: report NaN explicitly.
        stats["GeoMean"] = float("nan")
        stats["AriMean"] = float("nan")

In [37]:
# Compute the citation statistics of every venue named in the venue list.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(venue_category_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row
    next(reader)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing one);
            # indexing it would raise IndexError.
            continue
        venue_name = row[0]  # first column holds the venue name
        compute_venue_weight(venue_name)

In [38]:
# read previous weight
# Columns are read positionally: venue name, arithmetic mean, geometric mean.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(previous_weight_filename, "r", newline="") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row
    next(reader)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing one);
            # indexing it would raise IndexError.
            continue
        venue_name = row[0]
        pre_amean = row[1]
        pre_gmean = row[2]
        # A venue that is absent from paper_data raises KeyError here on
        # purpose: silently skipping it would hide a mismatch between files.
        paper_data[venue_name]["old_GeoMean"] = float(pre_gmean)
        paper_data[venue_name]["old_AriMean"] = float(pre_amean)

In [39]:
# paper_data maps venue -> {field: value}; transposing the frame built from it
# gives one row per venue and one column per field.
df = pd.DataFrame(paper_data).transpose()

In [40]:
def cell_color(val):
    """Return the CSS colour rule for a table cell: red for zero, black otherwise."""
    return 'color: %s' % ('red' if val == 0 else 'black')

# Cast the per-year counts and the totals to int, and the statistics to float.
column_dtypes = {year: int for year in yearranges}
column_dtypes.update(
    {"Total": int, "NoMatch": int, "AriMean": float, "GeoMean": float, "p_NoMatch(%)": float}
)
df = df.astype(column_dtypes)

# Allow as many displayed rows as the frame has, so no venue is elided.
pd.set_option('display.max_rows', len(df))

print("NoMatch: No matching paper from OpenAlex")
print("p_NoMatch(%): 100*NoMatch/Total")

# Note: despite its name, `coverage` is the overall percentage of papers
# WITHOUT a match, aggregated over all venues.
unmatched_papers = sum(stats["NoMatch"] for stats in paper_data.values())
all_papers = sum(stats["Total"] for stats in paper_data.values())
coverage = 100*unmatched_papers/all_papers
print("{}% of the papers did not match from title search".format(coverage))

NoMatch: No matching paper from OpenAlex
p_NoMatch(%): 100*NoMatch/Total
1.981341819608527% of the papers did not match from title search


In [41]:
# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
# deprecated; use whichever method this pandas version provides so the table
# keeps rendering (zero-valued cells in red) on both old and new releases.
styler = df.style
apply_elementwise = getattr(styler, "map", None) or styler.applymap
apply_elementwise(cell_color)

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Total,NoMatch,p_NoMatch(%),GeoMean,AriMean,old_GeoMean,old_AriMean
3dim,49,0,0,0,54,75,56,87,74,75,73,81,79,124,141,0,968,0,0.0,6.553447,24.645661,5.901186,22.350389
aaai,290,277,0,270,260,311,210,422,557,572,667,985,1203,1672,1777,1434,10907,218,1.998716,7.178878,22.192721,8.254024,24.139113
aamas,266,254,132,168,125,136,135,163,166,135,155,193,193,195,164,175,2755,136,4.936479,4.817938,11.469263,8.844932,18.727165
acl,131,119,121,160,164,187,327,286,318,328,302,430,762,778,712,702,5827,72,1.235627,13.252911,43.685143,16.859033,50.744614
ai,61,73,63,67,87,46,71,62,67,82,92,57,83,23,0,0,934,0,0.0,24.004624,52.091006,21.888917,49.233143
aiccsa,141,158,151,110,42,0,87,113,168,212,197,107,138,55,51,64,1794,0,0.0,2.252824,4.880713,2.412785,5.23926
aim,36,39,39,33,38,42,42,40,43,48,39,38,32,36,33,32,610,1,0.163934,5.028352,24.581281,7.443129,32.755991
aina,134,146,129,171,104,126,153,154,128,163,157,153,112,124,198,177,2329,0,0.0,2.886789,6.853156,3.182542,7.632663
amai,39,37,42,39,46,34,34,37,46,43,48,35,38,53,47,48,666,0,0.0,4.923572,11.659159,5.206688,11.996587
ancs,20,17,16,37,20,19,18,19,15,12,17,13,21,0,23,0,267,0,0.0,7.354349,17.142322,10.116449,22.148148


In [42]:
# Persist only the two weight columns; the frame's index (the venue names)
# is written as the first CSV column.
df[["AriMean", "GeoMean"]].to_csv(venue_weight_filename)
