In [1]:
import sys
# Install scipy by running pip as a module of sys.executable, i.e. with the
# same Python interpreter this kernel is running on, so the package ends up in
# the environment the notebook actually imports from.
!{sys.executable} -m pip install scipy

Looking in indexes: https://mirror.kakao.com/pypi/simple


In [2]:
import json, os, csv
import numpy as np
import pandas as pd
from scipy.stats import gmean
from IPython.display import display, HTML

In [3]:
# CSV file containing venue names and their categories (conf or journal)
venue_category_filename = os.path.join(os.pardir, "app", "data", "venue_list.csv")
# CSV file the venue weights computed by this notebook are written to
venue_weight_filename = os.path.join(os.pardir, "app", "data", "venue_weight.csv")
# CSV file with the previously computed weights, loaded for comparison
previous_weight_filename = os.path.join(os.pardir, "app", "data", "venue_weight_2023.csv")
# publication years to aggregate over: 2007 through 2023 inclusive
yearranges = range(2007, 2024)


def filtered_papers_filename(name, year):
    """Return the path of the filtered-papers JSON file for one venue and year.

    Args:
        name: venue name as used in the file names (e.g. ``"aaai"``).
        year: publication year.

    Returns:
        ``filtered_papers/<name>_<year>_filtered_papers.json`` joined with the
        platform's path separator.
    """
    return os.path.join("filtered_papers",
                        "{}_{}_filtered_papers.json".format(name, year))

# Count paper numbers

In [4]:
# venue name -> {year: paper count, ..., "Total": sum over all years}
paper_data = {}
def count_paper_numbers(venue):
    """Record the per-year and total paper counts of ``venue`` in ``paper_data``.

    For every year in ``yearranges`` the venue's filtered-papers JSON file is
    loaded and the number of entries stored under that year; the sum over all
    years is stored under ``"Total"``. Any existing entry for ``venue`` is
    replaced.
    """
    # Register the zero-initialised entry first, then fill it in year by year.
    counts = paper_data[venue] = dict.fromkeys(yearranges, 0)
    for year in yearranges:
        with open(filtered_papers_filename(venue, year), "r") as fh:
            counts[year] = len(json.load(fh))
    # Only the year keys exist at this point, so this sums the yearly counts.
    counts["Total"] = sum(counts.values())

In [5]:
# Count papers for every venue named in the first column of the venue list.
with open(venue_category_filename, "r") as venue_file:
    venue_rows = csv.reader(venue_file, delimiter=",")
    next(venue_rows)  # the first row is the header, not a venue
    for fields in venue_rows:
        count_paper_numbers(fields[0])

# Compute Geometric mean

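One difficulty with the geometric mean is that it breaks down when a citation count is zero: a single zero makes the whole product zero, and the logarithm used to compute the mean is undefined. Some papers do indeed have zero citations. To handle this case, we add 1 to each paper's citation count before computing the geometric mean, and then subtract 1 from the mean obtained. The arithmetic mean reported alongside it is computed on the same shifted counts and shifted back in the same way.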

In [6]:
def compute_venue_weight(venue):
    """Compute citation statistics for ``venue`` and store them in ``paper_data``.

    Every filtered-papers JSON file of the venue (one per year in
    ``yearranges``) is read. A paper's citation count is the largest
    ``"CitationCount"`` among its ``"OA papers"`` matches; papers with no
    match are counted separately and excluded from the means.

    The following keys are written to ``paper_data[venue]``, which must
    already hold ``"Total"``:

    * ``"NoMatch"``      - number of papers without any match
    * ``"p_NoMatch(%)"`` - ``NoMatch`` as a percentage of ``Total``
      (0.0 when ``Total`` is 0)
    * ``"GeoMean"``      - geometric mean of ``1 + citations``, minus 1
    * ``"AriMean"``      - arithmetic mean of ``1 + citations``, minus 1

    Both means are NaN when no paper of the venue has a match. A one-line
    summary (venue, total, no-match count, no-match percentage) is printed.
    """
    # Citation counts shifted by +1 so that zero-citation papers do not break
    # the geometric mean; the shift is undone after averaging.
    citation_numbers = []
    no_matching_paper_count = 0
    for year in yearranges:
        with open(filtered_papers_filename(venue, year), "r") as fh:
            papers = json.load(fh)
        for p in papers:
            oa_matching_papers = p["OA papers"]
            if not oa_matching_papers:  # no matching paper from OpenAlex
                no_matching_paper_count += 1
                continue
            # one or several matching papers --> take the max citation count
            citation_count = max(cc["CitationCount"] for cc in oa_matching_papers)
            citation_numbers.append(1 + citation_count)

    stats = paper_data[venue]
    total = stats["Total"]
    stats["NoMatch"] = no_matching_paper_count
    # A venue without any paper has nothing unmatched; avoid dividing by zero.
    if total:
        stats["p_NoMatch(%)"] = float(no_matching_paper_count) / float(total) * 100.0
    else:
        stats["p_NoMatch(%)"] = 0.0
    if citation_numbers:
        stats["GeoMean"] = gmean(citation_numbers) - 1
        stats["AriMean"] = np.mean(citation_numbers) - 1
    else:
        # Means of an empty sample are undefined; store NaN explicitly instead
        # of triggering empty-input warnings in scipy/numpy.
        stats["GeoMean"] = float("nan")
        stats["AriMean"] = float("nan")

    print(venue, paper_data[venue]["Total"], paper_data[venue]["NoMatch"], paper_data[venue]["p_NoMatch(%)"])

In [7]:
# Compute the citation statistics of every venue named in the venue list.
with open(venue_category_filename, "r") as venue_file:
    venue_rows = csv.reader(venue_file, delimiter=",")
    next(venue_rows)  # the first row is the header, not a venue
    for fields in venue_rows:
        compute_venue_weight(fields[0])

3dim 1039 0 0.0
aaai 12706 223 1.7550763418857234
aamas 3001 235 7.830723092302566
acl 6901 83 1.202724242863353
ai 934 0 0.0
aiccsa 1913 2 0.10454783063251437
aim 652 3 0.4601226993865031
aina 2489 1 0.040176777822418644
amai 699 0 0.0
ancs 267 0 0.0
annals 512 1 0.1953125
apscc 607 0 0.0
arith 294 1 0.3401360544217687
asap 659 0 0.0
ase 1633 2 0.1224739742804654
asiacrypt 1051 1 0.09514747859181732
asplos 934 1 0.10706638115631692
ats 851 0 0.0
avss 1112 1 0.08992805755395684
bibe 1815 0 0.0
bioinformatics 5029 8 0.1590773513620998
bpm 429 0 0.0
cacm 2157 1 0.04636068613815484
cal 509 0 0.0
cases 245 0 0.0
cav 998 0 0.0
cc 279 1 0.35842293906810035
ccc 561 44 7.8431372549019605
cccg 859 92 10.71012805587893
ccgrid 1433 0 0.0
ccs 2076 2 0.09633911368015415
cga 1017 0 0.0
cgf 3247 1 0.030797659377887282
cgo 451 9 1.9955654101995564
chi 8603 0 0.0
cikm 5805 9 0.15503875968992248
civr 259 0 0.0
cluster 1020 0 0.0
coling 2623 428 16.317194052611512
colt 1237 79 6.386418755052546
compgeom 

In [8]:
# read previous weight
with open(previous_weight_filename, "r") as fh:
    reader = csv.reader(fh, delimiter=",")
    # skip header row
    next(reader)
    for row in reader:
        venue_name = row[0]
        pre_amean = row[1]
        pre_gmean = row[2]
        paper_data[venue_name]["old_GeoMean"] = float(pre_gmean)
        paper_data[venue_name]["old_AriMean"] = float(pre_amean)

In [9]:
# paper_data maps venue -> {field: value}; building the frame puts venues in
# the columns, so transpose to get one row per venue and one column per field.
df = pd.DataFrame(paper_data).transpose()

In [10]:
def cell_color(val):
    """Return the CSS text-colour rule for a table cell.

    Cells whose value equals 0 are rendered in red, all others in black.
    """
    return 'color: {}'.format('red' if val == 0 else 'black')

# Give the per-year counts and the summary columns explicit dtypes.
summary_columns = ["Total", "NoMatch", "p_NoMatch(%)", "AriMean", "GeoMean"]
df = df.astype(dict.fromkeys(yearranges, int))
df = df.astype({
    "Total": int,
    "NoMatch": int,
    "AriMean": float,
    "GeoMean": float,
    "p_NoMatch(%)": float,
})
print(df[summary_columns])

# From here on, let pandas display every row of the table.
pd.set_option('display.max_rows', len(df))
print("NoMatch: No matching paper from OpenAlex")
print("p_NoMatch(%): 100*NoMatch/Total")
# Share of papers, over all venues, for which no match was found.
total_no_match = sum(pv["NoMatch"] for pv in paper_data.values())
total_papers = sum(pv["Total"] for pv in paper_data.values())
coverage = 100*total_no_match/total_papers
print("{}% of the papers did not match from title search".format(coverage))

         Total  NoMatch  p_NoMatch(%)    AriMean    GeoMean
3dim      1039        0      0.000000  23.558229   6.408136
aaai     12706      223      1.755076  19.768405   6.278147
aamas     3001      235      7.830723  10.910701   4.435163
acl       6901       83      1.202724  37.584482  10.385427
ai         934        0      0.000000  52.091006  24.004624
...        ...      ...           ...        ...        ...
tmi       3889        3      0.077141  53.782295  20.959217
pacmhci   2152       26      1.208178   8.321731   3.249302
ucc        498        0      0.000000   9.568273   3.727868
bdc        228        0      0.000000   3.460526   1.740317
icsa       147        1      0.680272  21.191781   9.615719

[324 rows x 5 columns]
NoMatch: No matching paper from OpenAlex
p_NoMatch(%): 100*NoMatch/Total
1.6600459363514792% of the papers did not match from title search


In [11]:
# Colour every cell with cell_color (zeros show up in red). Styler.applymap is
# deprecated since pandas 2.1 in favour of Styler.map, so use map when the
# installed pandas provides it and fall back to applymap on older versions.
styler = df.style
color_cells = styler.map if hasattr(styler, "map") else styler.applymap
color_cells(cell_color)

Unnamed: 0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,Total,NoMatch,p_NoMatch(%),GeoMean,AriMean,old_GeoMean,old_AriMean
3dim,49,0,0,0,54,75,56,87,74,75,73,81,79,124,141,71,0,1039,0,0.0,6.408136,23.558229,6.553447,24.645661
aaai,290,277,0,270,260,311,210,422,557,572,667,985,1203,1672,1777,1434,1799,12706,223,1.755076,6.278147,19.768405,7.178878,22.192721
aamas,266,254,132,168,125,136,135,163,166,135,155,193,193,195,164,175,246,3001,235,7.830723,4.435163,10.910701,4.817938,11.469263
acl,131,119,121,160,164,187,327,286,318,328,302,430,762,778,712,702,1074,6901,83,1.202724,10.385427,37.584482,13.252911,43.685143
ai,61,73,63,67,87,46,71,62,67,82,92,57,83,23,0,0,0,934,0,0.0,24.004624,52.091006,24.004624,52.091006
aiccsa,141,158,151,110,42,0,87,113,168,212,197,107,138,55,51,64,119,1913,2,0.104548,2.045817,4.593407,2.252824,4.880713
aim,36,39,39,33,38,42,42,40,43,48,39,38,32,36,33,32,42,652,3,0.460123,4.700291,23.209553,5.028352,24.581281
aina,134,146,129,171,104,126,153,154,128,163,157,153,112,124,198,177,160,2489,1,0.040177,2.673988,6.482315,2.886789,6.853156
amai,39,37,42,39,46,34,34,37,46,43,48,35,38,53,47,48,33,699,0,0.0,4.639255,11.187411,4.923572,11.659159
ancs,20,17,16,37,20,19,18,19,15,12,17,13,21,0,23,0,0,267,0,0.0,7.354349,17.142322,7.354349,17.142322


In [12]:
df.to_csv(venue_weight_filename, columns=["AriMean", "GeoMean"])
