In [1]:
import os, sys
import pandas as pd
from datetime import datetime 
import pickle, json

# Root of the citation-network data tree; input and output paths are joined onto it.
# The CITATION_NETWORK_DATA_DIR environment variable overrides the location so the
# notebook is not tied to a single machine; the default is the original path.
data_dir = os.environ.get('CITATION_NETWORK_DATA_DIR', '/data4/u5790670/citation_network')

In [2]:
# Venue acronyms to process, kept as one space-separated string.
# NOTE(review): "FSE" appears twice in this string, so it is counted and visited twice.
conf_string = "AAAI AAMAS ACL ACSAC AICCSA AINA ANCS APSCC ARITH ASAP ASE ASPLOS ATS AVSS BIBE BPM CASES CAV CC CCC CCCG CCGRID CCS CGO CHI CIKM CIVR CLUSTER COLING COLT COMPSAC CP CRYPTO CSB CSCW CVPR DAC DATE DCOSS DDECS DIGITEL DISC EC ECAI ECCV ECOOP EDBT EDCC EMNLP EMSOFT ER ESA ESOP ESORICS EUROCRYPT FCCM FMCAD FOCS FSE FSE GECCO HiPC HiPEAC HOTI HPCA HPCC HPDC i3D IAAI ICAC ICALP ICAPS ICASSP ICC ICCAD ICCCN ICCS ICCV ICDCS ICDE ICDM ICDT ICEBE ICFP ICGSE ICIS ICLP ICML ICNP ICPP ICPR ICRA ICS ICSE ICSM ICST ICWS ICWSM IISWC IJCAI IJCAR IMC INFOCOM INTERACT INTERSPEECH IOLTS IPCCC IPDPS IPMI IPSN IROS ISCA ISLPED ISMAR ISMB ISMM ISMVL ISORC ISPA ISPASS ISSAC ISSCC ISSRE ISSTA ISWC ITC IUI KDD KR LCN LCTES LICS MASCOTS MASS MDM MICCAI MICRO MM MobiHoc MobiSys NDSS NIPS NOMS NSDI OOPSLA OSDI PACT PLDI PODC PODS POPL PPoPP PSB RAID RECOMB RTAS RTSS S&P SACMAT SBAC-PAD SDM SEA SEC SenSys SIGCOMM SIGIR SIGMETRICS SIGMOD SoCG SODA SOSE SOSP SPAA SRDS SSDBM STACS STOC TCC UAI UIST VEE WADS WCCI WoWMoM WSDM WWW VR"
conf_list = conf_string.split(" ")

# Affiliation names: one entry per line of the list file, surrounding whitespace removed.
affi_list_file = os.path.join(data_dir, 'data_slim_txt', 'US_Affiliation_List.txt')
with open(affi_list_file, 'r') as affi_handle:
    affi_list = list(map(str.strip, affi_handle))

# Report how many venues and affiliations were loaded.
print("conf: ", len(conf_list))
print("affi: ", len(affi_list))

conf:  187
affi:  99


In [3]:
def get_count_score(df_paper, df_author_aff):
    """Score affiliations by paper count, split evenly across author rows.

    Each paper is worth a total score of 1. That score is divided equally among
    the paper's rows in ``df_author_aff`` and each share is credited to the
    ``(NormAffName, PubYear)`` pair of the corresponding row.

    Parameters
    ----------
    df_paper : pandas.DataFrame
        Must provide the columns ``'PaperID'`` and ``'PubYear'``.
    df_author_aff : pandas.DataFrame
        Must provide the columns ``'PaperID'`` and ``'NormAffName'``; one row
        per author entry of a paper.

    Returns
    -------
    dict
        Maps ``(affiliation name, year)`` tuples to the accumulated float score.
        Papers without any row in ``df_author_aff`` contribute nothing.
    """
    # Group the author table once instead of scanning it with a boolean mask
    # for every paper. Row order inside each group is preserved, so scores are
    # accumulated in the same order as a per-paper filter would produce.
    affs_by_paper = {
        pid: names.values
        for pid, names in df_author_aff.groupby('PaperID', sort=False)['NormAffName']
    }

    aff_score_dict = {}
    # Iterate the two needed columns directly; iterrows() builds a Series per
    # row and may upcast the year to another dtype.
    for pid, year in zip(df_paper['PaperID'], df_paper['PubYear']):
        paper_affs = affs_by_paper.get(pid, ())
        author_num = len(paper_affs)
        for aff_name in paper_affs:
            aff_year = (aff_name, year)
            aff_score_dict[aff_year] = aff_score_dict.get(aff_year, 0.0) + 1.0 / author_num
    return aff_score_dict

def get_cited_score(df_paper, df_author_aff, df_cited):
    """Score affiliations by citation count, split evenly across author rows.

    Each cited paper is worth as many points as it has rows in ``df_cited``
    (grouped by ``'RefID'``). That score is divided equally among the paper's
    rows in ``df_author_aff`` and each share is credited to the
    ``(NormAffName, PubYear)`` pair of the corresponding row.

    Parameters
    ----------
    df_paper : pandas.DataFrame
        Must provide the columns ``'PaperID'`` and ``'PubYear'``.
    df_author_aff : pandas.DataFrame
        Must provide the columns ``'PaperID'`` and ``'NormAffName'``.
    df_cited : pandas.DataFrame
        Citation records with the columns ``'PaperID'`` and ``'RefID'``; the
        number of non-null ``'PaperID'`` values per ``'RefID'`` is the score
        of the paper identified by that ``'RefID'``.

    Returns
    -------
    dict
        Maps ``(affiliation name, year)`` tuples to the accumulated float score.
        Cited papers that are absent from ``df_paper`` have no publication year
        and are skipped; papers without author rows contribute nothing.
    """
    cited_cnt_by_paper = df_cited.groupby('RefID')['PaperID'].count()

    # Publication year per paper, built once. setdefault keeps the first
    # occurrence when a PaperID is repeated in df_paper.
    year_by_paper = {}
    for pid, year in zip(df_paper['PaperID'], df_paper['PubYear']):
        year_by_paper.setdefault(pid, year)

    # Affiliation names per paper, built once; row order inside each group is
    # preserved so scores accumulate in table order.
    affs_by_paper = {
        pid: names.values
        for pid, names in df_author_aff.groupby('PaperID', sort=False)['NormAffName']
    }

    aff_score_dict = {}
    # Series.iteritems() was removed in pandas 2.0; items() is the supported spelling.
    for pid, cited_cnt in cited_cnt_by_paper.items():
        if pid not in year_by_paper:
            # No metadata for this cited paper, hence no year to attribute the score to.
            continue
        year = year_by_paper[pid]
        paper_affs = affs_by_paper.get(pid, ())
        author_num = len(paper_affs)
        for aff_name in paper_affs:
            aff_year = (aff_name, year)
            aff_score_dict[aff_year] = aff_score_dict.get(aff_year, 0.0) + cited_cnt / author_num
    return aff_score_dict

# Compute and store the count/citation scores for every venue in conf_list.
# succ_list collects venues whose two input files exist, fail_list the others.
succ_list = []
fail_list = []
output_dir = os.path.join(data_dir, 'US_score')
# Make sure the target folder exists so the dump below cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

for venue_name in conf_list:
    input_dir = os.path.join(data_dir, 'out', venue_name)
    input_file_path = os.path.join(input_dir, 'cite_records.' + venue_name + ".pkl")
    author_aff_file = os.path.join(input_dir, 'paper_author_aff.' + venue_name + '.txt')
    output_file_path = os.path.join(output_dir, venue_name + ".pkl")

    # Both inputs are required; otherwise the venue is recorded as failed.
    if not (os.path.exists(input_file_path) and os.path.exists(author_aff_file)):
        fail_list.append(venue_name)
        continue
    succ_list.append(venue_name)

    # Results already on disk are reused, which lets an interrupted run resume.
    if os.path.exists(output_file_path):
        print("skip", venue_name)
        continue

    print("Dealing with {}...".format(venue_name))
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # a trusted earlier stage of this pipeline.
    with open(input_file_path, 'rb') as in_file:
        d = pickle.load(in_file)
    df_author_aff = pd.read_table(
        author_aff_file, header=None,
        names=['PaperID', 'AuthorID', 'AffID', 'OriAffName', 'NormAffName', 'AuthorSeqNum'])
    df_cited = d['cited']
    df_paper = d['paper']

    count_score_dict = get_count_score(df_paper, df_author_aff)
    cited_score_dict = get_cited_score(df_paper, df_author_aff, df_cited)

    # Write to a temporary file and rename it into place: because existing
    # outputs are skipped above, a half-written pickle left by an interrupted
    # run would otherwise never be regenerated.
    tmp_output_path = output_file_path + '.tmp'
    with open(tmp_output_path, 'wb') as out_file:
        # Key names are kept exactly as before; readers of these files may rely on them.
        pickle.dump({"count_score": count_score_dict, "cited_score_dict": cited_score_dict},
                    out_file)
    os.replace(tmp_output_path, output_file_path)
    print("saved to: " + output_file_path)

print("succ|fail|total: {}|{}|{}".format(len(succ_list), len(fail_list), len(succ_list)+len(fail_list)))
print("Fail list: ", fail_list)

skip AAMAS
skip ACL
skip ACSAC
skip AICCSA
skip AINA
skip ANCS
skip APSCC
skip ARITH
skip ASAP
skip ASE
skip ASPLOS
skip ATS
skip AVSS
skip BIBE
skip CASES
skip CAV
skip CC
skip CCC
skip CCCG
skip CCGRID
skip CCS
skip CGO
skip CHI
skip CIKM
skip CIVR
skip CLUSTER
skip COLING
skip COLT
skip COMPSAC
skip CP
skip CRYPTO
skip CSB
skip CSCW
skip CVPR
skip DAC
skip DATE
skip DCOSS
skip DDECS
skip DIGITEL
skip DISC
skip EC
skip ECAI
skip ECCV
skip ECOOP
skip EDBT
skip EDCC
skip EMNLP
skip EMSOFT
skip ER
skip ESA
skip ESOP
skip ESORICS
skip EUROCRYPT
skip FCCM
skip FMCAD
skip FOCS
skip FSE
skip FSE
skip GECCO
skip HiPC
skip HiPEAC
skip HOTI
skip HPCA
skip HPCC
skip HPDC
skip i3D
skip IAAI
skip ICAC
skip ICALP
skip ICAPS
skip ICASSP
skip ICC
Dealing with ICCAD...
saved to: /data4/u5790670/citation_network/US_score/ICCAD.pkl
Dealing with ICCCN...
saved to: /data4/u5790670/citation_network/US_score/ICCCN.pkl
Dealing with ICCS...
saved to: /data4/u5790670/citation_network/US_score/ICCS.pkl
Dealing

saved to: /data4/u5790670/citation_network/US_score/SIGIR.pkl
Dealing with SIGMETRICS...
saved to: /data4/u5790670/citation_network/US_score/SIGMETRICS.pkl
Dealing with SIGMOD...
saved to: /data4/u5790670/citation_network/US_score/SIGMOD.pkl
Dealing with SoCG...
saved to: /data4/u5790670/citation_network/US_score/SoCG.pkl
Dealing with SODA...
saved to: /data4/u5790670/citation_network/US_score/SODA.pkl
Dealing with SOSE...
saved to: /data4/u5790670/citation_network/US_score/SOSE.pkl
Dealing with SOSP...
saved to: /data4/u5790670/citation_network/US_score/SOSP.pkl
Dealing with SPAA...
saved to: /data4/u5790670/citation_network/US_score/SPAA.pkl
Dealing with SRDS...
saved to: /data4/u5790670/citation_network/US_score/SRDS.pkl
Dealing with SSDBM...
saved to: /data4/u5790670/citation_network/US_score/SSDBM.pkl
Dealing with STACS...
saved to: /data4/u5790670/citation_network/US_score/STACS.pkl
Dealing with STOC...
saved to: /data4/u5790670/citation_network/US_score/STOC.pkl
Dealing with TCC