In [168]:
import pandas as pd
from pybliometrics.scopus import AbstractRetrieval, AuthorRetrieval, SerialTitle
from tqdm import tqdm

In [191]:
def get_eids(file_path):
    """Read Scopus EIDs from a text file that holds one EID per line.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The EIDs in file order with surrounding whitespace removed.
        Blank and whitespace-only lines are skipped so that no empty
        string is later handed to a retrieval call as if it were an EID.
    """
    with open(file_path, 'r') as f:
        # Iterate the handle directly; readlines() would build a second,
        # throwaway list of the raw lines first.
        stripped_lines = (line.strip() for line in f)
        return [eid for eid in stripped_lines if eid]

def get_sjr_by_issn(issn, year):
    """Return the SJR value a serial title has for one year.

    Parameters
    ----------
    issn : str
        ISSN handed to ``SerialTitle``.
    year : hashable
        Key looked up among the ``(year, value)`` pairs of
        ``SerialTitle.sjrlist``. It must have the same type as the years
        in that list (int or str depending on the pybliometrics version
        -- TODO confirm against the installed release).

    Returns
    -------
    The value paired with ``year`` in ``sjrlist``.

    Raises
    ------
    KeyError
        If there is no entry for ``year``, including the case where the
        serial reports no SJR values at all.
    """
    source = SerialTitle(issn, years='1900-2021')
    # sjrlist may be None when no SJR values are available; treat that as
    # "no years" so the failure is the same KeyError as any other missing
    # year instead of a TypeError raised by dict(None).
    sjr_by_year = dict(source.sjrlist or [])

    return sjr_by_year[year]

def get_biblio_by_eid(eid):
    """Collect bibliographic features of one document as a one-row DataFrame.

    The document is fetched with ``AbstractRetrieval(eid, view="FULL")``.

    Parameters
    ----------
    eid : str
        Document identifier passed to ``AbstractRetrieval``; it also
        becomes the index label of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``eid`` with the columns ``title``,
        ``abstract``, ``idxterms`` (index terms joined by ``";"``, or an
        empty string when there are none), ``num_authors``,
        ``num_institutions``, ``num_refers`` (``refcount`` as returned by
        the API) and ``num_funding``. Every count is 0 when the
        corresponding field is missing or empty.
    """
    ab = AbstractRetrieval(eid, view="FULL")
    X_data = {}

    # For novelty factor
    X_data['title'] = ab.title
    X_data['abstract'] = ab.abstract
    X_data['idxterms'] = ";".join(ab.idxterms) if ab.idxterms else ""

    # For Journal-related factor (not collected yet)
    # X_data['issn'] = ab.issn # (4)
    # X_data['jif_sjr'] = get_sjr_by_issn(issn, year)

    # For Paper-related factor
    # authors / affiliation can be None just like idxterms and funding, so
    # they get the same guard instead of crashing in len() or iteration.
    X_data['num_authors'] = len(ab.authors) if ab.authors else 0 # (6)
    X_data['num_institutions'] = len(ab.affiliation) if ab.affiliation else 0 # (7)
    X_data['num_refers'] = ab.refcount # (8)
    X_data['num_funding'] = len(ab.funding) if ab.funding else 0 # (9)

    # For Author-related factor (not collected yet)

    # All values are scalars, so an explicit one-element index is required
    # to build the frame.
    return pd.DataFrame(X_data, index=[eid])

In [136]:
# Location of the text file listing the training EIDs, one per line.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running on another machine.
train_eids_path = r"D:\BERT-based-Paper-Impact-Prediction\rsc\preparation_data\train_eids.txt"
# Read every training EID into memory for the retrieval loop that follows.
eids = get_eids(file_path=train_eids_path)

In [None]:
# Fetch one single-row frame per EID and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per iteration
# re-copies the whole accumulated frame on every step.
# An explicit loop (rather than a comprehension) keeps the rows fetched so
# far available in `biblio_frames` if a retrieval fails part-way through.
biblio_frames = []
for eid in tqdm(eids):
    biblio_frames.append(get_biblio_by_eid(eid))
# pd.concat raises ValueError on an empty list; fall back to the empty
# frame the previous accumulation produced for an empty EID list.
rst_df = pd.concat(biblio_frames) if biblio_frames else pd.DataFrame()

In [12]:
import pickle
import pandas as pd
from tqdm import tqdm
# from pybliometrics.scopus import ScopusSearch

In [62]:
# Load the pickled object that the cells below iterate with `data.values()`
# (presumably a dict keyed by ISSN whose values expose `.results` -- TODO
# confirm against the code that wrote this file).
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# load pickle files that come from a trusted source.
with open(r"D:\BERT-based-Paper-Impact-Prediction\rsc\preparation_data\ISSN_docData_dict_ai.pickle", 'rb') as f:
    data = pickle.load(f)
# df = pd.read_pickle(r'D:\BERT-based-Paper-Impact-Prediction\rsc\training_data\training_data_AE_full.pickle')

In [64]:
# Turn each entry's `.results` into a frame, then concatenate once.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated frame on every iteration.
result_frames = [pd.DataFrame(d.results) for d in tqdm(data.values())]
# pd.concat raises ValueError on an empty list; keep the empty-frame result
# for the case where `data` has no entries.
datasets = pd.concat(result_frames) if result_frames else pd.DataFrame()

100%|██████████| 233/233 [00:46<00:00,  5.01it/s]


In [67]:
# Derive a string `Year` column from the first four characters of
# `coverDate` (presumably formatted 'YYYY-MM-DD' -- TODO confirm).
# The column stays a string, so later year filters compare strings.
datasets['Year'] = datasets['coverDate'].str[:4]

In [71]:
# Export the EIDs of documents whose Year falls in 2005..2010 (inclusive),
# one EID per line with no trailing newline.
with open(r'D:\BERT-based-Paper-Impact-Prediction\rsc\preparation_data\AI_target_eids_2005-2010.txt', 'w') as f:
    # Year holds strings; the inclusive string comparison behind between()
    # orders four-character years the same way as their numeric values.
    in_target_window = datasets['Year'].between('2005', '2010')
    target_eids = datasets.loc[in_target_window, 'eid']
    f.write('\n'.join(target_eids))