In [None]:
import pandas as pd
import requests
import json
import string 
import numpy as np
import os
from tqdm import tqdm
import networkx as nx
import seaborn as sns
from scipy.spatial import distance

In [None]:
# Load the Scopus metadata export; `lines=True` parses the file as JSON Lines
# (one record per line).
df = pd.read_json('../data/scopus_complete_metadata_final.jsonl',lines = True)

In [None]:
# Render the loaded frame as the cell output for a quick visual check.
df

In [None]:
# Query the Semantic Scholar Graph API batch endpoint for every row of `df` that
# has a `paperId`, one full batch of `batch_size` ids per request. A trailing
# remainder smaller than `batch_size` is not requested by this loop.
fields = ['title', 'year', 'abstract', 'citations.paperId', 'references.paperId', 'citationCount']
paper_fields = '?fields=' + ','.join(fields)
paper_url = 'https://api.semanticscholar.org/graph/v1/paper/{}'

# Credentials must never live in the notebook: the key (S2 provides API keys for
# increased limits) is read from the S2_API_KEY environment variable. When it is
# not set, no key header is sent.
api_key = os.environ.get('S2_API_KEY')
header = {'x-api-key': api_key} if api_key else {}

# Ids and their row labels are taken from the same filtered series so that
# position k in `doi_list` always corresponds to position k in `index_list`.
papers_with_id = df['paperId'].dropna()
doi_list = list(papers_with_id)
index_list = list(papers_with_id.index)

batch_size = 50
num_batches = len(doi_list) // batch_size  # full batches only
url = paper_url.format('batch') + paper_fields

meta_data = []
failed_batches = []  # (batch number, reason) for every request that yielded nothing
for batch_num in tqdm(range(num_batches)):
    start, stop = batch_num*batch_size, (batch_num+1)*batch_size
    index = index_list[start:stop]
    query = {"ids": doi_list[start:stop]}
    try:
        r = requests.post(url, json = query, headers = header, timeout = 60)
        r.raise_for_status()
        res = r.json()
    except (requests.RequestException, ValueError) as err:
        # Keep going on a failed batch, but remember it instead of silently dropping it.
        failed_batches.append((batch_num, repr(err)))
        continue
    if not isinstance(res, list):
        failed_batches.append((batch_num, 'unexpected payload type {}'.format(type(res).__name__)))
        continue
    # Pair each returned record with the row label of the id that requested it.
    for original_index, paper in zip(index, res):
        if paper:  # falsy entries (e.g. null) are skipped
            paper.update({'original_index': original_index})
            meta_data.append(paper)

if failed_batches:
    print('{} of {} batches failed and were skipped: {}'.format(
        len(failed_batches), num_batches, failed_batches))

In [None]:
# Fetch the trailing partial batch that the full-batch loop leaves unprocessed.
if len(doi_list)%batch_size != 0:
    # Derive the offset from the number of full batches rather than from the
    # loop variable of the previous cell, which is undefined when there were
    # no full batches at all.
    tail_start = num_batches*batch_size
    index = index_list[tail_start:]
    query = {"ids": doi_list[tail_start:]}
    url = paper_url.format('batch') + paper_fields
    r = requests.post(url, json = query, headers = header)
    # Fail with the HTTP error itself instead of indexing into an error payload.
    r.raise_for_status()
    res = r.json()
    # Pair each returned record with the row label of the id that requested it.
    for original_index, paper in zip(index, res):
        if paper:  # falsy entries (e.g. null) are skipped
            paper.update({'original_index': original_index})
            meta_data.append(paper)

In [None]:
# Keep the API results in their own frame. The join cell further down reads both
# the source frame `df` and `add_df` (which carries `original_index`), so `df`
# must not be overwritten here and `add_df` has to be defined.
add_df = pd.DataFrame(meta_data)

In [None]:
# Render `df` as the cell output to inspect it before the join.
df

In [None]:
# Join `add_df`, re-indexed by its `original_index` column, onto `df` by index.
# Column names present in both frames get an `_add` suffix on the `add_df` side.
add_by_original_index = add_df.set_index('original_index')
full_df = df.join(add_by_original_index, rsuffix='_add')

In [None]:
# Keep only the rows where both sides of the join agree on the paper id.
same_paper = full_df['paperId'].eq(full_df['paperId_add'])
final_df = full_df.loc[same_paper]

In [None]:
# Persist the filtered frame as JSON Lines (one record per line) in the working directory.
final_df.to_json('meta_data.jsonl', orient = 'records', lines = True)

In [None]:
# Reload the saved records; from here on `final_df` is the frame parsed from
# 'meta_data.jsonl', not the in-memory result of the filter.
final_df = pd.read_json('meta_data.jsonl', lines = True)

In [None]:
# Show one randomly drawn row; no random_state is passed, so the row shown
# can differ between runs.
final_df.sample()

In [None]:
def _in_set_ids(entries, known_ids):
    """Return the set of ids in `entries` that also belong to `known_ids`.

    `entries` may hold plain id strings or `{'paperId': ...}` dicts (the nested
    form requested via `references.paperId` / `citations.paperId`). A missing
    value (None or a float such as NaN) yields an empty set instead of raising.
    """
    if entries is None or isinstance(entries, float):
        return set()
    candidate_ids = (entry.get('paperId') if isinstance(entry, dict) else entry
                     for entry in entries)
    return {paper_id for paper_id in candidate_ids if paper_id in known_ids}


# Restrict each paper's references and citations to papers that are themselves
# rows of `final_df`.
paper_ids = set(final_df['paperId'])
final_df.loc[:,'in_set_refs'] = final_df['references'].map(lambda refs: _in_set_ids(refs, paper_ids))
final_df.loc[:,'in_set_cits'] = final_df['citations'].map(lambda cits: _in_set_ids(cits, paper_ids))

In [None]:
def _merge_neighbours(row):
    """Combine a row's in-set references and in-set citations into one set."""
    return row['in_set_refs'] | row['in_set_cits']


# A paper's neighbours are the papers it references plus the papers citing it.
final_df.loc[:,'adjacencies'] = final_df.apply(_merge_neighbours, axis = 1)

In [None]:
# Build the undirected graph: one node per paper (tagged with its year),
# one edge per entry in the paper's `adjacencies`.
papers_by_id = final_df.set_index('paperId')
G_total = nx.Graph()

# First pass: register every paper so each node carries a `year` attribute.
for paper_id, row in papers_by_id.iterrows():
    G_total.add_node(paper_id, year = row['year'])

# Second pass: connect each paper to its neighbours.
for paper_id, row in papers_by_id.iterrows():
    G_total.add_edges_from((paper_id, neighbour) for neighbour in row['adjacencies'])

In [None]:
# Show the node and edge counts of the full graph as a (nodes, edges) tuple.
G_total.number_of_nodes(), G_total.number_of_edges()

In [None]:
# For each cutoff year from 2021 down to 2014, keep the subgraph induced by
# the nodes whose `year` attribute is not later than the cutoff.
subgraphs = {}
for cutoff in range(2021, 2013, -1):
    print(cutoff)
    nodes_up_to_cutoff = [
        node
        for node, node_year in G_total.nodes(data='year')
        if node_year <= cutoff
    ]
    subgraphs[cutoff] = G_total.subgraph(nodes_up_to_cutoff)

In [None]:
# Show the node count of the subgraph stored under key 2017.
subgraphs[2017].number_of_nodes()

In [None]:
# Print key, node count and edge count for every stored subgraph.
for cutoff, graph in subgraphs.items():
    print(cutoff, graph.number_of_nodes(), graph.number_of_edges())

In [None]:
# Write the `scibert_cls` column as a single JSON object keyed by paperId.
with open('scibert_embeddings.json', 'w') as outfile:
    scibert_by_paper = final_df.set_index('paperId')['scibert_cls']
    json.dump(scibert_by_paper.to_dict(), outfile)

In [None]:
# Write the `scibert_cls` column as JSON Lines.
# NOTE(review): with orient='records' on a Series the paperId index is presumably
# not written, leaving only the values in row order - confirm this is intended.
scibert_column = final_df.set_index('paperId')['scibert_cls']
scibert_column.to_json('scibert_embeddings.jsonl', orient='records', lines=True)

In [None]:
# Write one GEXF file per stored subgraph. The output directory is created
# first so the export does not fail when it does not exist yet.
os.makedirs('citation_graphs', exist_ok=True)
for y in subgraphs:
    nx.write_gexf(subgraphs[y],'citation_graphs/{}.gexf'.format(y))

In [None]:
# Relabel the nodes of the 2014 subgraph with integers; the previous label of
# each node is kept in its `paperId` attribute.
G_i = nx.convert_node_labels_to_integers(subgraphs[2014], label_attribute = 'paperId')

In [None]:
# Show the nodes of `G_i` together with their attribute dicts.
G_i.nodes(data = True)