# Imports

In [1]:
import networkx as nx
import pickle
import numpy as np
import json
import multiprocessing
from tqdm import tqdm
from datasets.preprocess import mask_citations, tokenize_and_index
# Load the pickled citation graph into the global `G` that the later cells read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only run this against a graph file you produced yourself or otherwise trust.
graph_path = '../data/graph_lda_arxiv.gpickle'
with open(graph_path, 'rb') as graph_file:
    G = pickle.load(graph_file)

# papers.json

In [2]:
from multiprocessing import Pool

# Per-node helper: look up one node's in-degree on the module-level graph `G`.
def calculate_in_degree(node):
    """Return the pair ``(node, in_degree)`` for ``node``, read from the global ``G``."""
    degree = G.in_degree(node)
    return (node, degree)

# Calculate in-degree for all nodes of the given graph
def calculate_in_degrees_parallel(graph):
    """Return a ``{node: in_degree}`` dict covering every node of ``graph``.

    The earlier implementation fanned the per-node lookup out to a pool of
    100 processes, and its worker read the module-level ``G`` rather than the
    ``graph`` argument, so passing any other graph gave wrong or failing
    results. The in-degree lookup is cheap, so the degrees are now read
    directly from ``graph`` in one pass. The function name is kept so existing
    callers continue to work.

    Args:
        graph: a directed graph whose ``in_degree()`` call, with no arguments,
            yields ``(node, in_degree)`` pairs.

    Returns:
        dict mapping each node to its in-degree; empty for an empty graph.
    """
    return dict(graph.in_degree())

# Build the {node: in_degree} lookup for the whole graph. process_node reads it
# later to fill each paper's 'total_citations' field.
in_degrees = calculate_in_degrees_parallel(G)

In [None]:
# Inspect the attribute dict stored on one example node.
G.nodes['2009.06394']

In [3]:
# Collect every distinct author name in the graph. Each 'authors_parsed' entry
# is reversed and joined with single spaces to form the name string.
authors_all = set()
for node in G.nodes:
    for author_info in G.nodes[node].get('authors_parsed', []):
        authors_all.add(' '.join(reversed(author_info)))
print(len(authors_all))

504661


In [4]:
# Map each author name to an integer id.
# The names are sorted first so the ids are reproducible: iterating a set of
# strings directly yields an order that can change between interpreter sessions
# (string hashing is randomized), which would give the same author a different
# id every time the notebook is re-run.
authors_all_dict = {name: idx for idx, name in enumerate(sorted(authors_all))}

In [5]:
from datasets.preprocess import tokenize_and_index
# Build the token -> row-index lookup from the embeddings file.
# The first line of the file is skipped; on every following line the first
# whitespace-separated field is the token, and its index is the 0-based
# position of that line after the skipped one. If a token repeats, the last
# occurrence wins.
with open('resources/ai2_embeddings.txt') as embeddings_file:
    next(embeddings_file)  # skip the first line
    tokens = [line.split()[0] for line in embeddings_file]

word2index = {token: row for row, token in enumerate(tokens)}
c = len(tokens)  # number of embedding rows read
vocabulary = set(word2index)

In [13]:
# Define a function to process a node
def process_node(node):
    """Build the papers.json record for one graph node.

    Reads the module-level ``G``, ``authors_all_dict``, ``word2index``,
    ``vocabulary`` and ``in_degrees``.

    Args:
        node: id of a node in ``G``.

    Returns:
        ``{node: record}`` when the node has a non-empty 'abstract' and
        'title'; ``{}`` when the node is skipped; ``None`` when building the
        record fails (the error is printed).
    """
    node_data = G.nodes[node]
    json_data = {}
    try:
        if node_data and node_data.get('abstract') and node_data.get('title'):
            title = node_data.get('title')
            abstract = node_data.get('abstract')
            authors = [' '.join(reversed(author_info)) for author_info in node_data.get('authors_parsed', [])]

            # Keep only authors that have an id; look each name up once.
            authors_ids = []
            for author_name in authors:
                author_id = authors_all_dict.get(author_name)
                if author_id is not None:
                    authors_ids.append(author_id)

            json_data[node] = {
                'preprocessed_abstract': tokenize_and_index(abstract, word2index, vocabulary, split_sentences=True),
                'preprocessed_title': tokenize_and_index(title, word2index, vocabulary),
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'citations_per_year': {},
                'authors_ids': authors_ids,
                'total_citations': in_degrees.get(node, 0)
            }
        return json_data
    except Exception as e:
        # `node` is the node id, not the attribute dict: calling node.get(...)
        # here raised AttributeError inside the handler and hid the real error.
        print(node, node_data.get('title'), e)
        return None

# Run process_node over every node of G in a worker pool, with a progress bar.
# The pool is managed by a `with` block so the workers are torn down even if
# the loop raises; previously an error left the worker processes running.
results = []
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    with tqdm(total=len(G.nodes)) as pbar:
        for result in pool.imap_unordered(process_node, G.nodes):
            results.append(result)
            pbar.update()

    # Every result has been consumed: stop accepting tasks and wait for the
    # workers to exit cleanly before the context manager terminates the pool.
    pool.close()
    pool.join()

100%|██████████| 479645/479645 [02:28<00:00, 3240.45it/s]


In [6]:
# Load the paper records from arxiv/papers.json into `papers`.
# NOTE(review): the execution counts suggest this cell was run in a different
# session from the one that writes the file — on a fresh kernel the file must
# already exist.
with open('arxiv/papers.json') as papers_file:
    papers = json.load(papers_file)

In [7]:
# Set of paper ids present in papers.json, for fast membership tests.
papers_ids_dataset = set(papers)

In [None]:
# Merge the per-node results into a single {paper_id: record} dict, and record
# which paper ids made it into the dataset.
# Empty ({}) and None results carry no record, so they are skipped explicitly
# instead of triggering an IndexError that was caught and printed once per
# skipped node.
json_data = {}
paper_ids_data = {}
for result in results:
    if not result:
        continue
    for paper_id, record in result.items():
        json_data[paper_id] = record
        paper_ids_data[paper_id] = True

In [24]:
# Write the merged paper records to disk as one JSON document.
json_file_path = 'arxiv/papers.json'

with open(json_file_path, 'w') as out_file:
    json.dump(json_data, out_file)

print("JSON data saved to", json_file_path)


JSON data saved to arxiv/papers.json


# contexts.json

In [2]:
# Inspect the attribute dict on one example edge: it holds a 'meta' list whose
# items carry the citation context ('heading' and 'text').
G['2009.06394']['1507.07688']

{'meta': [{'heading': 'Altruism',
   'text': 'Extensive previous and ongoing work has been dedicated to estimating reward functions and interactive parameters (Albrecht, Crandall, and Ramamoorthy 2016;Albrecht and Stone 2019;Albrecht et al. 2020;Schwarting et al. 2019). In this work we presume that the "true" reward matrix {(r mn1 , r mn2 )} 0<m≤M,0<n≤N , and altruism values α 1 , α 2 are known to both agents. Each agent can then, independently, construct the reward matrix {(r * mn1 , r * mn2 )} 0<m≤M,0<n≤N , which they will use to choose which intention to follow.'}]}

In [8]:
# Materialise every (source, target, attrs) edge triple once, so the list can
# be sized for the progress bar and handed to the worker pool.
edges = [edge for edge in G.edges(data=True)]
# Small slice kept for quick experiments.
sample_edges = edges[0:10]

In [9]:

from datasets.preprocess import mask_citations

def process_edge(edge):
    """Build the contexts.json entries for every citation context on one edge.

    Reads the module-level ``papers_ids_dataset``, ``word2index`` and
    ``vocabulary``, and calls ``mask_citations`` / ``tokenize_and_index``.

    Args:
        edge: a ``(source, target, data)`` triple where ``data['meta']`` is a
            sequence of dicts that each carry a 'text' entry.

    Returns:
        dict keyed by ``"{source}_{target}_{i}"`` (``i`` is the context's
        position in ``data['meta']``); ``{}`` when either endpoint is not in
        ``papers_ids_dataset``; ``None`` when the edge itself is malformed.
    """
    try:
        source, target, data = edge
        contexts = list(data['meta'])
    except (TypeError, ValueError, KeyError):
        return None

    # The endpoint check does not depend on the context, so do it once.
    if source not in papers_ids_dataset or target not in papers_ids_dataset:
        return {}

    result = {}
    for i, item in enumerate(contexts):
        try:
            text = item['text']
            if text is None:
                continue
            masked_text = mask_citations(text, dataset='acl')
            preprocessed_text = tokenize_and_index(masked_text, word2index, vocabulary)
            # Raises ValueError when the target-citation marker is absent.
            tc_index = preprocessed_text.index(word2index['targetcit'])
        except Exception:
            # Best effort: drop only this context. Previously a single bad
            # context silently discarded every context on the edge.
            continue

        result[f"{source}_{target}_{i}"] = {
            "preprocessed": preprocessed_text,
            "masked_text": masked_text,
            "tc_index": tc_index,
            "citation_context": text,
            "ref_id": target,
            "citing_id": source
        }
    return result

In [13]:

# Leave 20 cores free for other work, but never request fewer than one worker:
# on a machine with 20 cores or fewer, `cpu_count() - 20` is zero or negative
# and multiprocessing.Pool raises ValueError.
n_workers = max(1, multiprocessing.cpu_count() - 20)

# Run process_edge over every edge in a worker pool, with a progress bar.
# The `with` block makes sure the workers are torn down even if the loop raises.
results = []
with multiprocessing.Pool(processes=n_workers) as pool:
    with tqdm(total=len(edges)) as pbar:
        for result in pool.imap_unordered(process_edge, edges):
            results.append(result)
            pbar.update()

    # Every result has been consumed: stop accepting tasks and wait for the
    # workers to exit cleanly before the context manager terminates the pool.
    pool.close()
    pool.join()


100%|██████████| 4452945/4452945 [43:10<00:00, 1719.13it/s]  


In [31]:
# Merge the per-edge results into a single {context_id: record} dict.
# Edges that produced nothing ({} or None) are skipped explicitly; previously
# each None result raised AttributeError, which was caught and printed.
json_data = {}
for result in results:
    if result:
        json_data.update(result)

In [33]:
# Set of all context ids collected so far.
citations_dataset = set(json_data)

In [None]:
# Write the merged citation contexts to disk as one JSON document.
json_file_path = 'arxiv/contexts.json'

with open(json_file_path, 'w') as out_file:
    json.dump(json_data, out_file)

print("JSON data saved to", json_file_path)


In [2]:
# Load the citation contexts from arxiv/contexts.json into `contexts`.
with open('arxiv/contexts.json') as contexts_file:
    contexts = json.load(contexts_file)

In [16]:
# Rename the 'preprocessed_text' field to 'preprocessed' in every context.
# Entries that already use 'preprocessed' are left untouched, so the cell can
# be re-run (or run on a file that already has the new key) without raising
# KeyError.
for value in contexts.values():
    if 'preprocessed_text' in value:
        value['preprocessed'] = value.pop('preprocessed_text')

In [18]:
# Persist the contexts (after the key rename) under a new file name.
with open('arxiv/contexts_new.json', 'w') as out_file:
    json.dump(contexts, out_file)

# Train / validation / test data

In [3]:
# All context ids, as a list (in the dict's iteration order).
citations_dataset_list = list(contexts)

In [4]:
# Show one context id to check its format.
citations_dataset_list[0]

'2103.04556_1609.00451_0'

In [5]:
# Membership test of a bare paper id against the list of context ids.
# NOTE(review): context ids look like '2103.04556_1609.00451_0', so a bare
# paper id presumably can never match — confirm what this check was meant to
# verify.
'quant-ph/0611021' in citations_dataset_list

False

In [6]:
import multiprocessing

def process_key(key):
    """Turn one context id into a record for the train/val/test split.

    ``key`` must split on '_' into exactly three parts; the middle part is
    used as the cited paper id, and its 'date' is read from the global ``G``.

    Returns:
        dict with 'context_id', 'paper_id' and 'date'.
    """
    _citing, cited, _position = key.split('_')
    record = {'context_id': key}
    record['paper_id'] = cited
    record['date'] = G.nodes[cited]['date']
    return record

# Map process_key over every context id in a worker pool.
# The `with` block makes sure the workers are torn down even if the map
# raises; previously an error left the worker processes running.
with multiprocessing.Pool() as pool:
    results = pool.map(process_key, citations_dataset_list)

    # The map has returned: stop accepting tasks and wait for the workers to
    # exit cleanly before the context manager terminates the pool.
    pool.close()
    pool.join()

# pool.map already returns one ordered list, so no further merging is needed.
result = results

In [2]:
import json

# Build one record per citation context on every edge of G: the context id,
# the cited paper id, and the cited paper's date.
# NOTE(review): this rebuilds `result` from all edges of G rather than from the
# keys of contexts.json, replacing the list produced by the process_key cell —
# confirm that including contexts absent from contexts.json is intended.
result = []
for source, target, data in G.edges(data=True):
    for i, _ in enumerate(data['meta']):
        result.append({
            'context_id': f'{source}_{target}_{i}',
            'paper_id': target,
            'date': G.nodes[target]['date'],
        })

print(len(result))



8400564


In [9]:
# Inspect the attribute dict on one example edge (a 'meta' list of contexts).
G['2009.06394']['1507.07688']

{'meta': [{'heading': 'Altruism',
   'text': 'Extensive previous and ongoing work has been dedicated to estimating reward functions and interactive parameters (Albrecht, Crandall, and Ramamoorthy 2016;Albrecht and Stone 2019;Albrecht et al. 2020;Schwarting et al. 2019). In this work we presume that the "true" reward matrix {(r mn1 , r mn2 )} 0<m≤M,0<n≤N , and altruism values α 1 , α 2 are known to both agents. Each agent can then, independently, construct the reward matrix {(r * mn1 , r * mn2 )} 0<m≤M,0<n≤N , which they will use to choose which intention to follow.'}]}

In [8]:
from datetime import datetime

def _record_date(item):
    """Parse a record's 'date' string (format YYYY-MM-DD) for sorting."""
    return datetime.strptime(item['date'], '%Y-%m-%d')

# Keep only records whose cited paper id starts with a digit (presumably this
# drops old-style ids such as 'quant-ph/0611021' — verify), then order the
# records from oldest to newest cited-paper date.
result = [item for item in result if item['paper_id'][0].isdigit()]
result.sort(key=_record_date)

In [5]:
# Chronological 70 / 15 / 15 split: `result` is already sorted by date, so the
# earliest records go to train, the next slice to validation and the rest to
# test (which also absorbs any rounding remainder).
total_size = len(result)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
val_end = train_size + val_size

train_data, val_data, test_data = (
    result[:train_size],
    result[train_size:val_end],
    result[val_end:],
)

# Report how many records landed in each split.
print(f"Total size: {len(result)}")
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Total size: 8303616
Train size: 5812531
Validation size: 1245542
Test size: 1245543


In [10]:
# Build the validation records: 'paper_id' is renamed to 'true_ref' and a fixed
# 'neg_ref' is added. '2206.06468' is the isolated node (no incoming or
# outgoing edges) printed by the "for val.json neg_ref" cell.
# New dicts are created instead of popping 'paper_id' from the val_data records
# in place, so the cell can be re-run without KeyError and the records shared
# with `result` keep their 'paper_id'.
modified_Val_data = []
for obj in val_data:
    record = {field: value for field, value in obj.items() if field != 'paper_id'}
    record['true_ref'] = obj['paper_id']
    record['neg_ref'] = '2206.06468'
    modified_Val_data.append(record)

In [14]:
# Print the first and last cited-paper date of each split as a sanity check on
# the chronological ordering.
for label, date in (
    ('train start date', train_data[0]['date']),
    ('train end date', train_data[-1]['date']),
    ('val start date', modified_Val_data[0]['date']),
    ('val end date', modified_Val_data[-1]['date']),
    ('test start date', test_data[0]['date']),
    ('test end date', test_data[-1]['date']),
):
    print(label, date)

train start date 2007-05-23
train end date 2019-12-05
val start date 2019-12-05
val end date 2021-01-11
test start date 2021-01-11
test end date 2023-01-27


In [11]:
# Write each split to its own pretty-printed JSON file: train, then validation
# (in its true_ref / neg_ref form), then test.
for split_path, split_records in (
    ('arxiv/train_data.json', train_data),
    ('arxiv/val_data.json', modified_Val_data),
    ('arxiv/test_data.json', test_data),
):
    with open(split_path, 'w') as split_file:
        json.dump(split_records, split_file, indent=4)


In [6]:
## for val.json neg_ref

# Print the first node that has neither incoming nor outgoing edges, then stop.
for node in G.nodes():
    if G.in_degree(node) != 0:
        continue
    if G.out_degree(node) != 0:
        continue
    print(node)
    break

2206.06468
