In [None]:
# Numerical / tabular stack and plotting.
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
# Let pandas display up to 200 rows and 100 columns before it switches to a truncated view.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

# Serialization, file discovery, containers and other standard-library helpers.
import json, pickle as pkl
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy
# process_map: parallel map over worker processes with a tqdm progress bar.
from tqdm.contrib.concurrent import process_map
import shutil, os

from functools import partial
from tqdm import tqdm, trange
# Register tqdm's pandas integration with the same bar settings used below.
# This must run BEFORE the rebinding on the next line: afterwards `tqdm` is a
# functools.partial object, which does not expose the `tqdm.pandas` attribute.
tqdm.pandas(ncols=100, mininterval=1)
# Rebind `tqdm` / `trange` so every later call defaults to ncols=100 and mininterval=1.
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

In [None]:
# One row per (citing paper, cited paper) pair, together with the citing paper's year.
references = pd.read_feather('intermediate/citing_cited_paper_id_year.feather')


def _collect_by_key(frame, key_column, value_column):
    """Return ``{key: [value, ...]}`` gathering ``value_column`` for each distinct ``key_column``."""
    return frame.groupby(key_column)[value_column].apply(list).to_dict()


# {citing_paperid: citing_year}
# A citing paper appears on one row per reference; the dict keeps the year from
# the last such row.  IDs and years are expected to be ints -- TODO confirm
# against the feather schema.
dict_paper_id_to_year = references.set_index('citing_paperid')['citing_year'].to_dict()

# {citing_paperid: [cited_paperid, ...]} -- the reference list of each citing paper.
Dict_Citing_to_Cited = _collect_by_key(references, 'citing_paperid', 'cited_paperid')

# {cited_paperid: [citing_paperid, ...]} -- the papers that cite each cited paper.
Dict_Cited_to_Citing = _collect_by_key(references, 'cited_paperid', 'citing_paperid')


def paper_id_to_multidisciplinary(focal_paper_id):
    """Score a focal paper by how rarely its successive citers share references.

    The papers citing ``focal_paper_id`` are read from ``Dict_Cited_to_Citing``,
    ordered by their year from ``dict_paper_id_to_year``, and each pair of
    chronologically adjacent citers is compared through their reference lists
    in ``Dict_Citing_to_Cited``.  A pair counts as "not sharing references"
    when the two lists have exactly one paper in common.

    Citing papers without a usable year (absent from the lookup, ``None``,
    ``NaN`` or ``pd.NA``) are dropped before ordering.  A plain ``is not None``
    check would let ``NaN`` through, where it silently breaks the sort order,
    and ``pd.NA`` would make the sort raise.

    Args:
        focal_paper_id: A key of ``Dict_Cited_to_Citing``.

    Returns:
        A tuple ``(focal_paper_id, score)``.  ``score`` is the mean of the
        per-pair booleans (a NumPy float in ``[0, 1]``), or ``None`` when fewer
        than two citing papers have a known year.

    Raises:
        KeyError: If ``focal_paper_id`` is not in ``Dict_Cited_to_Citing`` or a
            citing paper is not in ``Dict_Citing_to_Cited``.
    """
    dated_citers = []
    for citing_id in Dict_Cited_to_Citing[focal_paper_id]:
        year = dict_paper_id_to_year.get(citing_id)
        if not pd.isna(year):
            dated_citers.append((citing_id, year))

    if len(dated_citers) < 2:
        # No adjacent pair exists, so the score is undefined.
        return (focal_paper_id, None)

    # list.sort is stable: citers from the same year keep the order they have
    # in Dict_Cited_to_Citing.
    dated_citers.sort(key=lambda pair: pair[1])

    # Walk the ordered citers once, carrying the previous reference set along
    # so each set is built a single time.
    pair_flags = []
    previous_refs = set(Dict_Citing_to_Cited[dated_citers[0][0]])
    for citing_id, _ in dated_citers[1:]:
        current_refs = set(Dict_Citing_to_Cited[citing_id])
        # Both papers cite the focal paper, so (with both lookup tables built
        # from the same reference rows) it is always in the intersection; an
        # overlap of exactly 1 therefore means nothing else is shared.
        pair_flags.append(len(previous_refs & current_refs) == 1)
        previous_refs = current_refs

    return (focal_paper_id, np.mean(pair_flags))


# Every paper that is cited at least once is scored.
focal_paper_ids = list(Dict_Cited_to_Citing)

batch_size = 500_000
results = []
# Feed the ids to the worker pool one slice at a time; the outer bar counts slices,
# while process_map draws its own bar for the items inside each slice.
for batch_start in trange(0, len(focal_paper_ids), batch_size):
    batch_ids = focal_paper_ids[batch_start:batch_start + batch_size]
    results.extend(
        process_map(paper_id_to_multidisciplinary, batch_ids, max_workers=20, chunksize=1000)
    )

# Collect the (paper_id, score) tuples and prefix each id with "pub." as a string.
multidisciplinary_df = (
    pd.DataFrame(results, columns=['paper_id', 'multidisciplinary'])
    .assign(paper_id=lambda frame: "pub." + frame["paper_id"].astype(str))
)

# Write the same table in both formats.
multidisciplinary_df.to_csv('data/multidisciplinary.tsv', sep='\t', index=False)
multidisciplinary_df.to_feather("data/multidisciplinary.feather")