### **Multidisciplinary Index**

**Reference:**
 - Zeng, An, et al. "Fresh teams are associated with original and multidisciplinary research." *Nature Human Behaviour* 5.10 (2021): 1314-1322.

In [None]:
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

import json, pickle as pkl
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy
from tqdm.contrib.concurrent import process_map
import shutil, os

from functools import partial
from tqdm import tqdm, trange
tqdm.pandas(ncols=100, mininterval=1)
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)


# Citation edges; the columns used below are `citing_paperid` and `cited_paperid`.
references = pd.read_feather('intermediate/citing_cited_paper_id_year.feather')

# Publication table. The first four characters of each id are stripped and the
# remainder is converted to an integer.
papers = pd.read_parquet('../parquet/processed/publications.parquet')
papers['id'] = papers['id'].str[4:].astype(int)

# Keep only papers whose date string is exactly 10 characters long
# (presumably full 'YYYY-MM-DD' dates -- confirm against the data), then attach
# that date to each citation edge through its citing paper. The left join keeps
# edges without a match; their `citing_paper_date` is missing.
has_ten_char_date = papers['date'].str.len() == 10
paper_to_date = papers.loc[has_ten_char_date, ['id', 'date']].rename(
    columns={'id': 'citing_paperid', 'date': 'citing_paper_date'}
)
references = references.merge(paper_to_date, how='left', on='citing_paperid')

# Dict structure: {cited_paperid: [[citing_paperid, citing_paper_date], ...]},
# each list ordered by citing date, oldest first. Edges whose citing paper has
# no date are dropped.
#
# `citing_paperid` is a secondary sort key so that citing papers sharing the
# same date always come out in the same order. Anything computed from
# *consecutive* citing papers would otherwise depend on the arbitrary tie order
# left by a non-stable single-column sort (and on the row order of the input).
paper_to_future_citation_sequence = (
    references
    .dropna(subset=["citing_paper_date"])
    .sort_values(by=["citing_paper_date", "citing_paperid"], ascending=True)
    .reset_index(drop=True)
    .groupby("cited_paperid")
    [["citing_paperid", "citing_paper_date"]]
    .apply(lambda x: x.values.tolist())
    .to_dict()
)


# Dict structure: {paper id: value of that paper's `date` column}
# NOTE(review): despite the "year" in the name, the values are the entries of
# `papers.date`, not integer years -- confirm before treating them as years.
dict_paper_id_to_year = papers.set_index('id')['date'].to_dict()

# Dict structure: {citing_paperid: [cited_paperid, ...]}
# i.e. the reference list of every citing paper present in `references`.
dict_citing_to_cited = (
    references.groupby('citing_paperid')['cited_paperid'].apply(list).to_dict()
)


def paper_id_to_multidisciplinary(
    focal_paper_id,
    *,
    min_citations=5,
    citation_sequences=None,
    citing_to_cited=None,
):
    """Score how unrelated the consecutive citing papers of a focal paper are.

    The citing papers of ``focal_paper_id`` are walked in their stored order.
    For every pair of consecutive citing papers, the pair counts as "disjoint"
    when the two reference lists have nothing in common once the focal paper
    itself is removed. The score is the fraction of disjoint pairs.

    Args:
        focal_paper_id: Id of the paper being scored.
        min_citations: Minimum number of citing papers required to compute a
            score. At least two are always required, because one pair is
            needed to form a fraction.
        citation_sequences: Mapping
            ``{cited id: [[citing id, citing date], ...]}``. Defaults to the
            module-level ``paper_to_future_citation_sequence``.
        citing_to_cited: Mapping ``{citing id: [cited id, ...]}``. Defaults to
            the module-level ``dict_citing_to_cited``.

    Returns:
        ``(focal_paper_id, score)`` where ``score`` is the mean of the
        per-pair disjoint flags, or ``None`` when the focal paper has too few
        citing papers (including when it is absent from the mapping).

    Raises:
        KeyError: If a citing paper has no entry in ``citing_to_cited``.
    """
    # Resolve the module-level tables lazily so the single-argument call used
    # by the parallel driver keeps working unchanged.
    if citation_sequences is None:
        citation_sequences = paper_to_future_citation_sequence
    if citing_to_cited is None:
        citing_to_cited = dict_citing_to_cited

    # Entries are [citing_paperid, citing_paper_date] pairs.
    citing_sequence = citation_sequences.get(focal_paper_id, [])
    if len(citing_sequence) < max(min_citations, 2):
        return (focal_paper_id, None)

    focal_only = {focal_paper_id}
    disjoint_flags = []

    # Carry the previous paper's reference set forward so each set is built
    # once, while holding only two sets in memory at a time.
    previous_refs = set(citing_to_cited[citing_sequence[0][0]]) - focal_only
    for citing_paper_id, _citing_date in citing_sequence[1:]:
        current_refs = set(citing_to_cited[citing_paper_id]) - focal_only
        disjoint_flags.append(previous_refs.isdisjoint(current_refs))
        previous_refs = current_refs

    return (focal_paper_id, np.mean(disjoint_flags))



# Every cited paper that has at least one dated citing paper gets scored.
focal_paper_ids = list(paper_to_future_citation_sequence)

# Create the output directory *before* the long parallel run, so a missing
# folder fails immediately instead of discarding all computed results.
os.makedirs('data', exist_ok=True)

# Papers are dispatched to the worker pool in batches of `batch_size`; the
# outer bar tracks batches, `process_map` shows progress inside each batch.
batch_size = 500_000
results = []
for idx in trange(0, len(focal_paper_ids), batch_size):
    results += process_map(
        paper_id_to_multidisciplinary,
        focal_paper_ids[idx:idx+batch_size],
        max_workers=20,
        chunksize=1000,
    )

# One row per focal paper; papers that could not be scored carry a missing value.
multidisciplinary_df = pd.DataFrame(results, columns=['paper_id', 'multidisciplinary'])
# Re-attach the "pub." prefix to the integer ids for export.
multidisciplinary_df["paper_id"] = "pub." + multidisciplinary_df["paper_id"].astype(str)

# The same table is written in three formats.
multidisciplinary_df.to_csv('data/multidisciplinary.tsv', sep='\t', index=False)
multidisciplinary_df.to_parquet("data/multidisciplinary.parquet")
multidisciplinary_df.to_feather("data/multidisciplinary.feather")