### **Rao-Stirling Diversity Index**

The Rao-Stirling diversity index measures the interdisciplinarity of a paper based on the variety and balance of fields represented in its references, weighted by the cognitive distance between those fields.

For each paper, we:
1. Identify the fields of all cited references
2. Calculate the proportion of references in each field (variety and balance)
3. Compute pairwise distances between fields based on their cognitive dissimilarity
4. Calculate the Rao-Stirling index as the sum of (proportion_i × proportion_j × distance_ij) across all field pairs

This approach captures both the diversity of fields cited and how cognitively distant those fields are from each other. Papers citing references from many distant fields will have higher Rao-Stirling scores.

**Reference:**
 - Park, Minsu, et al. "Interdisciplinary Papers Supported by Disciplinary Grants Garner Deep and Broad Scientific Impact." arXiv preprint arXiv:2303.14732 (2023).

In [1]:
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

import json, pickle as pkl
from unidecode import unidecode
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy
from pathlib import Path

from functools import partial
from tqdm import tqdm, trange
# Register the `progress_apply` / `progress_map` methods on pandas objects.
# This must run before `tqdm` is rebound below: a functools.partial object
# does not expose the `.pandas` attribute of the wrapped class.
tqdm.pandas(ncols=100, mininterval=1)
# Rebind both names so every later progress bar shares the same width and refresh interval.
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

import os, shutil

In [2]:
# Load the ANZSRC Fields-of-Research table once and derive, for each level,
# the ordered list of unique codes plus a code -> position lookup.
anzsrc_for = pd.read_csv('intermediate/ANZSRC_FoR.tsv', sep='\t')

field_l0_list = list(anzsrc_for['Field L0 Code'].drop_duplicates())
field_l0_dict = {code: idx for idx, code in enumerate(field_l0_list)}

field_l1_list = list(anzsrc_for['Field L1 Code'].drop_duplicates())
field_l1_dict = {code: idx for idx, code in enumerate(field_l1_list)}

In [None]:
# Load the processed publication table and the two paper-field tables
# (the file names indicate field levels 0 and 1).
paper_field_l1 = pd.read_parquet(path='../parquet/processed/paper_fields_l1.parquet')
paper_field_l0 = pd.read_parquet(path='../parquet/processed/paper_fields_l0.parquet')
papers = pd.read_parquet(path='../parquet/processed/publications.parquet')

: 

In [None]:
# Load the citing -> cited table from its feather file.
citing_to_cited_df = pd.read_feather(path='intermediate/dict_citing_to_cited_id_and_year.feather')
# NOTE: a disabled earlier step kept only the first element of every entry in
# each `cited_list`, built a dict keyed by `citing_paperid`, and printed its
# length. No later cell in this notebook refers to that dict.

In [None]:
# Build `Papers_dict`: id_INT -> (publication year, L0 field index, L1 field index).
#
# `.copy()` makes `papers2` an independent frame, so the column assignments
# below do not write into a slice of `papers` (pandas SettingWithCopyWarning).
papers2 = papers[['id_INT', 'date_normal', 'field_l0', 'field_l1']].copy()
# The first four characters of `date_normal` are taken as the year.
papers2['year'] = papers2['date_normal'].progress_map(lambda x: int(x[:4]))
# Keep only the first listed field of each paper and translate its code into
# an integer position with the code -> index dicts. The plain lists
# (`field_l0_list` / `field_l1_list`) are not valid `Series.map` mappers, and
# later cells use these values as array indices.
# NOTE(review): `x[0]` assumes each entry is already a sequence of codes (see
# the cell that splits the '|'-separated strings) — confirm the cell order.
papers2['field_l0'] = papers2['field_l0'].progress_map(lambda x: x[0]).map(field_l0_dict)
papers2['field_l1'] = papers2['field_l1'].progress_map(lambda x: x[0]).map(field_l1_dict)

# Column-wise zip avoids a Python-level call per row and keeps each column's
# own dtype instead of upcasting a whole row to a common type.
Papers_dict = dict(zip(
    papers2['id_INT'],
    zip(papers2['year'], papers2['field_l0'], papers2['field_l1']),
))

In [None]:
# Turn the '|'-separated field-code strings into lists of ints.
for level_col in ('field_l0', 'field_l1'):
    papers[level_col] = papers[level_col].map(lambda codes: [int(code) for code in codes.split('|')])

# Drop papers lacking a reference count or a date ...
has_required = papers.num_references.notna() & papers.date_normal.notna()
papers = papers[has_required]
# ... then keep only papers with at least 5 references.
papers = papers[papers['num_references'].astype(int) >= 5]
print(papers.shape)

In [None]:
%%time

# Count field-to-field citations per citing year:
#   FieldL1CiteMat[citing year, citing L1 field, cited L1 field] += 1
# for every citation whose two endpoints are both present in `Papers_dict`.
# NOTE: `CitingCited` is loaded in a later cell of this notebook; run that cell first.
# The field axes are sized from `field_l1_dict`, the L1 code -> index lookup.
n_l1_fields = len(field_l1_dict)
FieldL1CiteMat = np.zeros([2030, n_l1_fields, n_l1_fields])  # first axis is indexed by the year itself
missing_paper = (None, None, None)
for x, y in tqdm(CitingCited[['id_INT', 'reference_INT']].to_numpy()):
    x_t, _, x_fs = Papers_dict.get(x, missing_paper)
    y_t, _, y_fs = Papers_dict.get(y, missing_paper)
    # Skip citations where either the citing or the cited paper is unknown.
    if x_t is None or y_t is None:
        continue
    FieldL1CiteMat[x_t, x_fs, y_fs] += 1

np.save('../tsv/FieldL1CiteMat.npy', FieldL1CiteMat)

In [None]:
import scipy

In [None]:
# A bare `import scipy` does not guarantee that the `scipy.spatial` subpackage
# is loaded on every SciPy version, so import the function explicitly.
from scipy.spatial.distance import cosine

# For every year, FieldL1CiteDistance[year, x, y] is the cosine distance between
# row x (citations made by field x) and column y (citations received by field y)
# of that year's citation-count matrix.
# NOTE(review): a row is compared with a column, so the result is not symmetric
# in (x, y) — confirm this is intended rather than a row-vs-row comparison.
FieldL1CiteDistance = np.zeros_like(FieldL1CiteMat)
n_fields = FieldL1CiteMat.shape[1]
for year in trange(1800, 2030):
    year_counts = FieldL1CiteMat[year]
    for x in range(n_fields):
        citing_profile = year_counts[x, :]
        for y in range(FieldL1CiteMat.shape[2]):
            FieldL1CiteDistance[year, x, y] = cosine(citing_profile, year_counts[:, y])

# Force every field to be at distance 0 from itself, in all years.
diag = np.arange(n_fields)
FieldL1CiteDistance[:, diag, diag] = 0

In [None]:
# Write the per-year field distance array to disk.
np.save(file='../tsv/FieldL1Distance.npy', arr=FieldL1CiteDistance)

In [None]:
# Read the per-year field distance array back from disk.
FieldL1CiteDistance = np.load(file='../tsv/FieldL1Distance.npy')

In [None]:
%%time
# Read the citation edge list, then derive integer ids by dropping the first
# four characters of each string id.
CitingCited = pd.read_csv('../tsv/PaperReferences.tsv', sep='\t')
for int_col, id_col in (('id_INT', 'id'), ('reference_INT', 'reference_ids')):
    CitingCited[int_col] = CitingCited[id_col].str.slice(4).astype(int)

In [None]:
# One row per citing paper, with all of its reference ids gathered into a list.
refs_by_citing = CitingCited.groupby('id_INT')['reference_INT']
Citing2CitedList = refs_by_citing.apply(list).reset_index()

In [None]:
# For each citing paper, look up (year, L0 field, L1 field) of every reference.
# A reference missing from `Papers_dict` contributes None to all three tuples.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for every later cell of the notebook.
citation_rows = []
for citing_id, ref_ids in tqdm(Citing2CitedList.to_numpy()):
    ref_years, ref_fields_l0, ref_fields_l1 = zip(
        *(Papers_dict.get(ref_id, (None, None, None)) for ref_id in ref_ids)
    )
    citation_rows.append([citing_id, ref_years, ref_fields_l0, ref_fields_l1])

citation_df = pd.DataFrame(citation_rows, columns=['id_INT', 'refYear', 'refFieldL0', 'refFieldL1'])

In [None]:
# Write the per-paper reference years and fields to parquet.
citation_df.to_parquet(path='/kellogg/proj/dashun/dimensions/data_dump/20230910/tsv/Intermediate/paper2RefYearFields.parquet')

In [None]:
# Read the per-paper reference years and fields back from parquet.
citation_df = pd.read_parquet(path='/kellogg/proj/dashun/dimensions/data_dump/20230910/tsv/Intermediate/paper2RefYearFields.parquet')

# Rao-Stirling Index

In [None]:
# Attach each citing paper's own year, taken from the first slot of its
# `Papers_dict` entry (None when the paper is not in the dict).
citation_df['year'] = citation_df['id_INT'].progress_map(
    lambda paper_id: Papers_dict.get(paper_id, [None, None, None])[0]
)

In [None]:
# Keep only the citing papers whose year is known.
has_year = citation_df['year'].notna()
citation_df = citation_df[has_year]

In [None]:
def paper2RSIndex(refFieldL1, year, distance=None):
    """Return the Rao-Stirling diversity of one paper's reference list.

    Args:
        refFieldL1: iterable of L1 field indices, one per reference. Entries
            that are None, NaN or negative are ignored.
        year: year of the citing paper; selects ``distance[int(year)]``.
        distance: array indexed as [year, field, field]. Defaults to the
            module-level ``FieldL1CiteDistance``.

    Returns:
        ``2 * (p @ D @ p)``, where ``p`` holds the share of usable references
        per field and ``D`` is the distance matrix of ``year``. NaN when no
        reference has a usable field.
    """
    if distance is None:
        distance = FieldL1CiteDistance
    year_distance = distance[int(year)]

    # NaN fails the `>= 0` comparison, so it is dropped together with negatives;
    # None is tested first because it cannot be compared with a number.
    valid = [i for i in refFieldL1 if i is not None and i >= 0]
    if not valid:
        # No usable field: the proportions would be 0/0.
        return np.nan

    # np.bincount only accepts integers; cast explicitly because the indices
    # may arrive as floats (presumably after a round trip through parquet).
    # The vector length follows the distance matrix instead of a fixed 213.
    counts = np.bincount(np.asarray(valid, dtype=np.int64), minlength=year_distance.shape[0])
    shares = counts / counts.sum()

    # NOTE(review): p @ D @ p already runs over every ordered field pair, so the
    # factor 2 doubles that sum — confirm it is intended.
    return 2 * (shares @ year_distance @ shares)

In [None]:
# Compute the index row by row from each paper's reference fields and its year,
# with a progress bar.
citation_df['RSIndex'] = citation_df.progress_apply(
    lambda row: paper2RSIndex(row['refFieldL1'], row['year']),
    axis=1,
)

In [None]:
# Write the table, now including the RSIndex column, to parquet.
citation_df.to_parquet(path='/kellogg/proj/dashun/dimensions/data_dump/20230910/tsv/Metrics/RaoStirlingDiversity.parquet')

In [None]:
# Spot-check the function on the first remaining row. Positional `.iloc` is
# used because rows without a year were filtered out earlier, so the index
# label 0 is not guaranteed to exist any more.
first_row = citation_df.iloc[0]
paper2RSIndex(first_row['refFieldL1'], first_row['year'])

In [None]:
%%time
# Read the gzip-compressed JSON publication dump.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably a
# SparkSession supplied by the environment; confirm.
publications_reader = spark.read.format("json").option("compression", "gzip").option("header", True)
df_zipped = publications_reader.load(
    "/kellogg/proj/dashun/dimensions/data_dump/20230121/publications/*")

# Lookup table of the author ids to keep.
UsedAuthors = dict.fromkeys(np.load('./UsedAuthors.npy'), True)

# Select four columns and drop rows with a null in any of them; then, per
# publication, keep only the used authors and discard publications left with none.
selected_columns = df_zipped.select(
    'authors.researcher_id', 'date_normal', 'id', 'category_for.first_level.codes'
).dropna()
df_zipped2 = (
    selected_columns.rdd
    .map(lambda row: row.asDict())
    .map(lambda pub: pub | {
        'researcher_id': [author for author in pub['researcher_id'] if UsedAuthors.get(author, False)]
    })
    .filter(lambda pub: len(pub['researcher_id']) > 0)
)
print(df_zipped2.count())

In [None]:
%%time
# Emit one record per (author, publication) and group the records by author id.
df_zipped3 = (
    df_zipped2
    .flatMap(lambda pub: [pub | {'researcher_id': author} for author in pub['researcher_id']])
    .groupBy(lambda record: record['researcher_id'])
)
# Per author: [author id, DataFrame of that author's records sorted by
# `date_normal`, without the author column and the pre-sort index].
df_zipped4 = df_zipped3.map(
    lambda group: [
        group[0],
        pd.DataFrame(list(group[1]))
        .sort_values('date_normal')
        .reset_index()
        .drop(['researcher_id', 'index'], axis=1),
    ]
)

In [None]:
# Save the per-author RDD as a Spark pickle file.
df_zipped4.saveAsPickleFile(path='/kellogg/proj/dashun/shaoerzhuo/Dimensions/Author2Pubs_SparkPickle')

In [None]:
# Materialize every element of the RDD as a Python list in the driver process.
df_zipped5 = df_zipped4.collect()

In [None]:
import pickle as pkl

# Serialize the collected per-author list into a single pickle file.
with open('/kellogg/proj/dashun/shaoerzhuo/Dimensions/Author2Pubs.pkl', 'wb') as out_file:
    pkl.dump(obj=df_zipped5, file=out_file)