### **Rao-Stirling Diversity Index**

The Rao-Stirling diversity index measures the interdisciplinarity of a paper based on the variety and balance of fields represented in its references, weighted by the cognitive distance between those fields.

For each paper, we:
1. Identify the fields of all cited references
2. Calculate the proportion of references in each field (variety and balance)
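3. Compute pairwise distances between fields based on their cognitive dissimilarity, measured as the cosine distance between the fields' citation profiles (how often each field cites every other field) in the citing paper's publication year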
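4. Calculate the Rao-Stirling index as the sum of $p_i \, p_j \, d_{ij}$ over all ordered pairs of fields $(i, j)$, where $p_i$ is the proportion of references in field $i$ and $d_{ij}$ is the distance between fields $i$ and $j$ (the implementation below multiplies this sum by 2)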

This approach captures both the diversity of fields cited and how cognitively distant those fields are from each other. Papers citing references from many distant fields will have higher Rao-Stirling scores.

**Reference:**
 - Park, Minsu, et al. "Interdisciplinary Papers Supported by Disciplinary Grants Garner Deep and Broad Scientific Impact." arXiv preprint arXiv:2303.14732 (2023).

In [None]:
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
from tqdm.contrib.concurrent import process_map
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

from functools import partial
from tqdm import tqdm, trange
tqdm.pandas(ncols=100, mininterval=1)
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

In [None]:
# Load the ANZSRC Fields-of-Research table once; both code levels come from it.
anzsrc_for = pd.read_csv('intermediate/ANZSRC_FoR.tsv', sep='\t')

# Distinct codes in order of first appearance, plus a code -> dense index map
# (the dense index is what the citation matrices below are addressed by).
field_l0_list = list(anzsrc_for['Field L0 Code'].drop_duplicates())
field_l0_dict = {code: idx for idx, code in enumerate(field_l0_list)}
field_l1_list = list(anzsrc_for['Field L1 Code'].drop_duplicates())
field_l1_dict = {code: idx for idx, code in enumerate(field_l1_list)}



# Publications with a known reference count and date; the year is the first
# four characters of the date string and the id drops its 4-character prefix.
papers = pd.read_parquet('../parquet/processed/publications.parquet').dropna(subset=['num_references', 'date'])
papers['year'] = papers['date'].str[:4].astype(int)
papers['id'] = papers['id'].str[4:].astype(int)
print(f"# Papers: {len(papers)}")

# One field code per paper at each level (first row wins when a paper has several).
paper_field_l0 = pd.read_parquet('../parquet/processed/paper_fields_l0.parquet')[["paper_id", "l0_code"]].drop_duplicates(subset=['paper_id'], keep='first')
paper_field_l1 = pd.read_parquet('../parquet/processed/paper_fields_l1.parquet')[["paper_id", "l1_code"]].drop_duplicates(subset=['paper_id'], keep='first')
paper_field_l0['paper_id'] = paper_field_l0['paper_id'].str[4:].astype(int)
paper_field_l1['paper_id'] = paper_field_l1['paper_id'].str[4:].astype(int)


# Attach the raw codes, then translate them to dense indices; papers without a
# (known) code end up as <NA> in the nullable integer columns.
papers["l0_code"] = papers.id.map(paper_field_l0.set_index('paper_id')["l0_code"])
papers["l1_code"] = papers.id.map(paper_field_l1.set_index('paper_id')["l1_code"])
papers["l0_code_norm"] = papers["l0_code"].astype(pd.Int64Dtype()).map(field_l0_dict).astype(pd.Int64Dtype())
papers["l1_code_norm"] = papers["l1_code"].astype(pd.Int64Dtype()).map(field_l1_dict).astype(pd.Int64Dtype())

# Per-paper lookup: id -> {'year', 'l0_code_norm', 'l1_code_norm'}.
# Missing field codes are stored as None (not pd.NA) because the consumers of
# this dict test for None, and pd.NA cannot be used as an array index.
if not papers['id'].is_unique:
    raise ValueError("Duplicate paper ids: cannot build a per-paper lookup")
papers_dict = {
    paper_id: {
        'year': int(paper_year),
        'l0_code_norm': None if pd.isna(l0_idx) else int(l0_idx),
        'l1_code_norm': None if pd.isna(l1_idx) else int(l1_idx),
    }
    for paper_id, paper_year, l0_idx, l1_idx in zip(
        papers['id'], papers['year'], papers['l0_code_norm'], papers['l1_code_norm']
    )
}




# Each row holds a citing paper id and its list of cited entries
# (each entry unpacks as a cited paper id plus a second value that is unused here).
df_citing_to_cited = pd.read_feather('intermediate/dict_citing_to_cited_id_and_year.feather')

# Citation counts addressed as [citing paper year, citing field, cited field].
field_l0_citation_matrix = np.zeros([2030, len(field_l0_dict), len(field_l0_dict)])
field_l1_citation_matrix = np.zeros([2030, len(field_l1_dict), len(field_l1_dict)])

# Stand-in record for papers that are absent from papers_dict.
unknown_paper = {'year': None, 'l0_code_norm': None, 'l1_code_norm': None}

for row in tqdm(df_citing_to_cited.itertuples(index=False), total=len(df_citing_to_cited)):
    # The citing paper is the same for every reference, so look it up once.
    x_dict = papers_dict.get(int(row.citing_paperid), unknown_paper)
    x_t, x_f_l0, x_f_l1 = x_dict['year'], x_dict['l0_code_norm'], x_dict['l1_code_norm']

    # Without a citing year there is no matrix slice to count into.
    # pd.isna treats None, NaN and pd.NA alike; a plain None test lets pd.NA
    # through, which would then fail as an array index.
    if pd.isna(x_t):
        continue

    for cited_paper_id, _ in row.cited_list:
        y_dict = papers_dict.get(int(cited_paper_id), unknown_paper)
        y_f_l0, y_f_l1 = y_dict['l0_code_norm'], y_dict['l1_code_norm']

        if not (pd.isna(x_f_l0) or pd.isna(y_f_l0)):
            field_l0_citation_matrix[x_t, x_f_l0, y_f_l0] += 1
        if not (pd.isna(x_f_l1) or pd.isna(y_f_l1)):
            field_l1_citation_matrix[x_t, x_f_l1, y_f_l1] += 1

np.save('data/field_l0_citation_matrix.npy', field_l0_citation_matrix)
np.save('data/field_l1_citation_matrix.npy', field_l1_citation_matrix)

from scipy.spatial.distance import pdist, squareform


def _yearly_cosine_distance(citation_matrix, desc):
    """Return per-year pairwise cosine distances between the rows of each year slice.

    For every year index in 1800..2029 the rows of ``citation_matrix[year]``
    are compared pairwise; all other year slices of the result stay zero.
    """
    distance = np.zeros_like(citation_matrix)
    for yr in trange(1800, 2030, desc=desc):
        # pdist gives the condensed pairwise distances; squareform expands
        # them into a full square matrix.
        pairwise = squareform(pdist(citation_matrix[yr, :, :], metric='cosine'))
        # All-zero rows (fields without citations) yield NaN; treat as 0.
        pairwise = np.nan_to_num(pairwise, nan=0.0)
        # A field is at distance zero from itself.
        np.fill_diagonal(pairwise, 0)
        distance[yr, :, :] = pairwise
    return distance


field_l0_citation_distance = _yearly_cosine_distance(field_l0_citation_matrix, "Computing L0 distances")
np.save('data/field_l0_citation_distance.npy', field_l0_citation_distance)

field_l1_citation_distance = _yearly_cosine_distance(field_l1_citation_matrix, "Computing L1 distances")
np.save('data/field_l1_citation_distance.npy', field_l1_citation_distance)




# For every citing paper, collect the id, year and field indices of each
# cited paper (None where the cited paper is not in papers_dict).
citation_records = []
absent_paper = {'year': None, 'l0_code_norm': None, 'l1_code_norm': None}

for row in tqdm(df_citing_to_cited.itertuples(index=False), total=len(df_citing_to_cited)):
    ref_ids, ref_years, ref_l0, ref_l1 = [], [], [], []
    for ref_id, _ in row.cited_list:
        ref_info = papers_dict.get(int(ref_id), absent_paper)
        ref_ids.append(ref_id)
        ref_years.append(ref_info['year'])
        ref_l0.append(ref_info['l0_code_norm'])
        ref_l1.append(ref_info['l1_code_norm'])

    citation_records.append([row.citing_paperid, ref_ids, ref_years, ref_l0, ref_l1])

dict_citing_to_cited_id_year_field = pd.DataFrame(
    citation_records,
    columns=[
        'citing_paper_id',
        'cited_paper_id_list',
        'cited_paper_year_list',
        'cited_paper_field_l0_list',
        'cited_paper_field_l1_list',
    ],
)

# Publication year of the citing paper itself (None when it is unknown).
dict_citing_to_cited_id_year_field['citing_paper_year'] = (
    dict_citing_to_cited_id_year_field.citing_paper_id.progress_map(
        lambda pid: papers_dict.get(int(pid), {'year': None})["year"]
    )
)
    
def filter_none(x):
    """Return the items of ``x`` that are not missing, in their original order.

    Besides ``None`` this also drops ``pd.NA`` and NaN: values taken from
    nullable-integer columns are missing as ``pd.NA`` rather than ``None``,
    and leaving them in breaks the numeric comparisons applied to these lists.

    Args:
        x: Iterable of scalar values (field indices in this notebook).

    Returns:
        A new list without the missing entries.
    """
    return [y for y in x if not pd.isna(y)]
# Strip missing field entries from both per-paper field lists, L0 then L1.
for field_column in ("cited_paper_field_l0_list", "cited_paper_field_l1_list"):
    dict_citing_to_cited_id_year_field[field_column] = (
        dict_citing_to_cited_id_year_field[field_column].progress_map(filter_none)
    )

# Persist the enriched citation table in both formats.
dict_citing_to_cited_id_year_field.to_parquet('intermediate/dict_citing_to_cited_id_year_field.parquet')
dict_citing_to_cited_id_year_field.to_feather('intermediate/dict_citing_to_cited_id_year_field.feather')


# Only citing papers with a known year can be scored (the year selects the
# distance matrix); renumber the rows after dropping the rest.
paper_df = dict_citing_to_cited_id_year_field.dropna(subset=['citing_paper_year'])
paper_df = paper_df.reset_index(drop=True)

# One (field list, citing year) work item per paper, for each field level.
l0_data = [
    (fields, citing_year)
    for fields, citing_year in zip(paper_df['cited_paper_field_l0_list'], paper_df['citing_paper_year'])
]
l1_data = [
    (fields, citing_year)
    for fields, citing_year in zip(paper_df['cited_paper_field_l1_list'], paper_df['citing_paper_year'])
]


def paper_to_rsd_l0(row):
    """Compute the L0 Rao-Stirling diversity of one paper's reference list.

    Args:
        row: A ``(cited_field_l0_list, year)`` pair, i.e. the dense L0 field
            indices of the cited papers and the citing paper's year.

    Returns:
        ``2 * p @ D @ p``, where ``p`` holds the share of references per L0
        field and ``D`` is ``field_l0_citation_distance[year]``. Returns
        ``None`` when the list has fewer than five entries or no usable
        field index.
    """
    cited_field_l0_list, year = row
    if len(cited_field_l0_list) < 5:
        return None

    # Missing entries (None / pd.NA / NaN) must be dropped before the sign
    # test: comparing them with 0 raises or is ambiguous in a boolean context.
    valid_fields = [int(i) for i in cited_field_l0_list if not pd.isna(i) and i >= 0]
    if not valid_fields:
        return None

    # Share of references per field; the counts sum to len(valid_fields) > 0.
    field_l0_dist = np.bincount(valid_fields, minlength=len(field_l0_dict)).astype(float)
    field_l0_dist /= field_l0_dist.sum()
    return 2 * (field_l0_dist @ field_l0_citation_distance[int(year)] @ field_l0_dist)

def paper_to_rsd_l1(row):
    """Compute the L1 Rao-Stirling diversity of one paper's reference list.

    Args:
        row: A ``(cited_field_l1_list, year)`` pair, i.e. the dense L1 field
            indices of the cited papers and the citing paper's year.

    Returns:
        ``2 * p @ D @ p``, where ``p`` holds the share of references per L1
        field and ``D`` is ``field_l1_citation_distance[year]``. Returns
        ``None`` when the list has fewer than five entries or no usable
        field index.
    """
    cited_field_l1_list, year = row
    if len(cited_field_l1_list) < 5:
        return None

    # Missing entries (None / pd.NA / NaN) must be dropped before the sign
    # test: comparing them with 0 raises or is ambiguous in a boolean context.
    valid_fields = [int(i) for i in cited_field_l1_list if not pd.isna(i) and i >= 0]
    if not valid_fields:
        return None

    # Share of references per field; the counts sum to len(valid_fields) > 0.
    field_l1_dist = np.bincount(valid_fields, minlength=len(field_l1_dict)).astype(float)
    field_l1_dist /= field_l1_dist.sum()
    return 2 * (field_l1_dist @ field_l1_citation_distance[int(year)] @ field_l1_dist)




# Score every paper in parallel, L0 first and then L1; each job is
# (progress message, output column, worker function, work items).
rsd_jobs = (
    ("Computing Rao-Stirling diversity L0...", 'rao_stirling_diversity_field_l0', paper_to_rsd_l0, l0_data),
    ("Computing Rao-Stirling diversity L1...", 'rao_stirling_diversity_field_l1', paper_to_rsd_l1, l1_data),
)
for message, score_column, worker, work_items in rsd_jobs:
    print(message)
    paper_df[score_column] = process_map(worker, work_items, max_workers=20, chunksize=1000)

# Keep only the identifier, year and the two scores, under their export names.
paper_df = paper_df[
    ['citing_paper_id', 'citing_paper_year', 'rao_stirling_diversity_field_l0', 'rao_stirling_diversity_field_l1']
].rename(columns={'citing_paper_id': "id", 'citing_paper_year': "year"})

# Write the result in all three formats.
paper_df.to_parquet('data/rao_stirling_diversity.parquet')
paper_df.to_feather('data/rao_stirling_diversity.feather')
paper_df.to_csv('data/rao_stirling_diversity.tsv', sep='\t', index=False)