### **Disruptiveness**

Disruptiveness measures how much a paper changes the direction of research in its field. A disruptive paper is one that causes future work to cite it without also citing its references, suggesting it has created a new research direction. In contrast, a consolidating paper is one that future work cites along with its references, suggesting it has built upon and integrated existing knowledge.

For each paper, we:
1. Identify all papers that cite the focal paper (T) and all papers that cite the focal paper's references (S)
2. Calculate three counts:
   - n_i: papers that cite the focal paper but not its references (disruptive citations)
   - n_j: papers that cite both the focal paper and its references (consolidating citations)
   - n_k: papers that cite the focal paper's references but not the focal paper
3. Compute the disruption index: D = (n_i - n_j) / (n_i + n_j + n_k)
4. Calculate this for multiple time windows (3, 5, 10 years, and all time)

The disruption index ranges from -1 (highly consolidating) to +1 (highly disruptive), with 0 indicating a neutral impact.

**References:**
 - Funk, Russell J., and Jason Owen-Smith. "A dynamic network measure of technological change." Management Science 63.3 (2017): 791-817.
 - Wu, Lingfei, Dashun Wang, and James A. Evans. "Large teams develop and small teams disrupt science and technology." Nature 566.7744 (2019): 378-382.

In [None]:
import pandas as pd, numpy as np
from matplotlib import pyplot as pltnnnnn
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

import json, pickle as pkl
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy
from tqdm.contrib.concurrent import process_map
import shutil, os

from functools import partial
from tqdm import tqdm, trange
tqdm.pandas(ncols=100, mininterval=1)
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

# Citation edge list. The lookups below read its columns citing_paperid,
# citing_year and cited_paperid.
references = pd.read_feather('intermediate/citing_cited_paper_id_year.feather')


def _citers_with_year(group):
    """Return {citing_paperid: citing_year} for the rows of one cited paper."""
    return dict(zip(group.citing_paperid, group.citing_year))


# Year lookup for every paper that appears as a citing paper.
# Dict Structure: {citing_paperid (int): citing_year (int)}
# A paper occurs once per reference it makes; later rows overwrite earlier
# ones (presumably with the same year -- verify against the input data).
dict_paper_id_to_year = references.set_index('citing_paperid').citing_year.to_dict()

# Backward links: the reference list of each citing paper.
# Dict Structure: {citing_paperid (int): [list of cited_paperids (int)]}
Dict_Citing_to_Cited = references.groupby('citing_paperid')['cited_paperid'].apply(list).to_dict()

# Forward links: who cites each paper, and in which year.
# Dict Structure: {cited_paperid (int): {citing_paperid (int): citing_year (int)}}
Dict_Cited_to_Citing = (
    references
    .groupby('cited_paperid')
    [['citing_paperid', 'citing_year']]
    .apply(_citers_with_year)
    .to_dict()
)



def paper_id_to_disruptiveness(focal_paper_id, future_windows=(3, 5, 10, np.inf)):
    """Compute the disruption (CD) index of one paper for several time windows.

    Reads the module-level lookups ``dict_paper_id_to_year``,
    ``Dict_Citing_to_Cited`` and ``Dict_Cited_to_Citing``.

    For each window ``w`` only citing papers whose year lies in
    ``[focal_year, focal_year + w]`` are considered, and

        D = (n_i - n_j) / (n_i + n_j + n_k)

    where n_i = papers citing the focal paper but none of its references,
    n_j = papers citing both, and n_k = papers citing at least one reference
    but not the focal paper.

    Args:
        focal_paper_id: Id of the focal paper; must be a key of
            ``dict_paper_id_to_year``.
        future_windows: Iterable of window lengths in years. ``np.inf`` means
            no upper bound; it produces keys ending in ``inf``.

    Returns:
        dict with ``paper_id`` (``"pub.<id>"``), ``year``, ``times_cited``,
        ``num_references`` and, for every window ``w``, ``n_i_<w>``,
        ``n_j_<w>``, ``n_k_<w>`` and ``CD<w>``. ``CD<w>`` is ``None`` when the
        denominator is zero. If the paper has no recorded references, all
        per-window values are ``pd.NA``.

    Raises:
        KeyError: if ``focal_paper_id`` is not in ``dict_paper_id_to_year``.
    """
    focal_paper_year = dict_paper_id_to_year[focal_paper_id]
    ref_paper_ids = Dict_Citing_to_Cited.get(focal_paper_id, [])

    # T: papers that cite the focal paper, {citing_paperid: citing_year}.
    T = Dict_Cited_to_Citing.get(focal_paper_id, {})

    # Built before branching so that both branches return the same base
    # columns (previously the no-reference branch used `results` before it
    # was ever assigned).
    results = {
        "paper_id": f"pub.{focal_paper_id}",
        "year": focal_paper_year,
        "times_cited": len(T),
        "num_references": len(ref_paper_ids),
    }

    if not ref_paper_ids:
        # Without references the index is undefined for every window.
        for w in future_windows:
            results.update({f"n_i_{w}": pd.NA, f"n_j_{w}": pd.NA, f"n_k_{w}": pd.NA, f"CD{w}": pd.NA})
        return results

    # S: papers that cite at least one of the focal paper's references.
    S = {}
    for ref_paper_id in ref_paper_ids:
        S.update(Dict_Cited_to_Citing.get(ref_paper_id, {}))
    # The focal paper cites its own references, so it is always in S with
    # year == focal_paper_year. It is not a subsequent citing work and must
    # not be counted in n_k. S is a fresh dict, so popping is safe.
    S.pop(focal_paper_id, None)

    for w in future_windows:
        last_year = focal_paper_year + w
        # papers that cite the focal paper's references within the window
        Sw = {pid for pid, year in S.items() if focal_paper_year <= year <= last_year}
        # papers that cite the focal paper within the window
        Tw = {pid for pid, year in T.items() if focal_paper_year <= year <= last_year}

        n_j_w = len(Tw & Sw)     # cite the focal paper and its references
        n_i_w = len(Tw) - n_j_w  # cite the focal paper only
        n_k_w = len(Sw) - n_j_w  # cite the references only

        total = n_i_w + n_j_w + n_k_w
        D = (n_i_w - n_j_w) / total if total else None

        results.update({f"n_i_{w}": n_i_w, f"n_j_{w}": n_j_w, f"n_k_{w}": n_k_w, f"CD{w}": D})
    return results

# Batch driver: compute the disruption index for every paper that has a
# known year, in batches of `batch_size`, writing one feather file per batch.
# Batches whose output file already exists are skipped, so the run can resume.
papers = list(dict_paper_id_to_year)
batch_size = 500_000
path = "intermediate/disruptiveness"
os.makedirs(path, exist_ok=True)
for idx in trange(0, len(papers), batch_size):
    file_path = os.path.join(path, f"{idx}.feather")
    if os.path.exists(file_path):
        # Batch already written by an earlier run.
        continue
    results = process_map(paper_id_to_disruptiveness, papers[idx:idx+batch_size], max_workers=20, chunksize=1000)
    results = pd.DataFrame(results)
    # Write to a temporary name and rename afterwards. An interrupted run then
    # never leaves a truncated file that the existence check above would
    # mistake for a finished batch.
    tmp_path = f"{file_path}.tmp"
    results.to_feather(tmp_path)
    os.replace(tmp_path, file_path)