# Calculating alternative disruption indices using Dimensions on Google BigQuery
This  Jupyter notebook demonstrates how to calculate the CD index and variants using Dimesnions on Google BigQuery.
These examples use 3 publications as an example but can be easily applied to any set of publications.

In [1]:
%pip install cdindex
import cdindex

import datetime
from string import Template
import pandas as pd
import numpy as np
import re
import os
import shutil
import pandas as pd
import itertools
import tqdm
import plotly.express as px
from google.cloud import bigquery
import time

from secrets import gbq_project_id, gbq_table_prefix



In [2]:
# Set up Google Big Query access and set up a convenience method to run queries
# You can also use the much simpler pandas_gbq methods: https://pandas-gbq.readthedocs.io/en/latest/
# We chose this approach only because we also get query times MB processed in GBQ
client = bigquery.Client(project=gbq_project_id)

def query_gbq(sql, prefix="log"):
    total_start_time = time.time()
    query_job = client.query(sql)  # API request
    
    load_start_time = time.time()
    result = query_job.to_dataframe()  # Waits for query to finish and load data
    end_time =time.time()
    
    total_sec   =   end_time-total_start_time
    load_sec    =   end_time-load_start_time
    query_sec   =   (query_job.ended-query_job.started).total_seconds()
    processed_mb=   query_job.total_bytes_processed/(1024*1024)

    print(f"Total: {total_sec:.1f} sec,  Query & data load: {load_sec:.1f} sec, Query only: {query_sec:.1f} sec")
    print(f"{processed_mb:.1f} MB processed")
    print(f"Read dataframe with {len(result):,} rows and {len(result.columns):,} columns")

    return result, {
            prefix+"_total_sec" : total_sec,
            prefix+"_load_sec" : load_sec,
            prefix+"_query_sec" : query_sec,
            prefix+"_processed_mb" : processed_mb 
            }

# The papers we use these 3 examples taken from
# Park, M., Leahey, E., & Funk, R. J. (2023). Papers and patents are becoming less disruptive over time. Nature, 613 (7942), 138–144
nature_results = pd.DataFrame([
    {"focal_publication_id" : "pub.1040343773"},
    {"focal_publication_id" : "pub.1060431417"},
    {"focal_publication_id" : "pub.1019844293"}
])

focal_publication_ids_text = "'" + "','".join(nature_results.focal_publication_id.unique())+"'"

## "Vanilla" CD and mCD index
This is the $CD_5$ index as originally defined by Funk as well as the associated $mCD_5$ index introduced in https://pubsonline.informs.org/doi/10.1287/mnsc.2015.2366


In [3]:
sql = f"""
-- This is the focal publications f
DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];
-- This is the impact span t
DECLARE time_diff INT64 DEFAULT 5;


WITH cd_raw_data AS
(
	-- Calculating s’ for each citation to the focal publication
	-- All are assigned a score s’=-1. Any other publications appearing in
	-- the second SELECT and aren’t included here
	-- implicitly get a score s’= 0
  (
    SELECT
    DISTINCT  -- make sure we list unique citations otherwise we may double count
    publications.id AS focal_id, -- focal publication
    citation.id AS citation_id, -- citing publication to focal publication
    -1 AS score -- s’
    -- the Dimensions GBQ table for publications
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- fetch all its citing publications: id and year
    LEFT JOIN UNNEST(publications.citations) AS citation
    -- for this experiment we only look at one publication
    WHERE publications.id IN UNNEST(focal_publication_ids)
    -- we only consider citations that appear at most time_diff years after
    -- the focal publication has been published
    AND citation.year - publications.year BETWEEN 1 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND citation.year IS NOT NULL
    AND publications.year IS NOT NULL
  )
  UNION ALL
  -- Calculating s’’ for each citation to the references of
  -- the focal publication
  -- All are assigned a score s’’=-2. Any other publications appearing in
  -- the first SELECT and aren’t included here
  -- implicitly get a score s’’= 0
  (
    SELECT DISTINCT
    publications.id AS focal_id, -- focal publication
    reference_citation.id AS citation_id,-- citing publication to references
    -2 AS score -- s’’
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- get all the reference publication IDs of the focal publication
    LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
    -- get the references’ meta data - mainly citations to it
    INNER JOIN `dimensions-ai.data_analytics.publications` AS references
    ON references.id = reference_id
    -- get the citations to the references
    LEFT JOIN UNNEST(references.citations) AS reference_citation
    WHERE publications.id IN UNNEST(focal_publication_ids)
    AND reference_citation.year - publications.year BETWEEN 1 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND reference_citation.year IS NOT NULL
    AND publications.year IS NOT NULL
  )
)
-- Now add up all scores, count the distinct ids of the citations in both SELECTs
-- above and use that information to calculate the CD index
SELECT focal_id AS focal_publication_id,
publications.doi AS doi,
CONCAT(publications.authors[SAFE_OFFSET(0)].last_name) AS first_author,
publications.title.preferred AS title,
publications.journal.title AS journal,
publications.year AS publication_year,
((SUM(score)/COUNT(DISTINCT citation_id))+2) AS cd_5,
COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_5
FROM cd_raw_data
LEFT JOIN `dimensions-ai.data_analytics.publications` AS publications ON publications.id = cd_raw_data.focal_id
GROUP BY 1,2,3,4,5,6
"""

results, logs = query_gbq(sql)

# Add the results of the Nature paper for comparison
results = results.merge(nature_results, on="focal_publication_id")
display(results)

Total: 8.6 sec,  Query & data load: 8.1 sec, Query only: 7.2 sec
56924.7 MB processed
Read dataframe with 3 rows and 8 columns


Unnamed: 0,focal_publication_id,doi,first_author,title,journal,publication_year,cd_5,mcd_5
0,pub.1019844293,10.1038/2261209a0,BALTIMORE,Viral RNA-dependent DNA Polymerase: RNA-depend...,Nature,1970,-0.443149,-227.778426
1,pub.1040343773,10.1038/171737a0,WATSON,Molecular Structure of Nucleic Acids: A Struct...,Nature,1953,0.612245,96.734694
2,pub.1060431417,10.1103/physrev.140.a1133,Kohn,Self-Consistent Equations Including Exchange a...,Physical Review,1965,-0.264198,-39.893827


## CD index including citations from the same year
In the original CD index definition only citations from the year AFTER publication of the focal paper are considered. A small change allows us to also include citations from the same year

In [4]:
sql = f"""
-- This is the focal publications f
DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];
-- This is the impact span t
DECLARE time_diff INT64 DEFAULT 5;


WITH cd_raw_data AS
(
	-- Calculating s’ for each citation to the focal publication
	-- All are assigned a score s’=-1. Any other publications appearing in
	-- the second SELECT and aren’t included here
	-- implicitly get a score s’= 0
  (
    SELECT
    DISTINCT  -- make sure we list unique citations otherwise we may double count
    publications.id AS focal_id, -- focal publication
    citation.id AS citation_id, -- citing publication to focal publication
    -1 AS score -- s’
    -- the Dimensions GBQ table for publications
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- fetch all its citing publications: id and year
    LEFT JOIN UNNEST(publications.citations) AS citation
    -- for this experiment we only look at one publication
    WHERE publications.id IN UNNEST(focal_publication_ids)
    -- we only consider citations that appear at most time_diff years after
    -- the focal publication has been published
    AND citation.year - publications.year BETWEEN 0 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND citation.year IS NOT NULL
    AND publications.year IS NOT NULL
  )
  UNION ALL
  -- Calculating s’’ for each citation to the references of
  -- the focal publication
  -- All are assigned a score s’’=-2. Any other publications appearing in
  -- the first SELECT and aren’t included here
  -- implicitly get a score s’’= 0
  (
    SELECT DISTINCT
    publications.id AS focal_id, -- focal publication
    reference_citation.id AS citation_id,-- citing publication to references
    -2 AS score -- s’’
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- get all the reference publication IDs of the focal publication
    LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
    -- get the references’ meta data - mainly citations to it
    INNER JOIN `dimensions-ai.data_analytics.publications` AS references
    ON references.id = reference_id
    -- get the citations to the references
    LEFT JOIN UNNEST(references.citations) AS reference_citation
    WHERE publications.id IN UNNEST(focal_publication_ids)
    AND reference_citation.year - publications.year BETWEEN 0 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND reference_citation.year IS NOT NULL
    AND publications.year IS NOT NULL
  )
)
-- Now add up all scores, count the distinct ids of the citations in both SELECTs
-- above and use that information to calculate the CD index
SELECT focal_id AS focal_publication_id,
publications.doi AS doi,
CONCAT(publications.authors[SAFE_OFFSET(0)].last_name) AS first_author,
publications.title.preferred AS title,
publications.journal.title AS journal,
publications.year AS publication_year,
((SUM(score)/COUNT(DISTINCT citation_id))+2) AS cd_5,
COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_5

FROM cd_raw_data
LEFT JOIN `dimensions-ai.data_analytics.publications` AS publications ON publications.id = cd_raw_data.focal_id
GROUP BY 1,2,3,4,5,6
"""

results, logs = query_gbq(sql)

# Add the results of the Nature paper for comparison
results = results.merge(nature_results, on="focal_publication_id")
display(results)

Total: 10.1 sec,  Query & data load: 9.7 sec, Query only: 8.8 sec
56924.7 MB processed
Read dataframe with 3 rows and 8 columns


Unnamed: 0,focal_publication_id,doi,first_author,title,journal,publication_year,cd_5,mcd_5
0,pub.1060431417,10.1103/physrev.140.a1133,Kohn,Self-Consistent Equations Including Exchange a...,Physical Review,1965,-0.234649,-35.432018
1,pub.1019844293,10.1038/2261209a0,BALTIMORE,Viral RNA-dependent DNA Polymerase: RNA-depend...,Nature,1970,-0.421834,-229.899563
2,pub.1040343773,10.1038/171737a0,WATSON,Molecular Structure of Nucleic Acids: A Struct...,Nature,1953,0.56338,93.521127


## Derivative disruption indicators
In https://doi.org/10.1162/qss_a_00068 define another list of disruption indices. We calculate here $DI_l^{\text{no k}}$ for $l=3$ where we only count citations to the focal paper and papers that cite focal paper and $l$ references

In [11]:
l=3 # This is the parameter l in Figure 1 of https://doi.org/10.1162/qss_a_00068

sql = f"""
-- This is the focal publications f
DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];
-- This is the impact span t
DECLARE time_diff INT64 DEFAULT 5;


WITH cd_raw_data AS
(
	-- Calculating s’ for each citation to the focal publication
	-- All are assigned a score s’=-1. Any other publications appearing in
	-- the second SELECT and aren’t included here
	-- implicitly get a score s’= 0
  (
    SELECT
    DISTINCT  -- make sure we list unique citations otherwise we may double count
    publications.id AS focal_id, -- focal publication
    citation.id AS citation_id, -- citing publication to focal publication
    -1 AS score, -- s’
    1 AS counter
    
    -- the Dimensions GBQ table for publications
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- fetch all its citing publications: id and year
    LEFT JOIN UNNEST(publications.citations) AS citation
    -- for this experiment we only look at one publication
    WHERE publications.id IN UNNEST(focal_publication_ids)
    -- we only consider citations that appear at most time_diff years after
    -- the focal publication has been published
    AND citation.year - publications.year BETWEEN 1 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND citation.year IS NOT NULL
    AND publications.year IS NOT NULL
  )
  UNION ALL
  -- Calculating s’’ for each citation to the references of
  -- the focal publication
  -- All are assigned a score s’’=-2. Any other publications appearing in
  -- the first SELECT and aren’t included here
  -- implicitly get a score s’’= 0
  (
    SELECT
    publications.id AS focal_id, -- focal publication
    reference_citation.id AS citation_id,-- citing publication to references
    -2 AS score, -- s’’
    0 AS counter
    FROM `dimensions-ai.data_analytics.publications` AS publications
    -- get all the reference publication IDs of the focal publication
    LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
    -- get the references’ meta data - mainly citations to it
    INNER JOIN `dimensions-ai.data_analytics.publications` AS references
    ON references.id = reference_id
    -- get the citations to the references
    LEFT JOIN UNNEST(references.citations) AS reference_citation
    WHERE publications.id IN UNNEST(focal_publication_ids)
    AND reference_citation.year - publications.year BETWEEN 1 AND time_diff
    -- exclude cases where there is no publication year...very rare
    AND reference_citation.year IS NOT NULL
    AND publications.year IS NOT NULL
    GROUP BY 1,2,3
    # We only count a citation if it cites at least l of the focal paper's references
    HAVING COUNT(DISTINCT references.id) >= {l}
    )
)
-- Now add up all scores, count the distinct ids of the citations in both SELECTs
-- above and use that information to calculate the CD index
SELECT focal_id AS focal_publication_id,
publications.doi AS doi,
CONCAT(publications.authors[SAFE_OFFSET(0)].last_name) AS first_author,
publications.title.preferred AS title,
publications.journal.title AS journal,
publications.year AS publication_year,
((SUM(score)/SUM(counter))+2) AS DI_nk_{l}

FROM cd_raw_data
LEFT JOIN `dimensions-ai.data_analytics.publications` AS publications ON publications.id = cd_raw_data.focal_id
GROUP BY 1,2,3,4,5,6
"""

results, logs = query_gbq(sql)

# Add the results of the Nature paper for comparison
results = results.merge(nature_results, on="focal_publication_id")
display(results)

Total: 8.8 sec,  Query & data load: 8.5 sec, Query only: 7.7 sec
56924.7 MB processed
Read dataframe with 3 rows and 7 columns


Unnamed: 0,focal_publication_id,doi,first_author,title,journal,publication_year,DI_nk_3
0,pub.1019844293,10.1038/2261209a0,BALTIMORE,Viral RNA-dependent DNA Polymerase: RNA-depend...,Nature,1970,0.657588
1,pub.1040343773,10.1038/171737a0,WATSON,Molecular Structure of Nucleic Acids: A Struct...,Nature,1953,1.0
2,pub.1060431417,10.1103/physrev.140.a1133,Kohn,Self-Consistent Equations Including Exchange a...,Physical Review,1965,0.854305
