# Performance measurements

This notebook compares the performance of a python library against the performance of our SQL-approach when computing the CD index.

In [1]:
%pip install cdindex
%pip install https://github.com/dspinellis/fast-cdindex/archive/refs/heads/master.zip

import cdindex
import fast_cdindex

import sys
import datetime
from string import Template
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import pandas_gbq
import plotly.express as px
import numpy as np
import re
import os
import shutil
import tqdm
import time

from google.cloud import bigquery

from settings import gbq_project_id, gbq_table_prefix

In [3]:
client = bigquery.Client(project=gbq_project_id)

In [4]:
def query_gbq(sql, prefix):
    total_start_time = time.time()
    query_job = client.query(sql)  # API request
    
    load_start_time = time.time()
    result = query_job.to_dataframe()  # Waits for query to finish and load data
    end_time =time.time()
    
    total_sec   =   end_time-total_start_time
    load_sec    =   end_time-load_start_time
    query_sec   =   (query_job.ended-query_job.started).total_seconds()
    processed_mb=   query_job.total_bytes_processed/(1024*1024)

    print(f"Total: {total_sec:.1f} sec,  Query & data load: {load_sec:.1f} sec, Query only: {query_sec:.1f} sec")
    print(f"{processed_mb:.1f} MB processed")
    print(f"Read dataframe with {len(result):,} rows and {len(result.columns):,} columns")

    return result, {
            prefix+"_total_sec" : total_sec,
            prefix+"_load_sec" : load_sec,
            prefix+"_query_sec" : query_sec,
            prefix+"_processed_mb" : processed_mb 
            }


In [5]:
def create_network(where_statement, covid=True):
  start_time = time.time()

  if covid:
    # This is a publically available dataset 
    table_name = "covid-19-dimensions-ai.data.publications"
  else:
    # This is part of the commercial offering of Dimensions although researchers may get free access
    table_name = "dimensions-ai.data_analytics.publications"

  # We write a mini citation network to GBQ and load the data into the notebook itself
  # This creates a level playing field for both the python libraries and GBQ calculations of the CD index
  # In order to create a more realistic network we only allow references and 
  sql = f"""
CREATE OR REPLACE TABLE {gbq_table_prefix}graph
CLUSTER BY id
AS
(
  WITH raw_publications AS
  (
    SELECT
      id, year, citations, reference_ids
    FROM `{table_name}` AS p
    WHERE p.year IS NOT NULL AND
    -- These are the conditions for the publications we want to look at
    {where_statement}
  ),
  publications AS
  (
    SELECT
    raw_publications.id, raw_publications.year,
    ARRAY(
      (
        SELECT STRUCT(citation.id AS id, citation.year AS year)
        FROM UNNEST(raw_publications.citations) AS citation
        INNER JOIN raw_publications AS publications_citation ON citation.id = publications_citation.id
      )
    ) AS citations,
    ARRAY(
      (
        SELECT reference_id
        FROM UNNEST(raw_publications.reference_ids) AS reference_id
        INNER JOIN raw_publications AS publications_reference ON reference_id = publications_reference.id
      )
    ) AS reference_ids
    FROM raw_publications

  )

  SELECT *
  FROM publications
  WHERE ARRAY_LENGTH(citations)>0 OR ARRAY_LENGTH(reference_ids)>0
)
"""
  print("Writing citation network:")
  dummy, measurements =query_gbq(sql, prefix="create_network")
  
  print("\nReading data:")
  sql = f"SELECT * FROM {gbq_table_prefix}graph"
  result, read_measurements =query_gbq(sql, prefix="load_network")

  measurements.update(read_measurements)

  return result, measurements

In [6]:
def calc_gbq():
  sql = f"""
  WITH publications AS
  (
    SELECT id, year, citations, reference_ids
    FROM {gbq_table_prefix}graph
  )

  SELECT focal_id AS name,
  (SUM(score)/COUNT(DISTINCT citation_id))+2 AS cd_5
  FROM
  (
    (
      SELECT DISTINCT publications.id AS focal_id,
      citation.id AS citation_id,
      -1 AS score
      FROM publications
      LEFT JOIN UNNEST(publications.citations) AS citation
      WHERE citation.year - publications.year BETWEEN 1 AND 5
    )
    UNION ALL
    (
      SELECT DISTINCT publications.id AS focal_id,
      reference_citation.id as citation_id,
      -2 as score
      FROM publications
      LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
      INNER JOIN publications AS references
      ON references.id = reference_id
      LEFT JOIN UNNEST(references.citations) AS reference_citation
      WHERE reference_citation.year - publications.year BETWEEN 1 AND 5
    )
  )
  GROUP BY 1
  """
  return query_gbq(sql, "calc_gbq")


In [7]:
def calc_cdindex(publications):
  # We make the assumptions that the references and citations are also in the publication ids so we can just take id ->citations to build the graph
  start_time = time.time()
  vertices = publications[["id", "year"]].drop_duplicates().rename(columns={"id" : "name", "year" : "time"})
  edges = publications[["id", "reference_ids"]].explode("reference_ids").rename(columns={"id" : "source", "reference_ids" : "target"})
  edges = edges[~edges.target.isna()]
  graph = cdindex.Graph(vertices = vertices.to_dict("records"), edges = edges.to_dict("records"))
  vertices["cd_5"] = vertices["name"].apply(graph.cdindex, t_delta=5)
  duration = time.time()-start_time
  print(f"Calculation CD index via cdindex: {duration:.1f} sec")
  return vertices, { "calc_cdindex_sec" : duration, "edges" : len(edges) }

In [8]:
def calc_fast_cdindex(publications):
  # We make the assumptions that the references and citations are also in the publication ids so we can just take id ->citations to build the graph
  start_time = time.time()
  vertices = publications[["id", "year"]].drop_duplicates().rename(columns={"id" : "name", "year" : "time"})
  edges = publications[["id", "reference_ids"]].explode("reference_ids").rename(columns={"id" : "source", "reference_ids" : "target"})
  edges = edges[~edges.target.isna()]
  graph = fast_cdindex.Graph(vertices = vertices.to_dict("records"), edges = edges.to_dict("records"))
  vertices["cd_5"] = vertices["name"].apply(graph.cdindex, t_delta=5)
  duration = time.time()-start_time
  print(f"Calculation CD index via fast-cdindex: {duration:.1f} sec")
  return vertices, { "calc_fast_cdindex_sec" : duration, "edges" : len(edges) }

In [9]:
# Helper function to make graphs for the performance
def make_graph(logs, x_col):
  labels = { k : k.replace('_', ' ').capitalize() for k in logs.keys()}
  sec_cols = [c for c in logs.columns if c.endswith("_sec") and not c.startswith("create") and not c.endswith("_load_sec")]
            
  fig = px.line(logs, x=x_col, y=sec_cols,
                labels= labels
                )
  fig.update_layout(xaxis_title=labels[x_col], yaxis_title="sec")

  fig.for_each_trace(lambda t: t.update(name = labels[t.name]))
  fig.show()


def measure_mismatch(left_df, right_df, dec):
  df_cmp=left_df.merge(right_df, on="name", how="outer")
  tolerance = 10**(-dec)
  # compare the result up to n digits
  return ((df_cmp["cd_5_x"].fillna(0)-df_cmp["cd_5_y"].fillna(0))>tolerance).sum()


def run_test(first_year, last_year, steps, subject):
  # Here we store the results
  measurements_list=[]
  
  covid = (subject == "covid")

  for year in range(last_year, first_year,-steps):
    print("\n==================\n", year)
    print("Create citation network...")
    if covid:
      publications, create_measurements=create_network(f"p.year >= {year}", covid = True)
    else:
      publications, create_measurements=create_network(f"p.year >= {year} AND p.category_for.second_level.full[SAFE_OFFSET(0)].name = '{subject}'", covid=False)
    if len(publications) == 0:
      print("\tNo publications....ignore the rest")
      continue
  
    print("\nCalculate CD index via python cdindex")
    cdindex_result, cdindex_measurements = calc_cdindex(publications)
    
    print("\nCalculate CD index via python fast_cd_index")
    fast_cdindex_result, fast_cdindex_measurements = calc_fast_cdindex(publications)
    
    print("\nCalculate CD index via GBQ")
    gbq_result, gbq_measurements = calc_gbq()

    print("\nCompare CD index calculation: cdindex vs GBQ")
    dec = 4
    cdindex_gbq_mismatch = measure_mismatch(cdindex_result, gbq_result, dec)
    print(f"\tNr of discrapencies up to {dec} decimalse: {cdindex_gbq_mismatch}")

    print("\nCompare CD index calculation: cdindex vs fast_cdindex")
    dec = 1
    cdindex_fast_cdindex_mismatch = measure_mismatch(cdindex_result, fast_cdindex_result, dec)
    print(f"\tNr of discrapencies up to {dec} decimalse: {cdindex_fast_cdindex_mismatch}")

    measurements = {
        "year" : year,
        "subject" : subject,
        "nr_publications" : len(publications),
        "cdindex_gbq_mismatch" : cdindex_gbq_mismatch,
        "cdindex_fast_cdindex_mismatch" : cdindex_fast_cdindex_mismatch
    }
    measurements.update(create_measurements)
    measurements.update(cdindex_measurements)
    measurements.update(fast_cdindex_measurements)
    measurements.update(gbq_measurements)
    measurements_list.append(measurements)
    
    logs=pd.DataFrame(measurements_list)
    print("Save backup to disc...")
    logs.to_feather("logs.fth")
    publications.to_feather("publications.fth")
    cdindex_result.to_feather("cd_index_result.fth")
    fast_cdindex_result.to_feather("fast_cdindex_result.fth")
    gbq_result.to_feather("gbq_result.fth")
    print("...done")
    
    
  make_graph(logs, "nr_publications")
  return logs
  
 

In [10]:
# This is a test run for the covid dataset which is publically available
# logs = run_test(2020,2023,1,"covid")

In [11]:
# This is a more serious test run with much more data
logs=run_test(2000,2024,4,'Clinical Sciences')



 2024
Create citation network...
Writing citation network:
Total: 7.5 sec,  Query & data load: 6.5 sec, Query only: 6.0 sec
1165.2 MB processed
Read dataframe with 0 rows and 0 columns

Reading data:
Total: 4.0 sec,  Query & data load: 2.4 sec, Query only: 1.3 sec
0.8 MB processed
Read dataframe with 15,887 rows and 4 columns

Calculate CD index via python cdindex
Calculation CD index via cdindex: 0.1 sec

Calculate CD index via python fast_cd_index
Calculation CD index via fast-cdindex: 0.0 sec

Calculate CD index via GBQ
Total: 1.2 sec,  Query & data load: 0.5 sec, Query only: 0.4 sec
0.8 MB processed
Read dataframe with 0 rows and 2 columns

Compare CD index calculation: cdindex vs GBQ
	Nr of discrapencies up to 4 decimalse: 0

Compare CD index calculation: cdindex vs fast_cdindex
	Nr of discrapencies up to 1 decimalse: 0
Save backup to disc...
...done

 2020
Create citation network...
Writing citation network:
Total: 20.6 sec,  Query & data load: 20.1 sec, Query only: 19.5 sec
141

In [8]:
# This method produces the output for the manuscript

logs = pd.read_feather("logs.fth")

def output_for_manuscript(logs, x_col):
  labels = {  
    'nr_publications' : "Nr publications",
    'load_network_total_sec' : "Load network",
    'calc_cdindex_sec' : 'Calculation via cdindex',
    'calc_fast_cdindex_sec' : "Calculation via fast-cdindex",
    'calc_gbq_total_sec' : "Calculation via GBQ (total)",
    'calc_gbq_query_sec' : "Calculation via GBQ (query)"
  }
  sec_cols = list(labels.keys())
  display_cols =[
        'year',
        "nr_publications",
        "edges"
    ] + sec_cols

  print(logs[display_cols].to_latex(index=False, float_format='{:,.0f}'.format))
  fig = px.line(logs, x=x_col, y=sec_cols,
                labels= labels
                )
  fig.update_layout(xaxis_title=labels[x_col], yaxis_title="sec", legend_title="Duration",
                    legend=dict(
                      orientation="h",
                      yanchor="bottom",
                      y=1.02,
                      xanchor="left",
                      x=0
  ))

  fig.for_each_trace(lambda t: t.update(name = labels[t.name]))
  fig.show()

output_for_manuscript(logs, "nr_publications")

\begin{tabular}{rrrrrrrrr}
\toprule
year & nr_publications & edges & nr_publications & load_network_total_sec & calc_cdindex_sec & calc_fast_cdindex_sec & calc_gbq_total_sec & calc_gbq_query_sec \\
\midrule
2024 & 15887 & 10650 & 15887 & 4 & 0 & 0 & 1 & 0 \\
2020 & 1627943 & 6350878 & 1627943 & 540 & 497 & 208 & 232 & 172 \\
2016 & 3041079 & 18414971 & 3041079 & 1,499 & 781 & 490 & 332 & 205 \\
2012 & 4178177 & 32776089 & 4178177 & 2,612 & 1,144 & 810 & 418 & 233 \\
2008 & 5085459 & 47362015 & 5085459 & 3,781 & 1,559 & 1,248 & 518 & 293 \\
2004 & 5828698 & 61162240 & 5828698 & 4,744 & 2,050 & 1,646 & 567 & 299 \\
\bottomrule
\end{tabular}

