In [2]:
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
# Raise pandas' display limits so frames with up to 200 rows / 100 columns are
# rendered in full instead of being truncated in cell output.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

import json, pickle as pkl
from unidecode import unidecode
from glob import glob
from collections import Counter, defaultdict
from itertools import product, combinations
from copy import deepcopy

from functools import partial
from tqdm import tqdm, trange
# Set up tqdm's pandas integration with the same bar width / refresh interval
# used below. This must run BEFORE the next line: once `tqdm` is rebound to a
# functools.partial object, `tqdm.pandas` is no longer reachable through that name.
tqdm.pandas(ncols=100, mininterval=1)
# Shadow `tqdm` and `trange` with partials so every progress bar created later
# in the notebook defaults to ncols=100 and mininterval=1.
tqdm, trange = partial(tqdm, ncols=100, mininterval=1), partial(trange, ncols=100, mininterval=1)

In [8]:
import polars as pl

# Input table with (at least) the columns `cited_paperid` and `year_diff`.
# NOTE(review): presumably one row per citing->cited pair, with year_diff =
# citing year - cited year -- confirm against the upstream parquet.
citation_by_time_pl = pl.read_parquet("../parquet/processed/pub_citing_cited_years.parquet")

# Output column name -> inclusive upper bound on `year_diff`.
window_years = {"citation_3y": 3, "citation_5y": 5, "citation_10y": 10}

# One boolean-sum aggregation per window: counts rows with year_diff <= bound
# (the bound is inclusive, and any negative year_diff values are counted too).
windowed_counts = [
    (pl.col("year_diff") <= max_gap).sum().alias(col_name)
    for col_name, max_gap in window_years.items()
]
# Unbounded window: every non-null year_diff row for the cited paper.
total_count = pl.col("year_diff").count().alias("citation_inf")

# Resulting columns, in order:
#   id            cited paper ID (the paper receiving citations)
#   citation_3y   rows with year_diff <= 3
#   citation_5y   rows with year_diff <= 5
#   citation_10y  rows with year_diff <= 10
#   citation_inf  all rows with a non-null year_diff (no time limit)
citation_counts = (
    citation_by_time_pl
    .group_by("cited_paperid")
    .agg(windowed_counts + [total_count])
    .rename({"cited_paperid": "id"})
)

print(f"Citation counts shape: {citation_counts.shape}")
print(citation_counts.head())

# Persist the per-paper counts as a tab-separated file.
citation_counts.write_csv("data/citation-counts-within-time-window.tsv", separator="\t")

Citation counts shape: (82256823, 5)
shape: (5, 5)
┌────────────────┬─────────────┬─────────────┬──────────────┬──────────────┐
│ id             ┆ citation_3y ┆ citation_5y ┆ citation_10y ┆ citation_inf │
│ ---            ┆ ---         ┆ ---         ┆ ---          ┆ ---          │
│ str            ┆ u32         ┆ u32         ┆ u32          ┆ u32          │
╞════════════════╪═════════════╪═════════════╪══════════════╪══════════════╡
│ pub.1124405832 ┆ 37          ┆ 44          ┆ 44           ┆ 44           │
│ pub.1135969783 ┆ 0           ┆ 1           ┆ 1            ┆ 1            │
│ pub.1014389954 ┆ 2           ┆ 2           ┆ 5            ┆ 6            │
│ pub.1140210773 ┆ 6           ┆ 7           ┆ 7            ┆ 7            │
│ pub.1173465787 ┆ 1           ┆ 1           ┆ 1            ┆ 1            │
└────────────────┴─────────────┴─────────────┴──────────────┴──────────────┘


In [10]:
# Previously computed time-window citation metrics, used as the reference that
# the freshly aggregated counts are compared against. The directory name
# suggests a 2023-09-10 data dump -- confirm if provenance matters.
legacy_metrics_path = "/kellogg/proj/dashun/dimensions/data_dump/20230910/tsv/Metrics/TimesCitedWithinTimeWindow.parquet"
citation_counts_old = pl.read_parquet(legacy_metrics_path)
print(citation_counts_old.head())

shape: (5, 5)
┌────────────┬─────┬─────┬─────┬──────┐
│ id         ┆ C3  ┆ C5  ┆ C10 ┆ CINF │
│ ---        ┆ --- ┆ --- ┆ --- ┆ ---  │
│ i64        ┆ i64 ┆ i64 ┆ i64 ┆ i64  │
╞════════════╪═════╪═════╪═════╪══════╡
│ 1000000002 ┆ 14  ┆ 21  ┆ 34  ┆ 48   │
│ 1000000004 ┆ 0   ┆ 0   ┆ 2   ┆ 4    │
│ 1000000006 ┆ 19  ┆ 27  ┆ 45  ┆ 61   │
│ 1000000007 ┆ 65  ┆ 100 ┆ 164 ┆ 164  │
│ 1000000008 ┆ 10  ┆ 10  ┆ 10  ┆ 10   │
└────────────┴─────┴─────┴─────┴──────┘


In [None]:
# Compare the recomputed citation windows against the legacy metrics on a
# random subsample of 100,000 papers (fast enough for an interactive check).

# The legacy table keys papers by integer id, so convert "pub.<digits>" to Int64.
# Guarded on dtype so that re-running this cell after `id` has already been
# converted is a no-op instead of failing on `.str` of an integer column.
if citation_counts.schema["id"] == pl.Utf8:
    citation_counts = citation_counts.with_columns(
        # polars' str.replace interprets the pattern as a regex: an unescaped "."
        # matches ANY character and the match is not tied to the start of the
        # string. Anchor and escape it so only a literal "pub." prefix is
        # removed; anything else is left intact and fails loudly in the cast.
        pl.col("id").str.replace(r"^pub\.", "").cast(pl.Int64).alias("id")
    )

citation_counts_sample = citation_counts.sample(n=100000, seed=42)
# Inner join: sampled papers with no row in the legacy table are dropped, so
# `merged` can be smaller than the 100,000 papers drawn above.
merged = citation_counts_sample.join(citation_counts_old, on="id", how="inner")
assert merged.shape[0] > 0, "no sampled paper ids were found in citation_counts_old"

# (new column, legacy column) for each time window, in reporting order.
window_pairs = [
    ("citation_3y", "C3"),
    ("citation_5y", "C5"),
    ("citation_10y", "C10"),
    ("citation_inf", "CINF"),
]

# Compute all four correlation coefficients in a single pass over `merged`.
corr_3y, corr_5y, corr_10y, corr_inf = merged.select(
    [pl.corr(new_col, old_col).alias(new_col) for new_col, old_col in window_pairs]
).row(0)

print("\nCorrelation Coefficients: Old vs New Citation Counts")
# Reported size is the post-join row count, not the number of papers sampled.
print(f"Sample size: {merged.shape[0]:,} papers")
print("=" * 50)
print(f"  3-year window:    {corr_3y:.4f}")
print(f"  5-year window:    {corr_5y:.4f}")
print(f" 10-year window:    {corr_10y:.4f}")
print(f"Infinite window:    {corr_inf:.4f}")


Correlation Coefficients: Old vs New Citation Counts
Sample size: 85,588 papers
  3-year window:    0.9474
  5-year window:    0.9524
 10-year window:    0.9792
Infinite window:    0.9892
