### **Reference Age and Citation**

This notebook computes reference age and citation counts for each paper's references.

For each citing paper, we compute:
- `cited_paper_ids`: list of referenced paper IDs
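- `cited_paper_ages`: age in years of each cited paper at the time of citation (`citing_year - cited_year`); the value is negative when the recorded year of the cited paper is later than that of the citing paper
- `cited_paper_citation_counts`: total citation count of each cited paper, i.e. the number of records in the citation table in which it appears as the cited paper (all citing years combined)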

In [1]:
import os
from functools import partial
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Pandas display limits (maximum rows / columns shown when a frame is rendered)
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50

# Give every progress bar the same layout: enable tqdm's pandas integration,
# then rebind `tqdm` / `trange` to variants with those keyword defaults baked in
tqdm.pandas(mininterval=1, ncols=100)
tqdm = partial(tqdm, mininterval=1, ncols=100)
trange = partial(trange, mininterval=1, ncols=100)

In [2]:
# Read the citation table (one row per citation record).
# Schema: citing_paperid (int), cited_paperid (int), citing_year (int), cited_year (int)
references = pd.read_feather('intermediate/citing_cited_paper_id_year.feather')

# Size check: number of records and of distinct papers on each side of a citation
print(
    f"Loaded {len(references):,} citation records\n"
    f"Unique citing papers: {references['citing_paperid'].nunique():,}\n"
    f"Unique cited papers: {references['cited_paperid'].nunique():,}\n"
    f"\nSample data:"
)
references.head()

Loaded 2,082,399,190 citation records
Unique citing papers: 77,926,751
Unique cited papers: 82,122,147

Sample data:


Unnamed: 0,citing_paperid,cited_paperid,citing_year,cited_year
0,1047469350,1036753132,2005,1982
1,1047469350,1058332925,2005,1983
2,1047469350,1064398364,2005,1972
3,1047469350,1064401023,2005,1958
4,1047469350,1064408021,2005,1983


In [3]:
# Reference age = year gap between the citing and the cited paper
# (citing_year - cited_year), stored as a new column on `references`.
# NOTE(review): the recorded run reports a range of -212 to 360, so some rows
# have cited_year > citing_year -- presumably metadata errors; confirm whether
# such rows should be filtered before aggregation.
references['cited_paper_age'] = references['citing_year'].sub(references['cited_year'])

print(
    "Reference age computed\n"
    f"Age range: {references['cited_paper_age'].min()} to {references['cited_paper_age'].max()}"
)
references.head()

Reference age computed
Age range: -212 to 360


Unnamed: 0,citing_paperid,cited_paperid,citing_year,cited_year,cited_paper_age
0,1047469350,1036753132,2005,1982,23
1,1047469350,1058332925,2005,1983,22
2,1047469350,1064398364,2005,1972,33
3,1047469350,1064401023,2005,1958,47
4,1047469350,1064408021,2005,1983,22


In [4]:
# Citation count of a paper = number of rows in which it occurs as `cited_paperid`.
# The counts are kept as a plain dict: paper_id -> citation count.
dict_paper_id_to_citation_count = (
    references['cited_paperid']
    .value_counts()
    .to_dict()
)
print(
    "Built paper-to-citation-count mapping for "
    f"{len(dict_paper_id_to_citation_count):,} cited papers"
)

# Attach to every citation record the citation count of the paper it cites
references['cited_paper_citation_count'] = references['cited_paperid'].map(
    dict_paper_id_to_citation_count
)
references.head()

Built paper-to-citation-count mapping for 82,122,147 cited papers


Unnamed: 0,citing_paperid,cited_paperid,citing_year,cited_year,cited_paper_age,cited_paper_citation_count
0,1047469350,1036753132,2005,1982,23,4
1,1047469350,1058332925,2005,1983,22,2
2,1047469350,1064398364,2005,1972,33,73
3,1047469350,1064401023,2005,1958,47,148
4,1047469350,1064408021,2005,1983,22,7


In [5]:
# Collapse the citation table to one row per citing paper
print("Aggregating reference info by citing paper...")

# Each aggregated column holds a list with one entry per reference of the paper;
# the three lists of a row are built from the same rows in the same order.
result = (
    references
    .groupby('citing_paperid')
    .agg(
        cited_paper_ids=('cited_paperid', list),
        cited_paper_ages=('cited_paper_age', list),
        cited_paper_citation_counts=('cited_paper_citation_count', list),
    )
    .reset_index()
    .rename(columns={'citing_paperid': 'paper_id'})
)

# Export paper IDs as strings carrying the "pub." prefix
result['paper_id'] = 'pub.' + result['paper_id'].astype(str)

print(f"\nAggregated reference info for {len(result):,} papers")
result.head()

Aggregating reference info by citing paper...

Aggregated reference info for 77,926,751 papers


Unnamed: 0,paper_id,cited_paper_ids,cited_paper_ages,cited_paper_citation_counts
0,pub.1000000002,"[1002143281, 1002975241, 1005757338, 102267567...","[2, 0, 2, 0, 2, 3, 0, 0, 2, 0, 7, 2, 3, 2]","[84, 8, 20, 32, 28, 104, 22, 18, 22, 36, 67, 3..."
1,pub.1000000006,"[1015758664, 1037457933, 1051031921, 105577665...","[1, 9, 7, 1, 5, 1, 2]","[48, 407, 50, 395, 1683, 215, 10]"
2,pub.1000000007,"[1000357624, 1001067672, 1001349503, 100181882...","[6, 12, 2, 5, 2, 3, 9, 3, 0, 4, 1, 9, 9, 5, 3,...","[14, 3192, 322, 255, 80, 45, 252, 312, 30, 651..."
3,pub.1000000008,"[1000384421, 1000666938, 1003119029, 100579821...","[28, 1, 27, 19, 3, 8, 8, 2, 6, 3, 23, 58, 14, ...","[12, 4, 510, 226, 16, 34, 194, 21, 6, 4, 49, 2..."
4,pub.1000000009,"[1001695924, 1004400545, 1006854670, 100876983...","[19, 6, 26, 3, 0, 6, 17, 2, 6, 12, 4, 8, 13, 7...","[199, 139, 3047, 25, 74, 34, 69, 34, 12, 213, ..."


In [6]:
# Save results: the same table is written once as Parquet and once as Feather.
parquet_path = 'data/paper_reference_age_citation.parquet'
feather_path = 'data/paper_reference_age_citation.feather'

# Create the output directory first; the writers below do not create missing
# parent directories, so a fresh checkout without `data/` would fail here.
os.makedirs(os.path.dirname(parquet_path), exist_ok=True)

result.to_parquet(parquet_path)
result.to_feather(feather_path)

print("Data saved to:")
print(f"  - {parquet_path}")
print(f"  - {feather_path}")
print(f"\nDataset Summary:")
print(f"  Total papers: {len(result):,}")
print(f"  Columns: {result.columns.tolist()}")

Data saved to:
  - data/paper_reference_age_citation.parquet
  - data/paper_reference_age_citation.feather

Dataset Summary:
  Total papers: 77,926,751
  Columns: ['paper_id', 'cited_paper_ids', 'cited_paper_ages', 'cited_paper_citation_counts']


In [7]:
# Summary statistics, pooled over every individual reference of every paper.
def _flatten_to_array(nested_values):
    """Return all values of a column of per-paper lists as one flat float64 array.

    The values are streamed through a generator into ``np.fromiter``, so no
    intermediate Python list with one element per reference is built, and the
    array is created once instead of being re-converted from a list by every
    NumPy call. float64 holds the integer ages / counts exactly (they are far
    below 2**53), so the statistics are unaffected by the dtype.
    """
    return np.fromiter(
        (value for values in nested_values for value in values),
        dtype=np.float64,
    )


def _print_distribution_summary(values, quantity, unit=""):
    """Print the count, mean, median and standard deviation of ``values``.

    ``quantity`` is the label used in the printed lines (e.g. "reference age");
    ``unit`` is appended verbatim after each statistic (e.g. " years").
    """
    print(f"  Total references: {len(values):,}")
    print(f"  Mean {quantity}: {np.mean(values):.2f}{unit}")
    print(f"  Median {quantity}: {np.median(values):.2f}{unit}")
    print(f"  Std {quantity}: {np.std(values):.2f}{unit}")


print("Reference Age Statistics:")
all_ages = _flatten_to_array(result['cited_paper_ages'])
_print_distribution_summary(all_ages, "reference age", unit=" years")

print("\nReference Citation Count Statistics:")
all_citations = _flatten_to_array(result['cited_paper_citation_counts'])
_print_distribution_summary(all_citations, "citation count")

Reference Age Statistics:
  Total references: 2,082,399,190
  Mean reference age: 10.12 years
  Median reference age: 7.00 years
  Std reference age: 11.04 years

Reference Citation Count Statistics:
  Total references: 2,082,399,190
  Mean citation count: 743.41
  Median citation count: 82.00
  Std citation count: 6486.53
