### **Policy Distance**

**Reference:**
 - Ahmadpoor, Mohammad, and Benjamin F. Jones. "The dual frontier: Patented inventions and prior scientific advance." Science 357.6351 (2017): 583-587.

**Note:** This notebook calculates the policy distance metric following the same methodology as patent distance, but using policy-paper citations instead of patent-paper citations.

In [1]:
import os
from functools import partial
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Configure pandas display options
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

# Configure tqdm for better progress bars
tqdm.pandas(ncols=100, mininterval=1)
tqdm = partial(tqdm, ncols=100, mininterval=1)
trange = partial(trange, ncols=100, mininterval=1)

In [2]:
# Load policy-paper DOI mapping
policy_to_doi = pd.read_csv('/kellogg/proj/dashun/policy/general/20240202/df_policy_to_doi.csv')

print(f"Loaded {len(policy_to_doi):,} policy-paper citation records")
print(f"Unique policies: {policy_to_doi['policy_document_id'].nunique():,}")
print(f"Unique DOIs: {policy_to_doi['dois_cited'].nunique():,}")
print("\nSample data:")
policy_to_doi.head()

Loaded 18,409,473 policy-paper citation records
Unique policies: 1,007,973
Unique DOIs: 5,734,162

Sample data:


Unnamed: 0.1,Unnamed: 0,policy_document_id,dois_cited
0,0,jiia-121176f43125848e5c73e939291a1829,10.1080/24761028.2014.11869071
1,0,jiia-121176f43125848e5c73e939291a1829,10.2307/2750627
2,1,jiia-1a118bd53fea6bd1b335adcf8fd16904,10.2307/2050758
3,2,jiia-1f818b4973fab3ff09462ef5dc7dcb25,10.1177/097492848103700103
4,2,jiia-1f818b4973fab3ff09462ef5dc7dcb25,10.1525/as.1997.37.4.01p0237s


In [3]:
# Load publications data and create DOI to paper_id mapping
# Note: paper id in Dimensions is formatted as "pub.XXXXXXXXX"
print("Loading publications data...")
papers = pd.read_feather("../parquet/processed/publications.feather")

# Create DOI to paper_id mapping (only for papers with DOI)
papers_with_doi = papers[papers['doi'].notna()][['id', 'doi']]
print(f"Total papers: {len(papers):,}")
print(f"Papers with DOI: {len(papers_with_doi):,}")

# Convert DOI to lowercase for case-insensitive matching
papers_with_doi['doi_lower'] = papers_with_doi['doi'].str.lower()

# Create mapping dict: doi -> paper_id (as integer)
# paper_id format is "pub.XXXXXXXXX", we extract the numeric part
papers_with_doi['paper_id'] = papers_with_doi['id'].str[4:].astype(int)
doi_to_paper_id = papers_with_doi.set_index('doi_lower')['paper_id'].to_dict()

print(f"Created DOI to paper_id mapping with {len(doi_to_paper_id):,} entries")

Loading publications data...
Total papers: 152,387,558
Papers with DOI: 144,556,840
Created DOI to paper_id mapping with 144,456,093 entries


In [4]:
# Map policy DOIs to paper_ids
policy_to_doi['doi_lower'] = policy_to_doi['dois_cited'].str.lower()
policy_to_doi['paper_id'] = policy_to_doi['doi_lower'].map(doi_to_paper_id)

# Check mapping success rate
total_citations = len(policy_to_doi)
mapped_citations = policy_to_doi['paper_id'].notna().sum()
print(f"Total policy-paper citations: {total_citations:,}")
print(f"Successfully mapped to paper_id: {mapped_citations:,} ({mapped_citations/total_citations*100:.2f}%)")
print(f"Unmapped (DOI not found in Dimensions): {total_citations - mapped_citations:,}")

# Get unique papers cited by policies
paper_policies = policy_to_doi[policy_to_doi['paper_id'].notna()][['paper_id', 'policy_document_id']].copy()
paper_policies['paper_id'] = paper_policies['paper_id'].astype(int)

print(f"\nUnique papers cited by policies: {paper_policies['paper_id'].nunique():,}")
print(f"Unique policies citing papers: {paper_policies['policy_document_id'].nunique():,}")

Total policy-paper citations: 18,409,473
Successfully mapped to paper_id: 17,361,459 (94.31%)
Unmapped (DOI not found in Dimensions): 1,048,014

Unique papers cited by policies: 5,282,873
Unique policies citing papers: 953,146


In [5]:
# Load citation network: citing_paper_id -> [(cited_paper_id, year), ...]
dict_citing_to_cited_id_and_year = pd.read_feather("intermediate/dict_citing_to_cited_id_and_year.feather")
dict_citing_to_cited_id_and_year = dict_citing_to_cited_id_and_year.set_index("citing_paperid").cited_list.to_dict()
print(f"Loaded citation network with {len(dict_citing_to_cited_id_and_year):,} citing papers")

Loaded citation network with 77,933,004 citing papers


In [6]:
# Initialize BFS
# Distance 1: All papers cited by policies (starting point)
papers_with_policies = set(paper_policies['paper_id'].unique().tolist())
distance_to_papers = [papers_with_policies]  # distance_to_papers[0] = papers at distance 1

# Track all papers we've already discovered to avoid revisiting
discovered_papers = papers_with_policies.copy()

# Number of unique policies (source)
num_policies = paper_policies['policy_document_id'].nunique()

print(f"{'Distance':<10} {'New Papers':<15} {'Cumulative Papers':<20}")
print("-" * 45)
print(f"{'Policies':<10} {num_policies:<15,} {'-':<20}")
print(f"{1:<10} {len(papers_with_policies):<15,} {len(discovered_papers):<20,}")

# Maximum iterations to prevent infinite loops
MAX_DISTANCE = 100

# BFS traversal
for distance in range(MAX_DISTANCE):
    current_distance = distance + 2  # We start at distance 1, so next is 2
    
    # Find all papers that cite papers at the previous distance level
    papers_at_prev_distance = distance_to_papers[-1]
    papers_at_current_distance = []
    
    for citing_paper_id in papers_at_prev_distance:
        # Get all papers cited by this paper
        cited_list = dict_citing_to_cited_id_and_year.get(citing_paper_id, [])
        cited_paper_ids = [cited_paper_id for cited_paper_id, year in cited_list]
        papers_at_current_distance.extend(cited_paper_ids)
    
    papers_at_current_distance = set(papers_at_current_distance) - discovered_papers
    discovered_papers.update(papers_at_current_distance) 
    distance_to_papers.append(papers_at_current_distance)
    
    # Log progress
    print(f"{current_distance:<10} {len(papers_at_current_distance):<15,} {len(discovered_papers):<20,}")
    
    # Stop if no new papers found
    if len(papers_at_current_distance) == 0:
        print(f"\nBFS completed: No new papers found at distance {current_distance}")
        print(f"Maximum distance reached: {current_distance - 1}")
        print(f"Total papers discovered: {len(discovered_papers):,}")
        break
else:
    # Only print warning if we actually hit the MAX_DISTANCE limit
    print(f"\nWarning: Reached maximum distance limit ({MAX_DISTANCE})")
    print(f"Total papers discovered: {len(discovered_papers):,}")

Distance   New Papers      Cumulative Papers   
---------------------------------------------
Policies   953,146         -                   
1          5,282,873       5,282,873           
2          17,997,999      23,280,872          
3          19,013,681      42,294,553          
4          9,145,288       51,439,841          
5          3,580,845       55,020,686          
6          1,520,268       56,540,954          
7          744,998         57,285,952          
8          417,122         57,703,074          
9          266,724         57,969,798          
10         189,095         58,158,893          
11         144,956         58,303,849          
12         119,871         58,423,720          
13         103,497         58,527,217          
14         94,636          58,621,853          
15         86,445          58,708,298          
16         77,618          58,785,916          
17         64,059          58,849,975          
18         49,968          58,899,943     

In [7]:
# Build list of (paper_id, distance) tuples
paper_policy_distance_df = []

# Skip the last element (empty set from termination condition)
for distance_idx in trange(len(distance_to_papers), desc="Building records"):
    distance = distance_idx + 1  # Actual distance value
    papers_at_distance = distance_to_papers[distance_idx]
    
    for paper_id in papers_at_distance:
        paper_policy_distance_df.append([paper_id, distance])

print(f"\nCreated {len(paper_policy_distance_df):,} paper-distance records")


# Convert to DataFrame
paper_policy_distance_df = pd.DataFrame(
    paper_policy_distance_df, 
    columns=['paper_id', 'distance'],
).astype({'paper_id': 'int64', 'distance': 'int64'})

paper_policy_distance_df["paper_id"] = "pub." + paper_policy_distance_df["paper_id"].astype(str)

# Display summary statistics
print("Dataset Summary:")
print(f"  Total papers: {len(paper_policy_distance_df):,}")
print(f"  Distance range: {paper_policy_distance_df['distance'].min()} - {paper_policy_distance_df['distance'].max()}")
print(f"\nDistance distribution:")
print(paper_policy_distance_df['distance'].value_counts().sort_index().head(20))

paper_policy_distance_df.to_parquet("data/paper_policy_distance.parquet")
paper_policy_distance_df.to_feather("data/paper_policy_distance.feather")
print("\n\nData saved.")

Building records: 100%|█████████████████████████████████████████████| 44/44 [03:35<00:00,  4.90s/it]



Created 59,037,193 paper-distance records
Dataset Summary:
  Total papers: 59,037,193
  Distance range: 1 - 43

Distance distribution:
distance
1      5282873
2     17997999
3     19013681
4      9145288
5      3580845
6      1520268
7       744998
8       417122
9       266724
10      189095
11      144956
12      119871
13      103497
14       94636
15       86445
16       77618
17       64059
18       49968
19       39157
20       30027
Name: count, dtype: int64


Data saved.
