### **Patent Distance**

**Reference:**
 - Ahmadpoor, Mohammad, and Benjamin F. Jones. "The dual frontier: Patented inventions and prior scientific advance." Science 357.6351 (2017): 583-587.

In [1]:
import os
from functools import partial
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Raise pandas' display limits so wider/longer frames are shown when inspected
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 50)

# Keyword arguments shared by every progress bar in this notebook
TQDM_DEFAULTS = {"ncols": 100, "mininterval": 1}

# tqdm.pandas() has to be called while `tqdm` is still the imported class:
# the two lines after it rebind `tqdm` / `trange` to functools.partial
# wrappers that pre-apply the shared defaults.
tqdm.pandas(**TQDM_DEFAULTS)
tqdm = partial(tqdm, **TQDM_DEFAULTS)
trange = partial(trange, **TQDM_DEFAULTS)

In [None]:
# Load the citation network and index it as a plain dict:
#   citing_paper_id -> [(cited_paper_id, year), ...]
# The read/index/convert steps are chained so the intermediate DataFrame is
# never bound to a name.
dict_citing_to_cited_id_and_year = (
    pd.read_feather("intermediate/dict_citing_to_cited_id_and_year.feather")
    .set_index("citing_paperid")["cited_list"]
    .to_dict()
)

In [3]:
# Load the paper-patent mapping.
# Each row is one paper-patent record, so a paper linked to several patents
# appears in several rows (hence the separate unique-paper count below).
paper_patents = pd.read_parquet("../parquet/processed/paper_patents.parquet")

# Drop the first four characters of every id and parse the remainder as an
# integer (presumably a "pub." prefix, matching the prefix re-attached when
# the results are saved -- confirm against the source data).
numeric_ids = paper_patents["id"].str[4:].astype(int)
paper_patents["id"] = numeric_ids

record_count = len(paper_patents)
unique_paper_count = paper_patents["id"].nunique()
print(f"Loaded {record_count:,} paper-patent records")
print(f"Unique papers with patents: {unique_paper_count:,}")

Loaded 27,549,451 paper-patent records
Unique papers with patents: 5,176,635


In [None]:
# Breadth-first search outward from the patent-linked papers, following
# reference lists (citing paper -> the papers it cites).
#
#   distance 1 : papers that appear in paper_patents
#   distance d : papers cited by some distance-(d-1) paper that were not
#                already reached at a smaller distance
#
# Results left in the namespace:
#   distance_to_papers[i] -- set of paper ids at distance i + 1
#                            (the final entry is an empty set when the search
#                            stops because nothing new was found)
#   discovered_papers     -- union of every level found

# Distance 1: all papers with patents (starting point)
papers_with_patents = set(paper_patents.id.unique().tolist())
distance_to_papers = [papers_with_patents]  # distance_to_papers[0] = papers at distance 1

# Track all papers we've already discovered to avoid revisiting
discovered_papers = papers_with_patents.copy()

print(f"Starting BFS from {len(papers_with_patents):,} papers with patents\n")
print(f"{'Distance':<10} {'New Papers':<15} {'Cumulative Papers':<20}")
print("-" * 45)
print(f"{1:<10} {len(papers_with_patents):<15,} {len(discovered_papers):<20,}")

# Largest distance the search will explore; guards against a runaway loop.
MAX_DISTANCE = 100

# Distance 1 is the seed level, so expansion starts at distance 2 and stops at
# MAX_DISTANCE inclusive (previously the loop could run one level past the
# limit that the warning below reports).
for current_distance in range(2, MAX_DISTANCE + 1):
    papers_at_prev_distance = distance_to_papers[-1]

    # Gather the papers CITED BY the previous level. Accumulating straight
    # into a set, and skipping ids that already have a distance, avoids
    # materialising a list holding every citation edge of the level
    # (duplicates included) before deduplicating.
    papers_at_current_distance = set()
    for citing_paper_id in papers_at_prev_distance:
        # Papers absent from the citation network contribute no references
        cited_list = dict_citing_to_cited_id_and_year.get(citing_paper_id, [])
        papers_at_current_distance.update(
            cited_paper_id
            for cited_paper_id, year in cited_list
            if cited_paper_id not in discovered_papers
        )

    discovered_papers.update(papers_at_current_distance)
    distance_to_papers.append(papers_at_current_distance)

    # Log progress
    print(f"{current_distance:<10} {len(papers_at_current_distance):<15,} {len(discovered_papers):<20,}")

    # Stop if no new papers found
    if len(papers_at_current_distance) == 0:
        print(f"\nBFS completed: No new papers found at distance {current_distance}")
        print(f"Maximum distance reached: {current_distance - 1}")
        print(f"Total papers discovered: {len(discovered_papers):,}")
        break
else:
    # Runs only when the loop was never broken, i.e. the level at
    # MAX_DISTANCE was still non-empty and the search was cut short.
    print(f"\nWarning: Reached maximum distance limit ({MAX_DISTANCE})")
    print(f"Total papers discovered: {len(discovered_papers):,}")

In [7]:
# Turn the BFS levels into a long (paper_id, distance) table and save it.
#
# One DataFrame is built per level and the pieces are concatenated once,
# rather than appending one two-element Python list per paper.
level_frames = []

for distance_idx in trange(len(distance_to_papers), desc="Building records"):
    distance = distance_idx + 1  # Actual distance value
    papers_at_distance = distance_to_papers[distance_idx]

    # A normally terminated BFS leaves an empty set as its last level. Empty
    # levels are skipped explicitly so they add no rows and cannot affect
    # dtype inference during the concatenation below.
    if not papers_at_distance:
        continue

    level_frames.append(
        pd.DataFrame({
            "paper_id": list(papers_at_distance),
            "distance": distance,  # scalar, broadcast to every row of the level
        })
    )

if level_frames:
    # ignore_index=True yields a default RangeIndex, which to_feather requires
    paper_patent_distance_df = pd.concat(level_frames, ignore_index=True)
else:
    # No papers at all: keep the expected schema so the code below still runs
    paper_patent_distance_df = pd.DataFrame(columns=['paper_id', 'distance'])

print(f"\nCreated {len(paper_patent_distance_df):,} paper-distance records")

# Re-attach the "pub." prefix so ids are written back out as prefixed strings
paper_patent_distance_df["paper_id"] = "pub." + paper_patent_distance_df["paper_id"].astype(str)

# Display summary statistics
print("Dataset Summary:")
print(f"  Total papers: {len(paper_patent_distance_df):,}")
print(f"  Distance range: {paper_patent_distance_df['distance'].min()} - {paper_patent_distance_df['distance'].max()}")
print("\nDistance distribution:")
print(paper_patent_distance_df['distance'].value_counts().sort_index().head(20))

# Make sure the output directory exists before writing both copies
os.makedirs("data", exist_ok=True)
paper_patent_distance_df.to_parquet("data/paper_patent_distance.parquet")
paper_patent_distance_df.to_feather("data/paper_patent_distance.feather")
print("\n\nData saved.")

Building records: 100%|█████████████████████████████████████████████| 41/41 [03:41<00:00,  5.41s/it]



Created 60,863,881 paper-distance records
Dataset Summary:
  Total papers: 60,863,881
  Distance range: 1 - 40

Distance distribution:
distance
1      5176635
2     18500590
3     18237356
4      9773323
5      4579080
6      2155390
7      1061638
8       548265
9       299288
10      174194
11      108459
12       72341
13       49444
14       36112
15       25516
16       18637
17       13331
18        9091
19        6593
20        4656
Name: count, dtype: int64


Data saved.
