# Collect patent data

In [None]:
# import uspto
import pandas as pd
import polars as pl
import numpy as np
import datetime
import time
import re
import copy
import math
import multiprocess as mp
import sys
sys.path.append("/share/tml_package/")
sys.path.append("/share/uspto_pkg")
import json, csv, time, os
import uspto.search
import uspto.crawl, uspto.parse
from uspto.analysis import miscellaneous, citation, claim
import re
import traceback
import pickle
from tqdm import tqdm
from utils import DotDict
from pathlib import Path
import pyarrow as pa
from collections.abc import Iterable, Mapping

In [None]:
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Every value stored through ``__setitem__``/``update``/attribute assignment
    is normalised by :meth:`build`: nested mappings become instances of this
    class, and non-string iterables are rebuilt with their own container type
    so that mappings inside them are converted too.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict`` and convert nested values."""
        dict.__init__(self)
        self.update(*args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__name__}({super().__repr__()})"

    def __getattr__(self, name):
        """Return ``self[name]``, falling back to ``@name`` and then ``#name``.

        Raises:
            AttributeError: if none of the three keys is present.
        """
        try:
            return self[name]
        except KeyError:
            # No key by that name? Let's try being helpful.
            if f"@{name}" in self:
                # Does this same name occur starting with ``@``?
                return self[f"@{name}"]
            if f"#{name}" in self:
                # Does this same name occur starting with ``#``?
                return self[f"#{name}"]
            # Otherwise, raise the original exception
            raise AttributeError(name)

    def __setattr__(self, name, val):
        """Store ``val`` under the key ``name`` (attributes are items)."""
        self.__setitem__(name, val)

    def __delattr__(self, name):
        """Delete the key ``name``; a missing key raises ``KeyError``."""
        self.__delitem__(name)

    def __setitem__(self, key, val):
        """Store ``val`` after converting it with :meth:`build`."""
        dict.__setitem__(self, key, self.__class__.build(val))

    def __iter__(self):
        """Yield the mapping itself exactly once (not its keys)."""
        return iter([self])

    def update(self, *args, **kwargs):
        """Merge items given in any form ``dict`` accepts, converting values.

        Conversion happens once, inside ``__setitem__``. Building the value
        here as well would rebuild every nested mapping twice per level,
        which is exponential in the nesting depth.
        """
        for key, val in dict(*args, **kwargs).items():
            self[key] = val

    @classmethod
    def load(cls, file):
        """Read the JSON document at path ``file`` into an instance of ``cls``.

        The file is decoded as UTF-8, and the result has the type the method
        was called on, so subclasses get instances of themselves.
        """
        with open(file, 'r', encoding="utf-8") as f:
            return cls(json.load(f))

    @classmethod
    def build(cls, obj):
        """Recursively convert ``obj`` for storage.

        Mappings become ``cls`` instances; other iterables are rebuilt as
        ``type(obj)(converted items)``; everything else is returned as is.
        """
        if isinstance(obj, Mapping):
            # Return a new DotDict object wrapping `obj`.
            return cls(obj)
        if isinstance(obj, (str, bytes, bytearray)):
            # Text and byte strings are iterable but are leaf values.
            return obj
        if isinstance(obj, Iterable):
            # Build each item in the `obj` sequence,
            # then construct a new sequence matching `obj`'s type.
            return obj.__class__(cls.build(x) for x in obj)
        # In all other cases, return `obj` unchanged.
        return obj


In [None]:
# Non-greedy match of a single HTML/XML tag.
RE_HTML = re.compile('<.*?>')
# Reference to another claim by number ("claim 1", "Claims 2 or 3", "claims 1-4").
# The optional "s" covers multiple-dependent wording, which would otherwise be
# classified as an independent claim.
DEP_PATTERN = re.compile(r'(?i)\bclaims?\s+\d+\b')

## USPTO AI patent dataset

In [None]:
# Location of the AI patent dataset CSV.
ai_csv_path = "data/ai_patents_2023.csv"

# Column names taken from the header row only.
ai_cols = pd.read_csv(ai_csv_path, nrows=1).columns

# Columns and dtypes handed to the lazy polars reader.
ai_schema = {
    "doc_id": pl.Utf8,
    "flag_patent": pl.Int32,
    "pub_dt": pl.Utf8,
    "appl_id": pl.Utf8,
    "predict50_any_ai": pl.Int32,
    "predict86_any_ai": pl.Int32,
    "predict93_any_ai": pl.Int32,
}
ai_patents_ldf = pl.scan_csv(
    ai_csv_path,
    separator=",",
    schema=ai_schema,
    truncate_ragged_lines=True,
)

# Keep rows with flag_patent == 1, then materialise the result as pandas.
flagged_rows = ai_patents_ldf.filter(pl.col("flag_patent") == 1)
ai_patents = flagged_rows.collect().to_pandas()

In [None]:
# Publication year = text before the first "-" of pub_dt, for rows that have a date.
known_pub_dt = ai_patents["pub_dt"].dropna()
pub_years = known_pub_dt.apply(lambda date_text: date_text.split("-")[0]).astype(int)
ai_patents.loc[:, "pub_yr"] = pub_years

In [None]:
# Selects which predict{N}_any_ai column is used as the AI flag below
# (the loaded schema provides N = 50, 86 and 93).
ai_thre = 93

In [None]:
# Rows whose prediction flag at the chosen threshold equals 1.
ai_flag = ai_patents[f"predict{ai_thre}_any_ai"]
flagged_index = ai_flag[ai_flag == 1].index
ai_patents_predicted = ai_patents.loc[flagged_index]

In [None]:
# Restrict to publication years 2000 through 2019 (both inclusive).
predicted_pub_yr = ai_patents_predicted["pub_yr"]
within_period = (predicted_pub_yr >= 2000) & (predicted_pub_yr <= 2019)
ai_patents_period = ai_patents_predicted[within_period]

In [None]:
# Notebook preview of the rows kept for publication years 2000-2019.
ai_patents_period

In [None]:
# Sampling

In [None]:
# Keep only rows whose doc_id is made of digits alone (an empty id also
# passes, as before). The vectorised string match replaces a Python-level
# row-by-row apply, always yields a boolean Series (also for an empty frame),
# and treats a missing doc_id as non-numeric instead of raising.
digit_pns = ai_patents_period["doc_id"].str.fullmatch(r"\d*", na=False)
ai_patents_digit = ai_patents_period[digit_pns]

In [None]:
# One array of row labels per publication year, in order of first appearance.
year_index_arrays = []
for yr in ai_patents_digit["pub_yr"].unique():
    rows_of_year = ai_patents_digit[ai_patents_digit["pub_yr"] == yr]
    year_index_arrays.append(rows_of_year.index.values)
indexes_per_yr = pd.Series(year_index_arrays)

In [None]:
# Number of rows available for each publication year.
indexes_per_yr.apply(len)

In [None]:
# When True, rows are drawn at random per publication year in the next cell;
# when False every digit-only row is kept.
do_sampling = False

In [None]:
if not do_sampling:
    # No sampling: keep every row and renumber the index from 0.
    ai_patents_sampled = ai_patents_digit.reset_index(drop=True)
else:
    # Seeded draw without replacement of a fixed share of each year's rows.
    np.random.seed(1)
    sampling_ratio = 1
    per_year_draws = [
        np.random.choice(year_idx, int(len(year_idx) * sampling_ratio), replace=False)
        for year_idx in indexes_per_yr
    ]
    sampled_indexes = np.concatenate(per_year_draws)
    ai_patents_sampled = ai_patents_digit.loc[sampled_indexes].reset_index(drop=True)

In [None]:
# Notebook preview of the rows carried forward (sampled or complete).
ai_patents_sampled

In [None]:
# Non-missing doc_id values as strings; non-string values are converted through int.
doc_ids_present = ai_patents_sampled["doc_id"].dropna()
patent_ids_ai = doc_ids_present.apply(
    lambda value: value if isinstance(value, str) else str(int(value))
)

In [None]:
# Notebook preview of the patent id strings.
patent_ids_ai

In [None]:
# Non-missing appl_id values as strings; non-string values are converted through int.
appl_ids_present = ai_patents_sampled["appl_id"].dropna()
ai_appl = appl_ids_present.apply(
    lambda value: value if isinstance(value, str) else str(int(value))
)

In [None]:
# Notebook preview of the application id strings.
ai_appl

## Extract citation records from PatentsView

### Load data

In [None]:
# Directory holding the TSV files read below
# (g_patent.tsv, g_us_patent_citation.tsv, g_application.tsv).
data_dir = "/share/patentsview/"

**g_us_patent_citation**
- patent_id: ID of the citing patent
- citation_patent_id: ID of the cited patent
- citation_date: grant date of the cited patent

In [None]:
# The same ids as a polars Series, for use in lazy is_in filters.
patent_ids_ai_pl = pl.Series(name="patent_id", values=patent_ids_ai)

In [None]:
# Lazy view of g_patent.tsv reduced to the id and patent_date columns.
g_patent_dates_path = os.path.join(data_dir, "g_patent.tsv")
ldf_granted_dates = pl.scan_csv(
    g_patent_dates_path,
    separator="\t",
    infer_schema=False,
    low_memory=True,
).select(["patent_id", "patent_date"])

In [None]:
# Lazy view of every column of g_patent.tsv.
g_patent_full_path = os.path.join(data_dir, "g_patent.tsv")
ldf_granted_patents = pl.scan_csv(
    g_patent_full_path,
    separator="\t",
    infer_schema=False,
    low_memory=True,
)

In [None]:
# Materialise the whole g_patent.tsv table (all columns) in memory.
granted_patents = ldf_granted_patents.collect()

In [None]:
# Notebook preview of the collected g_patent table.
granted_patents

In [None]:
# Lazy citation table without rows that lack a citing or cited id.
citation_tsv_path = os.path.join(data_dir, "g_us_patent_citation.tsv")
ldf_cit = pl.scan_csv(
    citation_tsv_path,
    separator="\t",
    infer_schema=False,
    low_memory=True,
).drop_nulls(["patent_id", "citation_patent_id"])

# Citations whose cited patent is one of the selected ids.
cited_is_selected = pl.col("citation_patent_id").is_in(patent_ids_ai_pl)
filtered_cit = ldf_cit.filter(cited_is_selected).select(
    ["patent_id", "citation_patent_id", "citation_date"]
)

In [None]:
# Attach patent_date of the citing patent; citing ids missing from g_patent are dropped.
joined_cit = filtered_cit.join(ldf_granted_dates, how="inner", on="patent_id")

In [None]:
# Year of the citing patent = first four characters of patent_date, as Int32.
citing_year_expr = (
    pl.col("patent_date").str.slice(0,4).cast(pl.Int32).alias("citing_year")
)
with_citing_year = joined_cit.select([pl.all(), citing_year_expr])

# Keep citing years 1999 through 2021 and only the three columns used later.
year_in_window = (pl.col("citing_year") >= 1999) & (pl.col("citing_year") < 2022)
citing_filtered = with_citing_year.filter(year_in_window).select(
    ["patent_id", "citation_patent_id", "citing_year"]
)

In [None]:
# Execute the lazy citation query and keep the result in memory.
valid_citations = citing_filtered.collect()

In [None]:
# Notebook preview of the filtered citation pairs.
valid_citations

**g_application**
- filing_date: date of application filing

In [None]:
# Lazy view of g_application.tsv reduced to the id and filing_date columns.
g_application_path = os.path.join(data_dir, "g_application.tsv")
ldf_appl = pl.scan_csv(
    g_application_path,
    separator="\t",
    infer_schema=False,
    low_memory=True,
).select(["patent_id", "filing_date"])

In [None]:
# Execute the lazy query: patent_id / filing_date pairs as a polars DataFrame.
application_dates = ldf_appl.collect()

In [None]:
# patent_id -> filing_date lookup; for a repeated patent_id the last row wins.
appl_patent_ids = application_dates["patent_id"].to_list()
appl_filing_dates = application_dates["filing_date"].to_list()
application_dates_map = dict(zip(appl_patent_ids, appl_filing_dates))

In [None]:
# Rebind the name to a pandas DataFrame; from here on application_dates is
# no longer a polars frame.
application_dates = application_dates.to_pandas()

In [None]:
# Index the pandas frame by patent_id so filing dates can be looked up by id.
application_dates = application_dates.set_index("patent_id")

### Extract patent documents by patent ID

In [None]:
# Tag of the analysis run whose dataset workbook is read below.
analysis_date = "2025-06-07_1732" # AI, CPC, 2025 scientometrics rev.

# Project root and patent data directory (data_dir replaces the earlier value).
root_dir = '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'
data_dir = "/home2/glee/patent_data/data/"

In [None]:
# Read the TEST_dataset sheet of the dataset workbook written for this analysis run.
result_dir = os.path.join(root_dir, "results")
dataset_workbook = os.path.join(result_dir, "[DATASET]"+analysis_date+".xlsx")
used_test_data = pd.read_excel(dataset_workbook, sheet_name="TEST_dataset")

In [None]:
# Year columns "1976" .. "2022" as strings (no "<1976" bucket is included).
col_years = [str(year) for year in range(1976, 2023)]

# Metadata columns followed by one column per year.
meta_cols = [
    'number', 'granted_year', 'granted_date', 'application_year', 'application_date',
    'assignee', 'main_cpc', 'sub_cpc', 'main_ipc', 'sub_ipc',
    'backward_refs', 'NPL_refs', 'n_NPL_refs', 'forward_refs',
    'application_year_backward_refs', 'application_year_forward_refs',
    'patent_family', 'claims',
]
cols = meta_cols + col_years

In [None]:
# ─── Settings ────────────────────────────────────────────────────────────────
data_dir = "/share/patentsview"
json_dir = "/data/uspto/USPAT/"     # directory containing the individual .json files
# ndjson_path = "USPTO_1976-2022.ndjson" # converted line-delimited JSON file
batch_size = 5000                 # number of patent ids processed per batch
# ──────────────────────────────────────────────────────────────────────────────

In [None]:
# Split the ids into len // batch_size roughly equal parts. At least one part
# is always requested, because np.array_split raises on 0 sections, which is
# what happens when there are fewer ids than batch_size.
n_id_sections = max(1, len(patent_ids_ai) // batch_size)
batched_valid_patent_ids = np.array_split(patent_ids_ai, n_id_sections)

In [None]:
# List of every *.json path under json_dir, cached in a pickle in the working directory.
if not os.path.exists("patent_file_list.pickle"):
    # No cache yet: walk the directory tree once and store the result.
    all_json_path = [str(found) for found in Path(json_dir).rglob("*.json")]
    with open("patent_file_list.pickle", "wb") as cache_file:
        pickle.dump(all_json_path, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Reuse the cached listing (locally written file).
    with open("patent_file_list.pickle", "rb") as cache_file:
        all_json_path = pickle.load(cache_file)

In [None]:
# File stem (name without extension) -> full path; a later duplicate stem overwrites an earlier one.
json_file_map = {}
for json_path in all_json_path:
    json_file_map[Path(json_path).stem] = json_path

In [None]:
# Lazy patent_id / filing_date view used inside the batch loop.
application_tsv = f"{data_dir}/g_application.tsv"
app_ldf = pl.scan_csv(
    application_tsv,
    separator="\t",
    infer_schema=False,
    low_memory=True,
)
app_ldf = app_ldf.select(["patent_id", "filing_date"])

In [None]:
# Consecutive slices of at most batch_size ids; the last slice may be shorter.
batch_starts = range(0, len(patent_ids_ai), batch_size)
batches = [patent_ids_ai[start : start + batch_size] for start in batch_starts]

In [None]:
# For every batch of patent ids, build a lazy frame that combines
# forward-citation aggregates with metadata read from the per-patent JSON
# files. Nothing is executed here; the lazy frames are gathered in results_lazy.
results = []  # not filled in this loop
results_lazy = []
for batch in tqdm(batches, desc="Batches"):
    # Ids of this batch as a polars Series, used for the membership filter below.
    lookup = pl.Series("patent_number", batch)
    # Paths of the JSON files that exist for these ids; ids without a file are skipped.
    batch_paths = [json_file_map[pn] for pn in lookup if pn in json_file_map]
    # No JSON file for any id in this batch: nothing to read.
    if len(batch_paths) == 0: continue
    
    # Citation rows whose cited patent belongs to this batch.
    filtered = valid_citations.lazy().filter(pl.col("citation_patent_id").is_in(lookup))
    
    # Add the citing patent's filing_date and its year (first four characters as Int32).
    # The left join keeps citations whose citing patent has no application row.
    joined = (
        filtered
        .join(app_ldf, on="patent_id", how="left")
        .with_columns(
            pl.col("filing_date")
              .str.slice(0,4)
              .cast(pl.Int32)
              .alias("filing_year")
        )
    )
    
    # One row per cited patent: citing ids and filing years joined with ";",
    # citing years kept as a list of strings.
    # NOTE(review): forward_refs is de-duplicated with unique() while
    # filing_years and citing_years are not, so the three may differ in
    # length/order when a citing id repeats — confirm.
    # NOTE(review): presumably a null filing_year makes list.join return null
    # for the whole row unless ignore_nulls is set — verify against the
    # installed polars version.
    grouped = (
        joined
        .group_by("citation_patent_id")
        .agg([
            pl.col("patent_id").unique().alias("forward_refs"),
            pl.col("filing_year").alias("filing_years"),
            pl.col("citing_year").cast(pl.Utf8).alias("citing_years"),
        ])
        .select([
            pl.col("citation_patent_id").alias("patent_number"),
            pl.col("forward_refs").list.join(";").alias("forward_refs"),
            pl.col("filing_years").cast(pl.List(pl.Utf8)).list.join(";").alias("application_year_forward_refs"),
            pl.col("citing_years")
        ])
        .sort("patent_number")
    )
    
    # Lazily read this batch's JSON files with every listed field typed as a
    # string, then rename and derive the metadata columns.
    meta_ldf = pl.scan_ndjson(
        batch_paths,
        schema={
            "publicationReferenceDocumentNumber": pl.Utf8,
            "datePublished": pl.Utf8,
            "applicationYear": pl.Utf8,
            "cpcInventiveFlattened": pl.Utf8,
            "cpcAdditionalFlattened": pl.Utf8,
            "intlPubClassificationPrimary": pl.Utf8,
            "intlPubClassificationSecondary": pl.Utf8,
            "urpn": pl.Utf8,
            "claimsHtml": pl.Utf8
        }
        ).select([
            pl.col("publicationReferenceDocumentNumber").alias("patent_number"),
            # granted_date / granted_year are the first 10 / 4 characters of datePublished.
            pl.col("datePublished").str.slice(0,10).alias("granted_date"),
            pl.col("datePublished").str.slice(0,4).cast(pl.Int32).alias("granted_year"),
            pl.col("applicationYear").cast(pl.Int32).alias("application_year"),
            pl.col("cpcInventiveFlattened").alias("main_cpc"),
            pl.col("cpcAdditionalFlattened").alias("sub_cpc"),
            pl.col("intlPubClassificationPrimary").alias("main_ipc"),
            pl.col("intlPubClassificationSecondary").alias("sub_ipc"),
            pl.col("urpn").alias("backward_refs"),
            # claims_org keeps the raw HTML; claims has everything between "<" and ">" removed.
            pl.col("claimsHtml").alias("claims_org"),
            pl.col("claimsHtml").str.replace_all(r"<.*?>", "").alias("claims")]
        # Rows without a main CPC or without claim text are discarded.
        ).drop_nulls(["main_cpc", "claims"]
        # Remove ";" from the claim text, split it before each "<digits>." that
        # follows a period, and join with ";" the pieces that do NOT match
        # DEP_PATTERN (no reference to another claim).
        ).with_columns(pl.col("claims").map_elements(lambda s: ";".join([c for c in re.split(r'(?<=\.)\s*(?=\d+\.)', s.replace(";","")) if not DEP_PATTERN.search(c)]), return_dtype=pl.Utf8).alias("claims_independent")
        # Same split, keeping the pieces that DO match DEP_PATTERN.
        ).with_columns(pl.col("claims").map_elements(lambda s: ";".join([c for c in re.split(r'(?<=\.)\s*(?=\d+\.)', s.replace(";","")) if DEP_PATTERN.search(c)]), return_dtype=pl.Utf8).alias("claims_dependent")
        # Only patents with granted_year in 2000-2019 are kept.
        ).filter((pl.col("granted_year") >= 2000) & (pl.col("granted_year") <= 2019))
                       
    # Right join: every metadata row is kept, with citation aggregates where
    # the patent was cited; patent_number is moved to the first column.
    batch_res = (
        grouped
        .join(meta_ldf, on="patent_number", how="right")
        .select(pl.col("patent_number"), pl.exclude("patent_number"))
    )
                       
    results_lazy.append(batch_res)

In [None]:
# Execute all batch queries and stack the results.
results = pl.concat(results_lazy, how="vertical_relaxed").collect()

# Per patent: {citing year: number of citations}; an empty dict when there are none.
citing_years_pd = results.select(["citing_years"]).to_pandas()["citing_years"]
FC_counts_batch = citing_years_pd.apply(
    lambda years: {} if years is None else pd.Series(years).value_counts().to_dict()
)

# Zero-filled year table, overwritten with the observed counts where present.
df_FC = pd.DataFrame(columns=col_years, index=FC_counts_batch.index).fillna(0)
observed_counts = pd.DataFrame(FC_counts_batch.tolist()).fillna(0).astype(int)
df_FC.update(observed_counts)

In [None]:
# Append the per-year citation counts as extra columns and switch to pandas.
with_year_counts = pl.concat([results, pl.from_pandas(df_FC)], how="horizontal")
results_final = with_year_counts.to_pandas()

# Order by grant date; patent_number becomes the index and stays as a column.
results_final = results_final.sort_values(by="granted_date")
results_final = results_final.set_index("patent_number", drop=False)

In [None]:
# Notebook preview of the merged metadata and citation table.
results_final

In [None]:
# Decode the JSON text in backward_refs; a missing value becomes an empty list.
decoded_backward_refs = results_final["backward_refs"].apply(
    lambda raw: [] if raw is None else json.loads(raw)
)
results_final.loc[:, "backward_refs"] = decoded_backward_refs

### Save the collected patent data

In [None]:
# Inclusive granted_year window of the exported file; also used in its name.
# NOTE(review): the batch loop already keeps only granted_year 2000-2019, so
# this window drops 2000 and 2020 is empty — confirm the intended range.
start_yr = 2001
end_yr = 2020

In [None]:
# Rows granted between start_yr and end_yr, both inclusive.
granted_years = results_final["granted_year"]
in_export_window = (granted_years >= start_yr) & (granted_years <= end_yr)
out = results_final[in_export_window]

In [None]:
# The file name encodes the year window and, when sampling was used, the ratio.
config_name = f"[uspto_AI][{start_yr}-{end_yr}]"
if do_sampling:
    config_name = f"[uspto_AI][{start_yr}-{end_yr}][{sampling_ratio}sampling]"

# Write the export without the index column.
out.to_csv(f"data/collection_{config_name}.csv", index=False)