In [1]:
root_dir = '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'
master_dir = '/home2/glee/dissertation/1_tech_gen_impact/master/Tech_Gen/'
import sys
sys.path.append(root_dir)

import copy
import os
import argparse
import time
import pickle
import re
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
sys.path.append("/share/tml_package")
from tqdm import tqdm

import torch
from accelerate import Accelerator

import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from data import TechDataset, CVSampler
from train_utils import EarlyStopping, perf_eval, objective_cv, build_model, train_model, validate_model_mp
from utils import token2class, DotDict, to_device

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Configuration

In [2]:
# Earlier analysis runs, kept for reference (each date selects a saved config/result set):
# analysis_date = "2023-05-04_1802" # Semiconductor
# analysis_date = "2023-05-09_0331" # AI
analysis_date = "2025-06-07_1732" # AI, CPC, 2025 scientometrics rev.
# Stand-in for the command-line arguments of the training script; only the
# attributes listed here exist on `args`.
args = argparse.Namespace(
    do_eval = True,
    do_save=False,
    config_file=os.path.join(root_dir, "configs", "USED_configs", "[CONFIGS]"+analysis_date+".json"),
    eval_train_set=False)

# Directory layout: pickled datasets live under master_dir, everything else under root_dir.
project_data_dir = os.path.join(master_dir, "data")
data_dir = os.path.join("/home2/glee/patent_data/data/")
model_dir = os.path.join(root_dir, "models")
result_dir = os.path.join(root_dir, "results")
config_dir = os.path.join(root_dir, "configs")

## parse configuration file
if args.config_file is not None:
    config_file = args.config_file
else:
    # `light` is not among the attributes set on `args` in this notebook, so read it
    # defensively instead of raising AttributeError when no config file is given.
    use_light_config = getattr(args, "light", False)
    config_file = os.path.join(config_dir, "configs_light.json" if use_light_config else "configs.json")
if args.do_eval: args.do_train = False
configs = DotDict().load(config_file)
# Remember which keys each config section owned originally, so that notebook-level
# arguments only override keys that already exist in that section.
org_config_keys = {key: list(configs[key].keys()) for key in configs.keys()}

# parse command line argument
instant_configs = {key: value for (key, value) in vars(args).items() if value is not None} # if any argument passed when main.py executed
instant_configs_for_update = {configkey: {key: value for (key,value) in instant_configs.items() if key in org_config_keys[configkey]} for configkey in org_config_keys.keys()}
for key, value in configs.items():
    value.update(instant_configs_for_update[key])

## assign loss weights
# The weights depend on which heads the model has:
#   "enc-pred-dec": the "recon" weight is normalised by the sum of all configured weights
#                   and "y" receives the remainder, so the two sum to 1;
#   "enc-pred":     prediction only ("y");
#   "enc-dec":      reconstruction only ("recon").
# Any other model_type leaves the configured weights untouched.
if configs.model.model_type == "enc-pred-dec":
    configs.train.loss_weights["recon"] = configs.train.loss_weights["recon"] / sum(configs.train.loss_weights.values())
    configs.train.loss_weights["y"] = 1 - configs.train.loss_weights["recon"]
elif configs.model.model_type == "enc-pred":
    configs.train.loss_weights = {"recon": 0, "y": 1}
elif configs.model.model_type == "enc-dec":
    configs.train.loss_weights = {"recon": 1, "y": 0}

## assign devices
# Produces `device` (main device) and `device_ids` for the rest of the notebook.
# NOTE(review): `device_ids` holds plain integers in the accelerator branch but
# torch.device objects in the manual CUDA branch — confirm consumers accept both.
if configs.train.use_accelerator:
    accelerator = Accelerator()
    device_ids = list(range(torch.cuda.device_count()))
    device = accelerator.device
    configs.train.update({"accelerator": accelerator})
else:
    if torch.cuda.is_available():
        device_ids = list(range(torch.cuda.device_count()))
        # Per GPU: split the text report of torch.cuda.list_gpu_processes on "GPU memory",
        # take the number that follows "uses" in each chunk (with "MB" and spaces removed)
        # and sum these numbers, giving the memory currently used by processes on that GPU.
        gpu_usages = [np.sum([float(usage.split("uses")[-1].replace(" ","").replace("MB","")) for usage in torch.cuda.list_gpu_processes(id).split("GPU memory") if not usage=="" and "no processes are running" not in usage]) for id in device_ids]
        # Keep the n_gpus devices with the lowest reported usage.
        device_ids = np.argsort(gpu_usages)[:configs.train.n_gpus]
        device_ids = list(map(lambda x: torch.device('cuda', x),list(device_ids)))
        device = device_ids[0] # main device
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')
        device_ids = []

## extract configurations for dataset
# Bracketed string fragments that are concatenated into data/model file names below.
config_period = "["+"-".join([str(year) for year in configs.data.target_period])+"]"
config_area = str(configs.data.target_area).replace("\'","").replace(" ","")
config_keywords = str(configs.data.target_keywords).replace("\'","").replace(" ","")
# Only present in file names when the data were subsampled (ratio below 1).
config_sampling_ratio = "["+str(configs.data.sampling_ratio)+"sampling"+"]" if configs.data.sampling_ratio < 1 else ""

## update configurations
# Push paths, device information and derived values into each config section so the
# objects built from them receive everything through a single configs argument.
configs.data.update({"root_dir": root_dir,
                        "data_dir": data_dir,
                        "model_dir": model_dir,
                        "result_dir": result_dir,
                        "pretrained_enc": configs.model.pretrained_enc,
                        "pretrained_dec": configs.model.pretrained_dec,
                        "data_nrows": None,
                        "data_file": "collection_" + "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + ".csv"})
# early_stop_patience is set to 30% of max_epochs (truncated to an integer).
configs.train.update({"device": device,
                        "device_ids": device_ids,
                        "root_dir": root_dir,
                        "data_dir": data_dir,
                        "model_dir": model_dir,
                        "use_keywords": configs.data.use_keywords,
                        "class_system": configs.data.class_system,
                        "curr_ep": 1,
                        "early_stop_patience": int(0.3*configs.train.max_epochs)})
configs.model.update({"device": device,
                        "device_ids": device_ids,
                        "n_directions": 2 if configs.model.bidirec else 1,
                        "use_accelerator": configs.train.use_accelerator,
                        "model_dir": model_dir})

## Set hyperparameters for model training (To be TUNED)
# Config keys that are spelled out in dataset / model file names. Defined before the
# branch because the dataset cell reads key_components["data"] in tuning mode as well;
# previously it only existed in the non-tuning branch (NameError when tuning).
key_components = {"data": ["class_level", "class_system", "max_seq_len_class", "max_seq_len_claim", "vocab_size"], "model": ["n_layers", "d_hidden", "d_pred_hidden", "d_latent", "d_embedding", "d_ff", "n_head", "d_head"], "train": ["learning_rate", "batch_size", "max_epochs", "curr_ep"]}
if configs.train.do_train and configs.train.do_tune:
    # Tuned values are unknown at this point, so clear them in the configs too.
    n_layers = configs.model.n_layers = None
    d_embedding = configs.model.d_embedding = None
    d_enc_hidden = configs.model.d_enc_hidden = None
    d_pred_hidden = configs.model.d_pred_hidden = None
    learning_rate = configs.train.learning_rate = None
    batch_size = configs.train.batch_size = None
    config_name = "HPARAM_TUNING"
    final_model_path = None
else:
    n_layers = configs.model.n_layers
    d_embedding = configs.model.d_embedding
    d_enc_hidden = configs.model.d_enc_hidden
    d_pred_hidden = configs.model.d_pred_hidden
    d_latent = configs.model.d_latent

    # Checkpoint name = data description + class system + "[value]key" for every
    # model/train key component, in the order listed in key_components.
    model_config_name_prefix = "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + "data"
    model_config_name = "" + model_config_name_prefix
    model_config_name += f"[{configs.data.class_system}]system"
    for key in ["model", "train"]:
        for component in key_components[key]:
            model_config_name += f"[{str(configs[key][component])}]{component}"
    final_model_path = os.path.join(model_dir, f"[MODEL]{model_config_name}.ckpt")

# configs.train.update({"model_config_name": model_config_name, "final_model_path": final_model_path})

# Dataset setting

In [3]:
''' PART 2: Dataset setting '''
tstart = time.time()
# The pickle file name spells out every data-related key component, so a dataset built
# with a different configuration never collides with this one.
dataset_config_name = "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + "data"
for component in key_components["data"]:
    dataset_config_name += f"[{str(configs.data[component])}]{component}"
dataset_path = os.path.join(project_data_dir, "pickled_dataset", "[DATASET]"+dataset_config_name+".pickle")

# `debug` is not among the attributes set on `args` in this notebook; default to False
# instead of raising AttributeError when the dataset has to be rebuilt.
debug_mode = getattr(args, "debug", False)

if os.path.exists(dataset_path) and args.do_save is False:
    print("Load pickled dataset...")
    # NOTE: unpickling executes arbitrary code; only load dataset files created by this project.
    with open(dataset_path, "rb") as f:
        tech_dataset = pickle.load(f)   # Load pickled dataset if dataset with same configuration already saved
    # Rebuild the tokenizers when the pickled dataset was made for other pretrained models.
    if tech_dataset.pretrained_enc != configs.data.pretrained_enc or tech_dataset.pretrained_dec != configs.data.pretrained_dec:
        tech_dataset.pretrained_enc = configs.data.pretrained_enc
        tech_dataset.pretrained_dec = configs.data.pretrained_dec
        tech_dataset.tokenizers = tech_dataset.get_tokenizers()
    # Some tokenizers expose only get_vocab_size(); give them a vocab_size attribute too.
    for tk in tech_dataset.tokenizers.values():
        if "vocab_size" not in dir(tk):
            tk.vocab_size = tk.get_vocab_size()
    tech_dataset.use_keywords = configs.data.use_keywords
    ## load saved rawdata
    if tech_dataset.rawdata is None:
        tech_dataset.rawdata = pd.read_csv(os.path.join(data_dir, configs.data.data_file), low_memory=False)
    print("Pickled dataset loaded")
else:
    print("Make dataset...")
    if debug_mode:
        configs.data.update({"data_nrows": 1000})
        dataset_path += ".debug"
    tech_dataset = TechDataset(configs.data)
    if not debug_mode:
        # Pickle without the raw table to keep the file small. Holding a reference is
        # enough to restore it afterwards (no deep copy of the whole DataFrame), and the
        # finally clause restores it even if writing the pickle fails.
        rawdata_for_save = tech_dataset.rawdata
        tech_dataset.rawdata = None
        try:
            with open(dataset_path, "wb") as f:
                pickle.dump(tech_dataset, f)
        finally:
            tech_dataset.rawdata = rawdata_for_save
tend = time.time()

# Sequence lengths, output size and padding id are only known once the dataset exists.
configs.model.update({"tokenizers": tech_dataset.tokenizers,
                    "n_enc_seq_claim": tech_dataset.max_seq_len_claim,
                    "n_dec_seq_claim": tech_dataset.max_seq_len_claim,
                    "n_enc_seq_class": tech_dataset.max_seq_len_class,
                    "n_dec_seq_class": tech_dataset.max_seq_len_class,
                    "n_outputs": 1 if configs.data.pred_type=="regression" else tech_dataset.n_outputs,
                    "i_padding": tech_dataset.tokenizers["class_enc"].pad_id})

Load pickled dataset...
Pickled dataset loaded


# Load model

In [4]:
final_model = build_model(configs.model, tokenizers=tech_dataset.tokenizers)
# final_model_path is None in hyperparameter-tuning mode: there is no trained model to load.
if final_model_path is None:
    raise Exception("Model need to be trained first")

# Checkpoints share the configured name up to "max_epochs" and differ in the trailing
# "[<epoch>]curr_ep" part; pick the one saved at the highest epoch.
final_model_finder = final_model_path.split("[MODEL]")[-1].split("max_epochs")[0]+"max_epochs"
matched_ckpts = [f for f in os.listdir(model_dir) if final_model_finder in f]
# Without this guard np.argmax([]) raises an unrelated ValueError and the intended
# message below was never shown.
if not matched_ckpts:
    raise Exception("Model need to be trained first")
latest_ckpt_index = np.argmax([int(f.split("curr_ep")[0].split("[")[-1].replace("]","")) for f in matched_ckpts])
final_model_path = os.path.join(model_dir, matched_ckpts[latest_ckpt_index])
best_states = torch.load(final_model_path, map_location=device)

# Checkpoints saved from a wrapped model carry a "module." prefix on every key; drop it
# so the keys match the unwrapped model built above.
has_module_prefix = any(k.startswith("module.") for k in best_states.keys())
if has_module_prefix:
    best_states = {
        (k[len("module."):] if k.startswith("module.") else k): v
        for k, v in best_states.items()
    }
final_model.load_state_dict(best_states)

# Release the loaded weights copy before inference.
del best_states
torch.cuda.empty_cache()
print("Model successfully loaded")

Model successfully loaded


In [5]:
 # Report the parameter count on PyTorch 1.x only (same gate as before). The previous
 # pattern "^1." had an unescaped dot and therefore also matched versions such as "10.x".
 if torch.__version__.startswith("1."):
        model_size = sum(t.numel() for t in final_model.parameters())
        print(f"Model size: {model_size/1000**2:.1f}M paramaters")

Model size: 1.5M paramaters


In [6]:
result_path = os.path.join(root_dir, "results")
# Read both sheets in one pass: passing a list of sheet names returns a dict of
# DataFrames, so the workbook is opened and parsed once instead of twice.
used_dataset_file = os.path.join(result_dir, "[DATASET]"+analysis_date+".xlsx")
used_dataset_sheets = pd.read_excel(used_dataset_file, sheet_name=["TRAIN_dataset", "TEST_dataset"])
used_train_data = used_dataset_sheets["TRAIN_dataset"]
used_test_data = used_dataset_sheets["TEST_dataset"]

In [7]:
used_test_data

Unnamed: 0,patent_number,patent_number.1,main_class,sub_class,patent_classes,claims,TC5,TC5_digitized,class
0,8423759,8423759,"['H04L009', 'H04L063', 'H04L009', 'H04L009', '...",[],"['H04L009', 'H04L063', 'H04L009', 'H04L009', '...",1. A method of bootstrapping configuration for...,39,1,2335
1,7268723,7268723,['G01S013'],['G01S013'],"['G01S013', 'G01S013']",1. A system for measuring the position of a ta...,3,0,1788
2,7508332,7508332,"['H03M001', 'G01K007', 'G01K007']","['H03M001', 'H03M001', 'H03M001', 'H03M001']","['H03M001', 'G01K007', 'G01K007', 'H03M001', '...",1. An on die thermal sensor (ODTS) of a semico...,3,0,2307
3,8095229,8095229,"['G05B019', 'G06Q050', 'G06Q010', 'G16Z099']","['Y02P090', 'Y02P090']","['G05B019', 'G06Q050', 'G06Q010', 'G16Z099', '...",1. A computer implemented method for defining ...,2,0,1882
4,8825655,8825655,"['G06F016', 'G06F016', 'G06F016', 'G06V030', '...",[],"['G06F016', 'G06F016', 'G06F016', 'G06V030', '...",1. A system comprising: one or more processors...,2,0,1916
...,...,...,...,...,...,...,...,...,...
13361,7496557,7496557,['G06F016'],"['Y10S707', 'Y10S707']","['G06F016', 'Y10S707', 'Y10S707']",1. A method of crawling at least one website c...,3,0,1916
13362,8483738,8483738,"['G06F021', 'G06F021']","['H04W008', 'H04W004']","['G06F021', 'G06F021', 'H04W008', 'H04W004']","1. A child telecommunications device, comprisi...",0,0,1919
13363,8838301,8838301,"['B61L029', 'B61L025', 'B61L029', 'B61L029']",['B61L025'],"['B61L029', 'B61L025', 'B61L029', 'B61L029', '...",1. A method for train traffic advising includi...,10,0,700
13364,9087301,9087301,"['G06N003', 'G06N003', 'G06N003', 'G06N003']","['G06N003', 'G06N003']","['G06N003', 'G06N003', 'G06N003', 'G06N003', '...","1. A neural network system, comprising: a memo...",11,0,1933


In [8]:
# Original patent collection, keyed by patent number for direct lookups.
org_data_file = os.path.join(data_dir, "collection_[uspto_AI][2006-2015].csv")
org_data = pd.read_csv(org_data_file).set_index("patent_number")

In [9]:
org_data

Unnamed: 0_level_0,patent_number.1,forward_refs,application_year_forward_refs,citing_years,granted_date,granted_year,main_cpc,sub_cpc,main_ipc,sub_ipc,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6983309,6983309,10235008;8243902;8050917;9582461;8015607;76068...,2014;2014;2008;2008;2004;2004;2004;2008;2008;2...,['2019' '2019' '2021' '2021' '2009' '2009' '20...,2006-01-03,2006,G06Q10/107;H04L63/123;H04L63/12;G06Q20/3674;G0...,Y10S707/99945;Y10S707/99948;H04L2209/60;H04L51...,"[""G06F15/16 20060101 G06F015/16""]","[""G06F15/173 20060101 G06F015/173"", ""G06F17/00...",...,2,3,1,0,2,0,2,0,2,0
6981642,6981642,8985462;9418269;7303131;7976167;8646694;108957...,2019;2004;2006;2003;2007;2007;2011;2007;2008;2...,['2021' '2007' '2008' '2009' '2010' '2011' '20...,2006-01-03,2006,G06K7/10722;G06K7/10811;G02B7/285,,"[""G06K7/10 20060101 G06K007/10""]",,...,1,3,3,1,0,0,0,0,1,0
6982420,6982420,7838834,2008,['2010'],2006-01-03,2006,H01J37/26;G06V20/698,H01J2237/221,"[""G21K7/00 20060101 G21K007/00""]",,...,0,0,0,0,0,0,0,0,0,0
6983071,6983071,8675968,2009,['2014'],2006-01-03,2006,G06V30/155;G06V30/1444;G06V10/22;G06V10/273,G06V30/10,"[""G06K9/34 20060101 G06K009/34""]",,...,0,1,0,0,0,0,0,0,0,0
6982717,6982717,8111264;8421795,2006;2007;2007,['2012' '2013' '2013'],2006-01-03,2006,G06T5/002,A63F2300/8017;A63F2300/66,"[""G06T15/00 20060101 G06T015/00""]",,...,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9224259,9224259,10940388;9876894;9882981;10709989;10803041;959...,2015;2017;2016;2018;2019;2018;2017;2019;2015;2...,['2018' '2018' '2019' '2020' '2020' '2020' '20...,2015-12-29,2015,A63F13/358;H04L67/10;G07F17/3241;G07F17/3234;G...,,"[""G07F17/32 20060101 G07F017/32""]",,...,0,0,0,1,2,4,1,3,2,0
9223616,9223616,10768959,2015,['2020'],2015-12-29,2015,G06F9/4856;G06F9/5077,,"[""G06F15/173 20060101 G06F015/173""]","[""G06F9/48 20060101 G06F009/48"", ""G06F9/50 200...",...,0,0,0,0,0,0,0,1,0,0
9223929,9223929,9708647;10106839;9499861;10501778;10174367;945...,2013;2016;2017;2011;2011;2015;2015,['2018' '2019' '2019' '2016' '2016' '2016' '20...,2015-12-29,2015,G16B5/20;G16B25/10;G16B25/00;G16B40/30;G16B40/...,G16B30/10,"[""G06F7/60 20060101 G06F007/60""]","[""G01N33/48 20060101 G01N033/48"", ""G01N31/00 2...",...,0,0,0,3,1,1,2,0,0,0
9223600,9223600,10341257;9973445;10057191;10110514;10084726;10...,2015;2015;2015;2015;2019;2015,['2018' '2018' '2018' '2019' '2020' '2018'],2015-12-29,2015,G06F9/45525;G06F9/45533,,"[""G06F9/455 20060101 G06F009/455""]",,...,0,0,0,0,0,4,1,1,0,0


# Inference

## Technology domain shifts during the landscape exploration

In [10]:
target_pns = np.array(["6983071", "6983272", "9223967", "9224096"])

In [11]:
tech_dataset.data.loc[target_pns]

Unnamed: 0_level_0,patent_number,main_class,sub_class,patent_classes,claims,TC5,TC5_digitized,class
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6983071,6983071,"[G06V030, G06V030, G06V010, G06V010]",[G06V030],"[G06V030, G06V030, G06V010, G06V010, G06V030]",1. A character segmentation device for removin...,0,0,1958
6983272,6983272,"[G06Q030, G06Q030, G06Q030, G06Q030, G06Q050, ...","[Y10S707, Y10S707, Y10S707, Y10S707, Y10S707]","[G06Q030, G06Q030, G06Q030, G06Q030, G06Q050, ...",1. A method of generating a result list in res...,38,1,1941
9223967,9223967,"[G06F021, G06F009, G06F009, G06F009, G06F021, ...","[G06F011, G06F011]","[G06F021, G06F009, G06F009, G06F009, G06F021, ...",1. A microprocessor comprising: at least one i...,7,0,1919
9224096,9224096,"[G06N003, H04W004, H04W004, G06F021, G06Q010, ...","[G08B013, H04W004]","[G06N003, H04W004, H04W004, G06F021, G06Q010, ...",1. A method for a device to self-assess a stat...,22,1,1933


In [12]:
set(tech_dataset.data.loc["6983071"]["patent_classes"])

{'G06V010', 'G06V030'}

In [13]:
tech_dataset.rawdata[tech_dataset.rawdata["patent_number"]==6983071]

Unnamed: 0,patent_number,patent_number.1,forward_refs,application_year_forward_refs,citing_years,granted_date,granted_year,main_cpc,sub_cpc,main_ipc,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3,6983071,6983071,8675968,2009,['2014'],2006-01-03,2006,G06V30/155;G06V30/1444;G06V10/22;G06V10/273,G06V30/10,"[""G06K9/34 20060101 G06K009/34""]",...,0,1,0,0,0,0,0,0,0,0


In [14]:
tech_dataset.data.index.get_indexer(pd.Index(target_pns))

array([     2,     91, 133314, 133218])

## Preparing validation analysis

In [15]:
col_years = ["<1976"] + np.arange(1976,2022).astype(str).tolist()
latest_year = 2022
n_TC = configs.data.n_TC

In [16]:
# Reference collection covers a longer period than the analysed one, so that patents
# citing the analysed patents can be looked up.
# target_period is iterated as a sequence of years when the period string is built for
# the main configs, so store it in that form rather than as the bracketed string.
# NOTE(review): assumes TechDataset expects the same [start, end] form — confirm.
ref_target_period = [2006, 2020]
ref_config_period = "["+"-".join([str(year) for year in ref_target_period])+"]"
ref_data_file = "collection_" + "".join([config_keywords, config_area, ref_config_period, config_sampling_ratio]) + ".csv"
ref_configs = copy.deepcopy(configs)
ref_configs.data.update({"target_period": ref_target_period, "data_file": ref_data_file})

In [17]:
load_ref = False

In [18]:
ref_dataset_path = os.path.join(root_dir, "data", "[DATASET]reference_[uspto_AI][2006-2020].pickle")

In [19]:
if load_ref:
    # Load the cached *reference* dataset. This used to open dataset_path, i.e. the
    # main dataset, so the "reference" data were silently the analysed data.
    # NOTE: unpickling executes arbitrary code; only load files created by this project.
    with open(ref_dataset_path, "rb") as f:
        ref_dataset = pickle.load(f)
    if ref_dataset.rawdata is None:
        ref_dataset.rawdata = pd.read_csv(os.path.join(ref_configs.data.data_dir, ref_configs.data.data_file), low_memory=False)
else:
    # Build the reference dataset and cache it without the raw table, which is
    # re-read from the CSV afterwards.
    ref_dataset = TechDataset(ref_configs.data)
    with open(ref_dataset_path, "wb") as f:
        ref_dataset.rawdata = None
        pickle.dump(ref_dataset, f)
    ref_dataset.rawdata = pd.read_csv(os.path.join(ref_configs.data.data_dir, ref_configs.data.data_file), low_memory=False)




Tokenizer is trained and saved


In [20]:
ref_dataset.rawdata = pd.read_csv(os.path.join(ref_configs.data.data_dir, ref_configs.data.data_file), low_memory=False)

In [21]:
ref_dataset

<data.TechDataset at 0x7fcec1f03370>

In [22]:
# Raw table of the analysed collection, keyed by patent number.
used_rawdata = tech_dataset.rawdata.set_index("patent_number")

# Processed samples of both collections; a patent present in both is kept once
# (first occurrence, i.e. the analysed collection).
total_data = pd.concat([tech_dataset.data, ref_dataset.data], axis=0).drop_duplicates(subset=["patent_number"])

# Same for the raw tables; "patent_number.1" is the duplicate id column that remains
# after patent_number became the index.
ref_rawdata_indexed = ref_dataset.rawdata.set_index("patent_number")
total_rawdata = pd.concat([used_rawdata, ref_rawdata_indexed], axis=0).drop_duplicates(subset=["patent_number.1"])

In [23]:
# Test patents whose TC5 value is non-zero, and their row positions in tech_dataset.data.
used_test_data_TC = used_test_data[used_test_data["TC5"]!=0].reset_index()
used_test_index_TC = tech_dataset.data.index.get_indexer(pd.Index(used_test_data_TC["patent_number"].astype(str)))
# get_indexer marks labels it cannot find with -1; used as a position, -1 would
# silently select the LAST sample, so fail loudly instead.
n_unmatched_test_patents = int((used_test_index_TC == -1).sum())
if n_unmatched_test_patents > 0:
    raise ValueError(f"{n_unmatched_test_patents} test patents are missing from tech_dataset.data")

## Micro-validation analysis

### Functions that identify patent sets for micro-validation analysis

In [None]:
def prepare_input_data(tech_dataset, used_test_index_TC, idx, model):
    """Build a single-sample model input for the idx-th selected test patent.

    Args:
        tech_dataset: dataset object providing ``tokenizers``, ``X_class``, ``X_claim``
            and ``tokenize``.
        used_test_index_TC: positions of the selected test patents in the dataset arrays.
        idx: position within ``used_test_index_TC``.
        model: model whose ``device`` attribute receives the inputs.

    Returns:
        (input_inf, input_class): the batch dict ``{"class", "claim"}`` moved to the
        model device, and the encoded class tensor with a leading batch dimension of 1
        as created here (not moved).
    """
    class_tokenizer = tech_dataset.tokenizers["class_enc"]
    claim_tokenizer = tech_dataset.tokenizers["claim_enc"]
    class_sample = tech_dataset.X_class[used_test_index_TC][idx]
    claim_sample = tech_dataset.X_claim[used_test_index_TC][idx]

    # Encode both inputs and add a leading batch dimension of size 1.
    input_class = torch.tensor(class_tokenizer.encode(class_sample)).unsqueeze(0)
    input_claim = tech_dataset.tokenize(claim_tokenizer, claim_sample)
    input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}

    # The batch gets its own copy of the class tensor. clone().detach() is the documented
    # replacement for torch.tensor(<tensor>), which copy-constructs with a UserWarning.
    batch_input = {"class": input_class.clone().detach(), "claim": input_claim}
    input_inf = to_device(batch_input, model.device)

    return input_inf, input_class

def encode_and_predict(model, input_inf):
    """Run the encoder on ``input_inf`` and the predictor on the resulting latent.

    Returns:
        (enc_outputs, z, mu, logvar, pred_outputs): the four values returned by
        ``model.encode`` followed by ``model.predict(z)``.
    """
    encoded = model.encode(input_inf)
    enc_outputs, z, mu, logvar = encoded
    return enc_outputs, z, mu, logvar, model.predict(z)

def analyze_forward_references(used_test_data_TC, used_rawdata, total_data, idx, n_TC):
    """Look up the patents that cite the idx-th test patent.

    Args:
        used_test_data_TC: test-set table with ``patent_number`` and ``TC5`` columns.
        used_rawdata: raw table indexed by patent number, with a ``forward_refs`` column
            holding ";"-separated patent numbers.
        total_data: processed samples indexed by patent number, with ``patent_classes``
            and ``TC<n_TC>`` columns.
        idx: row position in ``used_test_data_TC``.
        n_TC: suffix of the citation-count column to read (``"TC" + str(n_TC)``).

    Returns:
        ``(True, ref_classes, ref_FCs)`` where ``ref_classes`` is a Series of class sets
        and ``ref_FCs`` the citation counts of the citing patents found in
        ``total_data``; ``(False, None, None)`` when the patent has no citations
        (``TC5 <= 0``), no recorded forward references, or none of them is in
        ``total_data``.
    """
    target = used_test_data_TC.iloc[idx]
    if target["TC5"] <= 0:
        return False, None, None

    forward_refs = used_rawdata.loc[target["patent_number"]]["forward_refs"]
    # A missing value is read from CSV as NaN (a float); treat it as "no references"
    # instead of failing on .split().
    if not isinstance(forward_refs, str):
        return False, None, None

    known_refs = [ref for ref in forward_refs.split(";") if ref in total_data.index]
    ref_info = total_data.loc[known_refs]
    if len(ref_info) == 0:
        return False, None, None

    # Classification codes (as sets) and citation counts of the citing patents.
    ref_ipcs = ref_info["patent_classes"].apply(lambda x: set(x))
    ref_FCs = ref_info["TC" + str(n_TC)]

    return True, ref_ipcs, ref_FCs

def decode_original_text(tech_dataset, input_class):
    """Decode the encoded class input back into its list of class tokens.

    Only the tokens strictly between the first start-of-sequence token and the first
    end-of-sequence token are returned; both markers must be present.
    """
    class_tokenizer = tech_dataset.tokenizers["class_dec"]
    token_ids = input_class.cpu().detach().numpy()
    decoded = class_tokenizer.decode_batch(token_ids)[0]

    first = decoded.index(class_tokenizer.sos_token) + 1
    last = decoded.index(class_tokenizer.eos_token)
    return decoded[first:last]

def check_same_ipcs(org_text, ref_ipcs):
    """Return True when the original classes equal the union of all citing patents' classes.

    Args:
        org_text: iterable of class codes of the original patent.
        ref_ipcs: Series (or other iterable) of class-code collections, one per citing patent.
    """
    # A plain set union avoids NumPy concatenation, which raises on an empty series and
    # promotes dtypes when one of the class sets is empty.
    cited_classes = set()
    for classes in ref_ipcs:
        cited_classes.update(classes)
    return set(org_text) == cited_classes

def optimize_latent_space(model, z, enc_outputs, tech_dataset, L1_threshold, n_iter, step_size):
    """Move the latent vector along the predictor's gradient and decode the result.

    Each iteration evaluates the predictor at the current latent ``z``, takes one
    gradient step of size ``step_size`` that increases ``exp(pred_outputs[0, 1])``,
    and decodes the stepped latent into class tokens. The search stops as soon as the
    estimate at the current latent reaches ``L1_threshold`` and the decoded text is
    non-empty.

    Args:
        model: object providing ``predict(z)`` and ``decode(z, enc_outputs, dec_inputs=None)``.
        z: starting latent tensor; must take part in autograd.
        enc_outputs: encoder outputs passed through to ``model.decode``.
        tech_dataset: dataset providing ``tokenizers["class_dec"]``.
        L1_threshold: value the estimate must reach for the search to succeed.
        n_iter: maximum number of gradient steps.
        step_size: multiplier applied to the gradient.

    Returns:
        (optimized, gen_text, FC_estimated, z): success flag; the last decoded classes
        (a set when non-empty, an empty list when nothing was decoded, None when
        ``n_iter`` is 0); the last estimate; and the latent the last estimate was
        computed from.
    """
    tokenizer = tech_dataset.tokenizers["class_dec"]
    optimized = False
    gen_text = None
    FC_estimated = 0

    for _ in range(n_iter):
        pred_outputs = model.predict(z)
        z.retain_grad()  # z may be a non-leaf tensor; keep its gradient after backward
        # exp(...) is applied to output [0, 1] — presumably a log-scale estimate of
        # forward citations (TODO confirm against the predictor head).
        FC_estimated = np.round(np.exp(pred_outputs[0, 1].item()), 4)

        # Minimising (1 - exp(pred)) pushes the estimate upwards.
        L1_error = (1 - torch.exp(pred_outputs[0, 1]))
        L1_error.backward(retain_graph=True)

        z_ = z - step_size * z.grad
        z.grad.zero_()

        # Decode the stepped latent into class tokens (greedy argmax over the vocabulary).
        dec_outputs = model.decode(z_, enc_outputs, dec_inputs=None).argmax(-1)
        gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
        if tokenizer.eos_token in gen_text:
            gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:gen_text.index(tokenizer.eos_token)]
        else:
            gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:]

        if gen_text != []:
            # Only the distinct classes matter downstream; the former order-preserving
            # de-duplication was discarded by this conversion anyway.
            gen_text = set(gen_text)
            # NOTE(review): FC_estimated belongs to z, while gen_text is decoded from the
            # stepped latent z_ — kept as in the original analysis.
            if FC_estimated >= L1_threshold:
                optimized = True
                break

        # Always advance. Previously an empty decode skipped this update, so every
        # remaining iteration re-evaluated the same latent and could never recover.
        z = z_

    return optimized, gen_text, FC_estimated, z

def breakdown(ipcs):
    """Split class codes into hierarchy prefixes.

    Returns a 4-tuple: first character of every code, first three characters, first
    four characters, and the input collection itself (full codes).
    """
    sections, classes, subclasses = [], [], []
    for code in ipcs:
        sections.append(code[0])
        classes.append(code[:3])
        subclasses.append(code[:4])
    return (sections, classes, subclasses, ipcs)

def validate_generated_ipc(gen_text, ref_ipcs, ref_FCs):
    """Compare generated classes with the classes of the citing patents.

    The comparison is made at the four levels returned by ``breakdown``. At each level a
    citing patent is "similar" when its set of (prefixes of) classes equals the
    generated set.

    Args:
        gen_text: collection of generated class codes.
        ref_ipcs: Series of class-code sets, one per citing patent.
        ref_FCs: Series of citation counts with the same index labels as ``ref_ipcs``.

    Returns:
        (inclusions, higher_impacts, similar_refs, unsimilar_refs), each a list with
        one entry per level:
          * inclusions: 1 if at least one citing patent is similar, else 0;
          * higher_impacts: None without similar patents; with only similar patents,
            1 if their mean count is positive else 0; otherwise 0 when the similar mean
            is below the dissimilar mean, else 1 if the similar mean is positive and
            None if it is not;
          * similar_refs / unsimilar_refs: index labels of the two groups.
    """
    n_levels = 4
    inclusions = [None] * n_levels
    higher_impacts = [None] * n_levels
    similar_refs = [None] * n_levels
    unsimilar_refs = [None] * n_levels

    gen_levels = breakdown(gen_text)
    # Break every reference down once; each level reuses the result.
    ref_levels = [breakdown(classes) for classes in ref_ipcs]
    ref_index = ref_ipcs.index

    for level in range(n_levels):
        gen_set = set(gen_levels[level])
        hit_mask = np.array([set(parts[level]) == gen_set for parts in ref_levels], dtype=bool)
        similar = ref_index[hit_mask]
        unsimilar = ref_index[~hit_mask]
        similar_refs[level] = similar
        unsimilar_refs[level] = unsimilar

        # No citing patent matches the generated classes at this level.
        if len(similar) == 0:
            inclusions[level] = 0
            continue

        inclusions[level] = 1
        similar_mean_FC = np.mean(ref_FCs.loc[similar])
        if len(unsimilar) == 0:
            # Every citing patent matches: judge by the sign of the mean only.
            higher_impacts[level] = 1 if similar_mean_FC > 0 else 0
        elif similar_mean_FC >= np.mean(ref_FCs.loc[unsimilar]):
            # Both means being zero is left undecided (None).
            higher_impacts[level] = 1 if similar_mean_FC > 0 else None
        else:
            higher_impacts[level] = 0

    return inclusions, higher_impacts, similar_refs, unsimilar_refs

def validate_reliability(model=None, idx=None, L1_threshold=0.5, n_iter=30, step_size=40):
    """Run the micro-validation pipeline for the idx-th selected test patent.

    Reads the notebook-level objects ``tech_dataset``, ``used_test_index_TC``,
    ``used_test_data_TC``, ``used_rawdata``, ``total_data`` and ``n_TC``.

    Returns:
        (counts, results): ``counts`` is the tuple
        ``(cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs)`` holding
        0/1 flags for this sample. ``results`` is a dict describing the generated
        classes and their comparison with the citing patents, or None when the sample
        has no usable citing patents or the latent search did not reach the threshold.
    """
    # Encode the sample and obtain its latent representation.
    input_inf, input_class = prepare_input_data(tech_dataset, used_test_index_TC, idx, model)
    enc_outputs, z, mu, logvar, pred_outputs = encode_and_predict(model, input_inf)

    # Citing patents known to the combined dataset.
    ref_exists, ref_ipcs, ref_FCs = analyze_forward_references(used_test_data_TC, used_rawdata, total_data, idx, n_TC)
    if not ref_exists:
        # Distinguish "never cited" from "cited, but no citing patent in the data".
        never_cited = used_test_data_TC.iloc[idx]["TC5"] <= 0
        if never_cited:
            return (0, 1, 0, 0, 0), None
        return (1, 0, 0, 0, 0), None

    # Original classes, and whether the citing patents use exactly the same classes.
    org_text = decode_original_text(tech_dataset, input_class)
    same_flag = 1 if check_same_ipcs(org_text, ref_ipcs) else 0

    # Search the latent space for classes with a sufficiently high estimate.
    optimized, gen_text, FC_estimated, z = optimize_latent_space(model, z, enc_outputs, tech_dataset, L1_threshold, n_iter, step_size)
    if not optimized:
        return (0, 0, 1, same_flag, 0), None

    # Compare the generated classes with those of the citing patents.
    inclusions, higher_impacts, similar_refs, unsimilar_refs = validate_generated_ipc(gen_text, ref_ipcs, ref_FCs)

    results = dict(
        index=idx,
        patent_id=used_test_data_TC.iloc[idx]["patent_number"],
        org_text=org_text,
        gen_text=gen_text,
        ref_ipcs=ref_ipcs,
        ref_FCs=ref_FCs,
        inclusions=inclusions,
        higher_impacts=higher_impacts,
        FC_estimated=FC_estimated,
        similar_refs=similar_refs,
        unsimilar_refs=unsimilar_refs,
    )

    # The last flag is set for every sample that reaches this point.
    return (0, 0, 0, same_flag, 1), results


### Identify sets of patents and patent classes

- Set hyperparameters for validation analysis (e.g., L1 threshold)

In [32]:
load_dict_out = True
save_dict_out = False
L1_threshold = 0.8
n_iter = 30
step_size = 40
analysis_config = analysis_date + "_thre" + str(L1_threshold)
print(analysis_config)

2025-06-07_1732_thre0.8


- Compute the results of analysis

In [None]:
if load_dict_out:
    # NOTE: unpickling executes arbitrary code; only load result files created by this project.
    with open("../results/validation/"+analysis_config+"/dict_out.pickle", "rb") as f:
        dict_out = pickle.load(f)
    print("dict_out loaded")
else:
    # Use the wrapped module when the model is wrapped, the model itself otherwise.
    # The former CUDA-availability test failed with AttributeError on a GPU machine
    # whenever the model had no .module attribute.
    model = getattr(final_model, "module", final_model)
    cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_pcs, cnt_diff_pcs = 0, 0, 0, 0, 0
    dict_out = {"index": [], "patent_id": [], "org_text": [], "gen_text": [], "ref_pcs": [], "ref_FCs": [],
                "inclusions": [], "higher_impacts": [], "FC_estimated": [], "similar_refs": [], "unsimilar_refs": []}
    # validate_reliability reports the citing patents' classes under "ref_ipcs", while
    # dict_out stores them under "ref_pcs"; without this mapping the append below
    # raised KeyError on the first valid sample.
    result_key_alias = {"ref_ipcs": "ref_pcs"}
    for idx in tqdm(range(len(used_test_index_TC))):
        cnts, results = validate_reliability(model=model, idx=idx, L1_threshold=L1_threshold, n_iter=n_iter, step_size=step_size)
        cnt_nonexist += cnts[0]
        cnt_noFC += cnts[1]
        cnt_diverge += cnts[2]
        cnt_same_pcs += cnts[3]
        cnt_diff_pcs += cnts[4]
        if results is not None:
            for k,v in results.items():
                dict_out[result_key_alias.get(k, k)].append(v)
    # NOTE(review): several entries are ragged (lists / Series of different lengths);
    # np.array on such input raises on NumPy >= 1.24 unless dtype=object is used — confirm
    # the NumPy version this analysis is meant to run on.
    for k, v in dict_out.items():
        dict_out[k] = np.array(v)
    dict_out["cnts"] = {"cnt_nonexist": cnt_nonexist, "cnt_noFC": cnt_noFC, 
                "cnt_diverge": cnt_diverge, "cnt_same_pcs": cnt_same_pcs, "cnt_diff_pcs": cnt_diff_pcs}
    
    if save_dict_out:
        save_dir = "../results/validation/"+analysis_config
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        with open(save_dir+"/dict_out.pickle", "wb") as f:
            pickle.dump(dict_out, f)

dict_out loaded


In [None]:
print("#valid data:",len(dict_out["index"]))
print("#nonexist data:",dict_out["cnts"]["cnt_nonexist"])
print("#no forward citations:",dict_out["cnts"]["cnt_noFC"])
print("#diverged:",dict_out["cnts"]["cnt_diverge"])
print("#same pcs:",dict_out["cnts"]["cnt_same_pcs"])

#valid data: 6929
#nonexist data: 2983
#no forward citations: 0
#diverged: 8
#same ipcs: 618


### Compare distribution of forward citations between P_citing/remained and P_citing/shifted

In [35]:
from scipy.stats import ttest_ind

# For every "hit" sample, compare the forward citations (values of ref_FCs) of
# the references listed in similar_refs against those listed in unsimilar_refs,
# and compare TC5 of all patents sharing the generated vs. the original classes.
whole_patent_classes = tech_dataset.data["patent_classes"].apply(lambda x: set(x))

hit_similar_FCs, hit_unsimilar_FCs, hit_diff_FCs = [], [], []
hit_similar_FCs_mean, hit_unsimilar_FCs_mean = [], []
hit_similar_FCs_rank = []
whole_FC_ttest = {"statistic": [], "pvalue": []}
whole_FCs_diff = []

# A sample counts as a hit when the last column of its `inclusions` row is 1.
hit_samples_index = dict_out["inclusions"][:,-1]==1

# Apply the boolean mask once per array instead of once per loop iteration.
hit_patent_ids = dict_out["patent_id"][hit_samples_index]
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]
hit_unsimilar_refs = dict_out["unsimilar_refs"][hit_samples_index][:,-1]
hit_ref_FCs = dict_out["ref_FCs"][hit_samples_index]
hit_org_texts = dict_out["org_text"][hit_samples_index]
hit_gen_texts = dict_out["gen_text"][hit_samples_index]

for i in range(len(hit_patent_ids)):
    # Keep one forward-citation value per reference id.
    hit_FCs = hit_ref_FCs[i]
    hit_FCs = hit_FCs.loc[~hit_FCs.index.duplicated(keep="first")]

    similar_ids = hit_similar_refs[i].drop_duplicates()
    hit_similar_FC = hit_FCs.loc[similar_ids]
    # Percentile rank is taken among all references of this sample.
    hit_similar_FC_rank = hit_FCs.rank(pct=True).loc[similar_ids]
    hit_unsimilar_FC = hit_FCs.loc[hit_unsimilar_refs[i].drop_duplicates()]

    # With no unsimilar reference, its mean is recorded as 0, so the difference
    # reduces to the mean of the similar references.
    similar_mean = hit_similar_FC.mean()
    unsimilar_mean = hit_unsimilar_FC.mean() if len(hit_unsimilar_FC)>0 else 0
    hit_diff_FC = similar_mean - unsimilar_mean

    # TC5 of every patent in the dataset whose class set equals the original /
    # generated class set of this sample.
    org_classes = set(hit_org_texts[i])
    gen_classes = set(hit_gen_texts[i])
    org_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==org_classes].index]["TC5"]
    gen_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==gen_classes].index]["TC5"]
    if len(org_whole_FC)>0:
        whole_FC_diff = gen_whole_FC.mean() - org_whole_FC.mean()
    else:
        whole_FC_diff = gen_whole_FC.mean()

    hit_similar_FCs.append(hit_similar_FC)
    hit_similar_FCs_rank.append(hit_similar_FC_rank)
    hit_similar_FCs_mean.append(similar_mean)
    hit_unsimilar_FCs.append(hit_unsimilar_FC)
    hit_unsimilar_FCs_mean.append(unsimilar_mean)
    hit_diff_FCs.append(hit_diff_FC)

    # Welch's t-test (unequal variances) between the two TC5 samples; it yields
    # NaN (with runtime warnings) when either sample is too small.
    ttest_res = ttest_ind(gen_whole_FC, org_whole_FC, equal_var=False)
    whole_FC_ttest["statistic"].append(ttest_res.statistic)
    whole_FC_ttest["pvalue"].append(ttest_res.pvalue)

    # The dataset-level difference is only recorded when the classes changed.
    if org_classes != gen_classes:
        whole_FCs_diff.append(whole_FC_diff)

hit_similar_FCs = np.concatenate(hit_similar_FCs)
hit_similar_FCs_rank = np.concatenate(hit_similar_FCs_rank)
hit_similar_FCs_mean = np.array(hit_similar_FCs_mean)
hit_unsimilar_FCs = np.concatenate(hit_unsimilar_FCs)
hit_unsimilar_FCs_mean = np.array(hit_unsimilar_FCs_mean)
hit_diff_FCs = np.array(hit_diff_FCs)

print("Distribution of forward citations (hit)\n",pd.Series(hit_similar_FCs).describe(),"\n")
print("Distribution of forward citations (not hit)\n",pd.Series(hit_unsimilar_FCs).describe(),"\n")
print("Distribution of mean forward citations (hit)\n",pd.Series(hit_similar_FCs_mean).describe(),"\n")
print("Distribution of mean forward citations (not hit)\n",pd.Series(hit_unsimilar_FCs_mean).describe(),"\n")
print("Distribution of difference of forward citations\n",pd.Series(hit_diff_FCs).describe(),"\n")

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Distribution of forward citations (hit)
 count    730.000000
mean      23.068493
std       53.970380
min        0.000000
25%        1.000000
50%        3.500000
75%       14.000000
max      421.000000
dtype: float64 

Distribution of forward citations (not hit)
 count    3546.000000
mean       22.330231
std        45.144595
min         0.000000
25%         1.000000
50%         4.000000
75%        18.000000
max       481.000000
dtype: float64 

Distribution of mean forward citations (hit)
 count    276.000000
mean      11.163587
std       24.624972
min        0.000000
25%        0.500000
50%        3.000000
75%        9.625000
max      164.000000
dtype: float64 

Distribution of mean forward citations (not hit)
 count    276.000000
mean       9.878425
std       19.964201
min        0.000000
25%        0.350000
50%        3.101010
75%       10.754808
max      137.148148
dtype: float64 

Distribution of difference of forward citations
 count    276.000000
mean       1.285162
std       17.

### Identify breakthrough inventions

In [None]:
# Build df_val: one row per (hit sample, similar forward reference) pair.
# The accumulators below are kept from the original cell; this cell does not use them.
cnt_same, cnt_diff = 0, 0
ranks_same, ranks_diff = [], []
value_same, value_diff = [], []
org_patents_same, org_patents_diff = [], []
cols_val = ["patent_id", "org_pcs", "org_FC", "gen_pcs", "is_same", 
            "forward_ref", "ref_pcs", "ref_FC", "ref_FC_rank"]

# Mask every dict_out array once with the hit mask (hit_samples_index comes from
# the forward-citation comparison cell).
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]
hit_patent_ids = dict_out["patent_id"][hit_samples_index]
hit_ref_pcs = dict_out["ref_pcs"][hit_samples_index]
hit_ref_FCs = dict_out["ref_FCs"][hit_samples_index]
hit_org_texts = dict_out["org_text"][hit_samples_index]
hit_gen_texts = dict_out["gen_text"][hit_samples_index]

# Rows are collected in a list and turned into a DataFrame once; concatenating
# inside the loop copies the whole frame on every iteration.
val_records = []

for i in tqdm(range(len(hit_ref_pcs))):
    pid = str(hit_patent_ids[i])
    org_FC = tech_dataset.data.loc[pid]["TC5"]
    orgs = set(hit_org_texts[i])
    gens = set(hit_gen_texts[i])
    # "Same" means the generated classes add nothing new to the original ones
    # (equal sets, or generated is a subset of original).
    is_same = 1 if gens.issubset(orgs) else 0

    # Keep one forward-citation value per reference id and rank once per sample.
    hit_FCs = hit_ref_FCs[i]
    hit_FCs = hit_FCs.loc[~hit_FCs.index.duplicated(keep="first")]
    hit_FC_ranks = hit_FCs.rank(pct=True)
    sample_ref_pcs = hit_ref_pcs[i]

    for ref in hit_similar_refs[i].drop_duplicates():
        # A duplicated reference id makes .loc return a Series; take the first
        # entry by position (.iloc) rather than the deprecated Series[0].
        ref_pcs = sample_ref_pcs.loc[ref]
        if isinstance(ref_pcs, pd.Series): ref_pcs = ref_pcs.iloc[0]
        hit_similar_FC = hit_FCs.loc[ref]
        if isinstance(hit_similar_FC, pd.Series): hit_similar_FC = hit_similar_FC.iloc[0]
        hit_similar_FC_rank = hit_FC_ranks.loc[ref]
        if isinstance(hit_similar_FC_rank, pd.Series): hit_similar_FC_rank = hit_similar_FC_rank.iloc[0]

        val_records.append([pid, orgs, org_FC, gens, is_same, ref, ref_pcs, hit_similar_FC, hit_similar_FC_rank])

df_val = pd.DataFrame(val_records, columns=cols_val)
df_val = df_val.set_index("patent_id")

  0%|          | 0/276 [00:00<?, ?it/s]

In [37]:
def _format_FC_with_rank(row):
    """Render a row's reference citation count as '<ref_FC> (<rank rounded to 2 decimals>)'."""
    rounded_rank = np.round(row["ref_FC_rank"], 2)
    return str(row["ref_FC"]) + " (" + str(rounded_rank) + ")"

# Add a display-friendly column combining the citation count and its percentile rank.
df_val.loc[:,"ref_FC_new"] = df_val.apply(_format_FC_with_rank, axis=1)

In [38]:
# Display the validation table (one row per patent / similar forward reference pair).
df_val

Unnamed: 0_level_0,org_ipcs,org_FC,gen_ipcs,is_same,forward_ref,ref_ipcs,ref_FC,ref_FC_rank,ref_FC_new
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7542816,"{G11B027, G06F016, Y10S707}",18,{G06F016},1,9576050,{G06F016},6,0.821429,6 (0.82)
8355610,"{G02B027, G02B006}",30,{G02B027},1,9715110,{G02B027},18,0.857143,18 (0.86)
7010095,"{A61B090, A61B034}",5,"{A61B090, A61B034}",1,8165659,"{A61B090, A61B034}",6,1.000000,6 (1.0)
8977643,{G06F016},35,{G06F016},1,10169421,{G06F016},0,1.000000,0 (1.0)
7242681,{H04L063},10,"{G06F021, H04L063}",0,9875344,"{G06F021, H04L063}",20,0.800000,20 (0.8)
...,...,...,...,...,...,...,...,...,...
7366735,"{G06F016, Y10S707}",16,"{G06F016, Y10S707}",1,7493305,"{G06F016, Y10S707}",8,1.000000,8 (1.0)
7080090,"{G06F016, G06F021, Y10S707}",3,"{G06F016, G06F021}",1,10346429,"{G06F016, G06F021}",12,1.000000,12 (1.0)
9043922,{G06F021},16,"{G06F016, G06F021, H04L063}",0,9977920,"{G06F016, G06F021, H04L063}",1,0.583333,1 (0.58)
7475425,{H04L063},17,{H04L063},1,9356941,{H04L063},15,1.000000,15 (1.0)


In [39]:
# L1 criterion: the 0.9 quantile of TC5 over the whole dataset.
L1_criterion = tech_dataset.data["TC5"].quantile(0.9)

# Filter once instead of rebuilding the same sub-frames inside every print.
df_val_same = df_val[df_val["is_same"]==1]
df_val_diff = df_val[df_val["is_same"]==0]
df_val_diff_over = df_val_diff[df_val_diff["ref_FC"]>=L1_criterion]

print("total hit:", len(df_val))
print("same:",len(df_val_same))
print("diff:",len(df_val_diff))
print("over L1 criterion:", len(df_val_diff_over))
# Report NaN rather than raising ZeroDivisionError when no row has shifted classes.
print("ratio:",len(df_val_diff_over) / len(df_val_diff) if len(df_val_diff) > 0 else float("nan"))

total hit: 730
same: 669
diff: 61
over L1 criterion: 18
ratio: 0.29508196721311475


In [40]:
## Data samples where the patent classes of the citing patent are the same as the generated classes,
## differ from the original classes,
## and the number of forward citations is at least the L1 criterion of the entire dataset
shifted_rows = df_val[df_val["is_same"]==0]
over_criterion = shifted_rows["ref_FC"]>=L1_criterion
shifted_rows[over_criterion].drop(columns=["is_same", "ref_FC", "ref_FC_rank"])

Unnamed: 0_level_0,org_ipcs,org_FC,gen_ipcs,forward_ref,ref_ipcs,ref_FC_new
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7242681,{H04L063},10,"{G06F021, H04L063}",9875344,"{G06F021, H04L063}",20 (0.8)
8549579,"{H04L063, H04L051}",8,"{G06F021, H04L063}",9087215,"{G06F021, H04L063}",23 (1.0)
8307177,"{G06F016, G06F011, G06F009}",165,"{G06F016, G06F011, G06F009, G06F003}",9823977,"{G06F016, G06F011, G06F009, G06F003}",40 (0.92)
8307177,"{G06F016, G06F011, G06F009}",165,"{G06F016, G06F011, G06F009, G06F003}",10474542,"{G06F016, G06F011, G06F009, G06F003}",20 (0.78)
9089966,{B25J009},13,"{Y10S901, B25J009}",9486921,"{Y10S901, B25J009}",114 (1.0)
8832846,"{G06F008, G06F021}",14,"{G06F021, H04L063}",9536108,"{G06F021, H04L063}",71 (1.0)
7697758,"{G06K009, G06V030}",33,"{G06F016, G06V020, G06V010, G06V030}",8953886,"{G06F016, G06V020, G06V010, G06V030}",39 (1.0)
7024364,"{G10L015, H04M003}",10,"{G10L015, G06F003}",8589161,"{G10L015, G06F003}",33 (0.7)
7024364,"{G10L015, H04M003}",10,"{G10L015, G06F003}",9721566,"{G10L015, G06F003}",102 (0.87)
7127403,"{G10L015, H04M003}",6,"{G10L015, G06F003}",9721566,"{G10L015, G06F003}",102 (0.88)


In [41]:
# Export the shifted-class rows whose reference citation count is at least the
# L1 criterion, without the helper columns.
shifted_rows = df_val[df_val["is_same"]==0]
breakthrough_rows = shifted_rows[shifted_rows["ref_FC"]>=L1_criterion]
breakthrough_rows.drop(columns=["is_same", "ref_FC", "ref_FC_rank"]).to_csv("Breakthrough_examples.csv")

In [42]:
## Data samples where the patent classes of the citing patent are the same as the generated classes,
## differ from the original classes, and the number of forward citations is at least that of the original patent
shifted_rows = df_val[df_val["is_same"]==0]
# Compare the numeric ref_FC column directly instead of parsing the count back
# out of the formatted "ref_FC_new" string.
shifted_rows.apply(lambda row: row.name if int(row["org_FC"]) <= int(row["ref_FC"]) else np.nan, axis=1).dropna()

patent_id
7242681    7242681
8549579    8549579
8549579    8549579
9089966    9089966
8832846    8832846
8621259    8621259
7697758    7697758
7693889    7693889
7024364    7024364
7024364    7024364
7127403    7127403
7127403    7127403
7769756    7769756
7363549    7363549
7363549    7363549
7389229    7389229
dtype: object

In [None]:
# Inspect all df_val rows of a single, hard-coded example patent id.
df_val.loc["7697758"]

## Obtain descriptive statistics

- P_citing/remained

In [43]:
# Forward references of the test patents: each ";"-separated string becomes an array of ids.
test_patent_numbers = used_test_data_TC["patent_number"]
raw_forward_refs = used_rawdata.loc[test_patent_numbers]["forward_refs"]
forward_refs = raw_forward_refs.apply(lambda refs: np.array(refs.split(";")))

In [44]:
# Keep only the forward references that are themselves rows of total_data.
known_patents = total_data.index
valid_refs = forward_refs.apply(lambda refs: [ref for ref in refs if ref in known_patents])

In [45]:
# Drop the patents that have no usable forward reference left.
has_refs = valid_refs.apply(lambda refs: refs != [])
exist_refs = valid_refs[has_refs]

In [None]:
def _classes_of(patent_number):
    """Return the patent classes of one patent in total_data as a set."""
    return set(total_data.loc[patent_number]["patent_classes"])

# For each patent, the class set of every citing patent (same order as exist_refs).
ref_pcs = exist_refs.apply(lambda refs: [_classes_of(ref) for ref in refs])
ref_pcs.name = "ref_pc"

In [47]:
def _TC5_of(patent_number):
    """Return the TC5 value of one patent in total_data."""
    return total_data.loc[patent_number]["TC5"]

# For each patent, the TC5 of every citing patent (same order as exist_refs).
ref_FC = exist_refs.apply(lambda refs: [_TC5_of(ref) for ref in refs])
ref_FC.name = "TC5"

In [None]:
# Class set of each cited (original) patent, indexed like exist_refs.
cited_numbers = pd.Series(exist_refs.index, name="patent_number").astype(str)
org_pcs = cited_numbers.apply(lambda number: set(total_data.loc[number]["patent_classes"]))
org_pcs.name = "org_pc"
org_pcs.index = exist_refs.index

In [None]:
# Side-by-side table: original classes, citing-patent classes, citing-patent TC5.
compare_total = pd.concat(
    [org_pcs, ref_pcs, ref_FC],
    axis="columns",
)

In [50]:
# Display the combined table of original classes, citing-patent classes and citing-patent TC5.
compare_total

Unnamed: 0_level_0,org_ipc,ref_ipc,TC5
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8423759,"{H04L009, H04L063}","[{H04L009, H04L063, G06F021}, {H04L009, H04L06...","[2, 85, 2, 0, 2, 2, 4, 0, 0, 0, 5, 2, 0, 22, 3]"
8095229,"{G06Q010, G06Q050, Y02P090, G05B019, G16Z099}","[{G06Q010, G05B015, Y02P090, B33Y050, G05B019}...","[1, 10]"
8616308,"{Y10S901, B62D055}","[{B25J009, G05D003, G05D001}]",[1]
7313340,"{G03G015, H04N001}","[{H04N001}, {G06V010, G06V030}, {G06F016, G06F...","[0, 0, 12]"
7529392,{G06T007},[{G06T005}],[0]
...,...,...,...
7287033,"{G06F016, Y10S707}","[{G06F008, G06F016, Y10S707}, {G06F016, Y10S70...","[2, 7, 2, 0, 2]"
7577671,"{G06F016, Y10S707}","[{G06F016, G06Q010, G06Q030}]",[22]
7496557,"{G06F016, Y10S707}","[{G06F016, Y10S707}]",[1]
8838301,"{B61L029, B61L025}","[{H04N007, G06K009, G06V020, B61L023}, {B61L02...","[1, 3, 2]"


In [None]:
def _TC5_of_identical_refs(row):
    """Return the TC5 values of the citing patents whose class set equals the original's."""
    return [tc5 for classes, tc5 in zip(row["ref_pc"], row["TC5"]) if classes==row["org_pc"]]

ref_FC_identical = compare_total.apply(_TC5_of_identical_refs, axis=1)

In [52]:
# Keep only the patents with at least one citing patent of identical classes.
has_identical = ref_FC_identical.apply(lambda fcs: fcs != [])
valid_FC_identical = ref_FC_identical[has_identical]

In [53]:
# Mean TC5 over the identical-class citing patents of each patent.
avg_FC_identical = valid_FC_identical.apply(lambda fcs: np.asarray(fcs).mean())

In [54]:
# Summary statistics of the per-patent mean TC5 over citing patents with identical classes.
avg_FC_identical.describe()

count    1930.000000
mean        5.666213
std        13.403872
min         0.000000
25%         0.000000
50%         2.000000
75%         5.500000
max       160.700000
dtype: float64

- P_citing/shifted

In [55]:
# Summary statistics of the reference citation counts for rows whose classes shifted.
shifted_mask = df_val["is_same"]==0
df_val.loc[shifted_mask, "ref_FC"].astype(int).describe()

count     61.000000
mean      16.901639
std       30.767789
min        0.000000
25%        1.000000
50%        4.000000
75%       20.000000
max      153.000000
Name: ref_FC, dtype: float64

## Macro validation analysis


In [None]:
# Original and generated classes of every validated sample, indexed by patent id.
# NOTE(review): both Series are named "patent_number" although they hold classes,
# and `org_pcs` reuses a name bound earlier in the notebook — kept as-is.
org_patent_ids = dict_out["patent_id"]

org_pcs = pd.Series(dict_out["org_text"]).apply(lambda classes: set(classes))
org_pcs.index = org_patent_ids
org_pcs.name = "patent_number"

gen_pcs = pd.Series(dict_out["gen_text"])
gen_pcs.index = org_patent_ids
gen_pcs.name = "patent_number"

In [None]:
# Display the original class sets per patent id.
org_pcs

8423759                                   {H04L009, H04L063}
8095229        {G06Q010, G06Q050, Y02P090, G05B019, G16Z099}
8616308                                   {Y10S901, B62D055}
7313340                                   {G03G015, H04N001}
7529392                                            {G06T007}
                                 ...                        
7287033                                   {G06F016, Y10S707}
7577671                                   {G06F016, Y10S707}
7496557                                   {G06F016, Y10S707}
8838301                                   {B61L029, B61L025}
9158975    {G06V020, G06F016, G06V010, G08B013, G06V040, ...
Name: patent_number, Length: 6929, dtype: object

In [None]:
# Display the generated classes per patent id.
gen_pcs

8423759                 {G06Q020, H04L063, G05B019, G06Q010}
8095229    {G06Q010, G06Q050, G06F016, Y02P090, H04L012, ...
8616308                 {G06N003, G05D001, A61N001, A61M005}
7313340    {G06Q040, G06F016, G06F021, G06Q020, H04L067, ...
7529392                          {G06V040, G06F003, G06Q030}
                                 ...                        
7287033                                   {H04L065, G06F016}
7577671                                   {H04W004, G06F016}
7496557                                            {G06F016}
8838301        {B61L027, G06Q010, Y02P090, H04W004, G06Q030}
9158975                                   {G06V040, G06V020}
Name: patent_number, Length: 6929, dtype: object

In [None]:
# NOTE(review): only the first 10000 rows of total_data are considered here —
# confirm whether this cap is intended or a leftover from a quick run.
N_MACRO_ROWS = 10000

def _frozen_classes(row):
    """Return a row's patent classes as a hashable frozenset."""
    return frozenset(row["patent_classes"])

pcs_total_frozen = total_data.iloc[:N_MACRO_ROWS].apply(_frozen_classes, axis=1)

In [None]:
def _to_frozen(classes):
    """Return the given classes as a frozenset built from their sorted values."""
    return frozenset(np.sort(list(classes)))

org_pcs_frozen = org_pcs.apply(_to_frozen)
gen_pcs_frozen = gen_pcs.apply(_to_frozen)

In [None]:
# Display the frozenset version of the original classes.
org_pcs_frozen

8423759                                   (H04L009, H04L063)
8095229        (G06Q010, G06Q050, Y02P090, G05B019, G16Z099)
8616308                                   (Y10S901, B62D055)
7313340                                   (G03G015, H04N001)
7529392                                            (G06T007)
                                 ...                        
7287033                                   (G06F016, Y10S707)
7577671                                   (G06F016, Y10S707)
7496557                                   (G06F016, Y10S707)
8838301                                   (B61L025, B61L029)
9158975    (G06V020, G06F016, G06V010, G08B013, G06V040, ...
Name: patent_number, Length: 6929, dtype: object

In [None]:
# Mean TC5 of all patents in pcs_total_frozen sharing exactly the original /
# generated class set of each sample, keyed by patent id.
avg_TIs_org, avg_TIs_gen = {}, {}

# Many samples share a class set, so each distinct set is scanned only once.
_mean_TC5_by_pc = {}

def _mean_TC5(pc):
    """Return the mean TC5 of the patents whose frozen class set equals `pc`.

    The result is NaN when no patent in pcs_total_frozen has that class set.
    """
    if pc not in _mean_TC5_by_pc:
        same_pc_index = pcs_total_frozen[pcs_total_frozen==pc].index
        _mean_TC5_by_pc[pc] = np.mean(total_data.loc[same_pc_index]["TC5"])
    return _mean_TC5_by_pc[pc]

for i in tqdm(range(len(org_pcs_frozen))):
    pn = org_pcs_frozen.index[i]
    pc_org = org_pcs_frozen.iloc[i]
    pc_gen = gen_pcs_frozen.iloc[i]
    
    avg_TIs_org[pn] = _mean_TC5(pc_org)
    avg_TIs_gen[pn] = _mean_TC5(pc_gen)

  0%|          | 0/6929 [00:00<?, ?it/s]

In [None]:
def _as_column(values_by_patent):
    """Turn a {patent id: value} dict into a one-column frame indexed by patent id."""
    return pd.DataFrame(values_by_patent, index=[0]).T

# Per patent: original classes and their mean TC5 next to generated classes and theirs.
macro_parts = [org_pcs, _as_column(avg_TIs_org), gen_pcs, _as_column(avg_TIs_gen)]
res_macro = pd.concat(macro_parts, axis=1)
res_macro.columns = ["org_pc", "org_TI", "gen_pc", "gen_TI"]

In [64]:
# Display the macro comparison table; NaN in org_TI / gen_TI means no TC5 mean was available.
res_macro

Unnamed: 0,org_ipc,org_TI,gen_ipc,gen_TI
8423759,"{H04L009, H04L063}",35.500000,"{G06Q020, H04L063, G05B019, G06Q010}",
8095229,"{G06Q010, G06Q050, Y02P090, G05B019, G16Z099}",,"{G06Q010, G06Q050, G06F016, Y02P090, H04L012, ...",
8616308,"{Y10S901, B62D055}",,"{G06N003, G05D001, A61N001, A61M005}",
7313340,"{G03G015, H04N001}",2.000000,"{G06Q040, G06F016, G06F021, G06Q020, H04L067, ...",
7529392,{G06T007},4.695652,"{G06V040, G06F003, G06Q030}",
...,...,...,...,...
7287033,"{G06F016, Y10S707}",9.901048,"{H04L065, G06F016}",
7577671,"{G06F016, Y10S707}",9.901048,"{H04W004, G06F016}",
7496557,"{G06F016, Y10S707}",9.901048,{G06F016},10.1
8838301,"{B61L029, B61L025}",,"{B61L027, G06Q010, Y02P090, H04W004, G06Q030}",


In [65]:
# Keep only the rows where both the original and the generated mean TC5 are available.
valid_res_macro = res_macro.dropna(subset=["org_TI", "gen_TI"])

In [66]:
# Display the rows that have both an original and a generated mean TC5.
valid_res_macro

Unnamed: 0,org_ipc,org_TI,gen_ipc,gen_TI
8386498,"{H04L041, H04L063}",18.166667,"{G06F016, G06F021, H04L063}",9.00
7647471,{G06F012},4.847222,"{G06F016, G06F021}",14.00
7542816,"{G11B027, G06F016, Y10S707}",8.571429,{G06F016},10.10
7002462,{B66B027},3.000000,"{A61B005, A61N001}",9.25
8032383,{G10L015},6.515464,"{G10L015, G06F016}",6.00
...,...,...,...,...
8373775,{G06T005},3.640000,{G06F016},10.10
7475425,{H04L063},9.000000,{H04L063},9.00
7818341,{G06F016},10.100000,{G06F016},10.10
7627618,"{G06F016, Y10S707, G06F009}",3.428571,"{G06F016, G06F003}",42.00


In [67]:
# Average of the original-class mean TC5 over the valid rows.
valid_res_macro["org_TI"].mean()

10.33533590921993

In [68]:
# Average of the generated-class mean TC5 over the valid rows.
valid_res_macro["gen_TI"].mean()

13.485535757873558