In [22]:
root_dir = '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'
master_dir = '/home2/glee/dissertation/1_tech_gen_impact/master/Tech_Gen/'
import sys
sys.path.append(root_dir)

import uspto
import json
import copy
import gc
import os
import argparse
import math
import time
import pickle
import re
import multiprocess as mp
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
sys.path.append("/share/tml_package")
from tml import utils
from scipy import io
from tqdm import tqdm
from collections import OrderedDict

import torch
from torch.nn import functional as F
from torch.nn import DataParallel as DP
from torch.utils.data import TensorDataset, DataLoader, Subset, Dataset
from accelerate import Accelerator
import pytorch_model_summary

import optuna
from optuna.samplers import RandomSampler, TPESampler
from optuna.integration import SkoptSampler

import numpy as np
import pandas as pd
import scipy.stats
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from data import TechDataset, CVSampler
from models import Transformer, Predictor
from train_utils import EarlyStopping, perf_eval, objective_cv, build_model, train_model, validate_model_mp
from utils import token2class, DotDict, to_device

from cleantext.sklearn import CleanTransformer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Configuration

In [4]:
# analysis_date = "2023-05-04_1802" # Seminconductor
# analysis_date = "2023-05-09_0331" # AI
analysis_date = "2025-06-07_1732" # AI, CPC
args = argparse.Namespace(
    do_eval = True,
    do_save=False,
    config_file=os.path.join(root_dir, "configs", "USED_configs", "[CONFIGS]"+analysis_date+".json"),
    eval_train_set=False)

project_data_dir = os.path.join(master_dir, "data")
data_dir = os.path.join("/home2/glee/patent_data/data/")
model_dir = os.path.join(root_dir, "models")
result_dir = os.path.join(root_dir, "results")
config_dir = os.path.join(root_dir, "configs")

## parse configuration file
if args.config_file is not None:
    config_file = args.config_file
else:
    config_file = os.path.join(config_dir, "configs_light.json") if args.light else os.path.join(config_dir, "configs.json")
if args.do_eval: args.do_train = False
configs = DotDict().load(config_file)
org_config_keys = {key: list(configs[key].keys()) for key in configs.keys()}

# parse command line arguments
instant_configs = {key: value for (key, value) in vars(args).items() if value is not None} # if any argument passed when main.py executed
instant_configs_for_update = {configkey: {key: value for (key,value) in instant_configs.items() if key in org_config_keys[configkey]} for configkey in org_config_keys.keys()}
for key, value in configs.items():
    value.update(instant_configs_for_update[key])

## assign loss weights
if configs.model.model_type == "enc-pred-dec":
    configs.train.loss_weights["recon"] = configs.train.loss_weights["recon"] / sum(configs.train.loss_weights.values())
    configs.train.loss_weights["y"] = 1 - configs.train.loss_weights["recon"]
elif configs.model.model_type == "enc-pred":
    configs.train.loss_weights = {"recon": 0, "y": 1}
elif configs.model.model_type == "enc-dec":
    configs.train.loss_weights = {"recon": 1, "y": 0}

## assign devices
if configs.train.use_accelerator:
    accelerator = Accelerator()
    device_ids = list(range(torch.cuda.device_count()))
    device = accelerator.device
    configs.train.update({"accelerator": accelerator})
else:
    if torch.cuda.is_available():
        device_ids = list(range(torch.cuda.device_count()))
        gpu_usages = [np.sum([float(usage.split("uses")[-1].replace(" ","").replace("MB","")) for usage in torch.cuda.list_gpu_processes(id).split("GPU memory") if not usage=="" and "no processes are running" not in usage]) for id in device_ids]
        device_ids = np.argsort(gpu_usages)[:configs.train.n_gpus]
        device_ids = list(map(lambda x: torch.device('cuda', x),list(device_ids)))
        device = device_ids[0] # main device
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')
        device_ids = []

## extract configurations for dataset
config_period = "["+"-".join([str(year) for year in configs.data.target_period])+"]"
config_area = str(configs.data.target_area).replace("\'","").replace(" ","")
config_keywords = str(configs.data.target_keywords).replace("\'","").replace(" ","")
config_sampling_ratio = "["+str(configs.data.sampling_ratio)+"sampling"+"]" if configs.data.sampling_ratio < 1 else ""

## update configurations
configs.data.update({"root_dir": root_dir,
                        "data_dir": data_dir,
                        "model_dir": model_dir,
                        "result_dir": result_dir,
                        "pretrained_enc": configs.model.pretrained_enc,
                        "pretrained_dec": configs.model.pretrained_dec,
                        "data_nrows": None,
                        "data_file": "collection_" + "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + ".csv"})
configs.train.update({"device": device,
                        "device_ids": device_ids,
                        "root_dir": root_dir,
                        "data_dir": data_dir,
                        "model_dir": model_dir,
                        "use_keywords": configs.data.use_keywords,
                        "class_system": configs.data.class_system,
                        "curr_ep": 1,
                        "early_stop_patience": int(0.3*configs.train.max_epochs)})
configs.model.update({"device": device,
                        "device_ids": device_ids,
                        "n_directions": 2 if configs.model.bidirec else 1,
                        "use_accelerator": configs.train.use_accelerator,
                        "model_dir": model_dir})

## Set hyperparameters for model training (To be TUNED)
if configs.train.do_train and configs.train.do_tune:
    n_layers = configs.model.n_layers = None
    d_embedding = configs.model.d_embedding = None
    d_enc_hidden = configs.model.d_enc_hidden = None
    d_pred_hidden = configs.model.d_pred_hidden = None
    learning_rate = configs.train.learning_rate = None
    batch_size = configs.train.batch_size = None
    config_name = "HPARAM_TUNING"
    final_model_path = None
else:
    n_layers = configs.model.n_layers
    d_embedding = configs.model.d_embedding
    d_enc_hidden = configs.model.d_enc_hidden
    d_pred_hidden = configs.model.d_pred_hidden
    d_latent = configs.model.d_latent

    key_components = {"data": ["class_level", "class_system", "max_seq_len_class", "max_seq_len_claim", "vocab_size"], "model": ["n_layers", "d_hidden", "d_pred_hidden", "d_latent", "d_embedding", "d_ff", "n_head", "d_head"], "train": ["learning_rate", "batch_size", "max_epochs", "curr_ep"]}
    model_config_name_prefix = "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + "data"
    model_config_name = "" + model_config_name_prefix
    model_config_name += f"[{configs.data.class_system}]system"
    for key in ["model", "train"]:
        for component in key_components[key]:
            model_config_name += f"[{str(configs[key][component])}]{component}"
    final_model_path = os.path.join(model_dir, f"[MODEL]{model_config_name}.ckpt")

# configs.train.update({"model_config_name": model_config_name, "final_model_path": final_model_path})

# Dataset setting

In [5]:
''' PART 2: Dataset setting '''
tstart = time.time()
dataset_config_name = "".join([config_keywords, config_area, config_period, config_sampling_ratio]) + "data"
for component in key_components["data"]:
    dataset_config_name += f"[{str(configs.data[component])}]{component}"
dataset_path = os.path.join(project_data_dir, "pickled_dataset", "[DATASET]"+dataset_config_name+".pickle")

if os.path.exists(dataset_path) and args.do_save is False:
    print("Load pickled dataset...")
    with open(dataset_path, "rb") as f:
        tech_dataset = pickle.load(f)   # Load pickled dataset if dataset with same configuration already saved
        if tech_dataset.pretrained_enc != configs.data.pretrained_enc or tech_dataset.pretrained_dec != configs.data.pretrained_dec:
            tech_dataset.pretrained_enc = configs.data.pretrained_enc
            tech_dataset.pretrained_dec = configs.data.pretrained_dec
            tech_dataset.tokenizers = tech_dataset.get_tokenizers()
        for tk in tech_dataset.tokenizers.values():
            if "vocab_size" not in dir(tk):
                tk.vocab_size = tk.get_vocab_size()
        tech_dataset.use_keywords = configs.data.use_keywords
        ## load saved rawdata
        if tech_dataset.rawdata is None:
            tech_dataset.rawdata = pd.read_csv(os.path.join(data_dir, configs.data.data_file), low_memory=False)
    print("Pickled dataset loaded")
else:
    print("Make dataset...")
    if args.debug:
        configs.data.update({"data_nrows": 1000})
        dataset_path += ".debug"
    tech_dataset = TechDataset(configs.data)
    if not args.debug:
        rawdata_for_save = copy.deepcopy(tech_dataset.rawdata)
        with open(dataset_path, "wb") as f:
            tech_dataset.rawdata = None
            pickle.dump(tech_dataset, f)
        tech_dataset.rawdata = rawdata_for_save
tend = time.time()
# print(f"{np.round(tend-tstart,4)} sec elapsed for loading patents for class [{configs.data.target_area}]")

configs.model.update({"tokenizers": tech_dataset.tokenizers,
                    "n_enc_seq_claim": tech_dataset.max_seq_len_claim,
                    "n_dec_seq_claim": tech_dataset.max_seq_len_claim,
                    "n_enc_seq_class": tech_dataset.max_seq_len_class,
                    "n_dec_seq_class": tech_dataset.max_seq_len_class,
                    "n_outputs": 1 if configs.data.pred_type=="regression" else tech_dataset.n_outputs,
                    "i_padding": tech_dataset.tokenizers["class_enc"].pad_id})

Load pickled dataset...
Pickled dataset loaded


In [6]:
tech_dataset.__len__()

133654

# Load model

In [7]:
final_model_path

'/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/models/[MODEL][uspto_AI][2006-2015]data[CPC]system[2]n_layers[32]d_hidden[8]d_pred_hidden[32]d_latent[128]d_embedding[16]d_ff[2]n_head[16]d_head[0.0015]learning_rate[512]batch_size[30]max_epochs[1]curr_ep.ckpt'

In [8]:
final_model = build_model(configs.model, tokenizers=tech_dataset.tokenizers)
final_model_finder = final_model_path.split("[MODEL]")[-1].split("max_epochs")[0]+"max_epochs"
matched_ckpts = [f for f in os.listdir(model_dir) if final_model_finder in f]
latest_ckpt_index = np.argmax([int(f.split("curr_ep")[0].split("[")[-1].replace("]","")) for f in matched_ckpts])
final_model_path = os.path.join(model_dir, matched_ckpts[latest_ckpt_index])
if os.path.exists(final_model_path):
    best_states = torch.load(final_model_path, map_location=device)
else:
    raise Exception("Model need to be trained first")

has_module_prefix = any(k.startswith("module.") for k in best_states.keys())
if has_module_prefix:
    stripped = {}
    for k, v in best_states.items():
        new_key = k[len("module."):] if k.startswith("module.") else k
        stripped[new_key] = v
    best_states = stripped
final_model.load_state_dict(best_states)

del best_states
torch.cuda.empty_cache()
print("Model successfully loaded")

Model successfully loaded


In [9]:
global final_model

In [10]:
 if re.search("^1.", torch.__version__) is not None:
        model_size = sum(t.numel() for t in final_model.parameters())
        print(f"Model size: {model_size/1000**2:.1f}M paramaters")

Model size: 1.5M paramaters


In [11]:
result_path = os.path.join(root_dir, "results")

used_train_data = pd.read_excel(os.path.join(result_dir, "[DATASET]"+analysis_date+".xlsx"), sheet_name="TRAIN_dataset")
train_idx = tech_dataset.data.index.astype(int).get_indexer(pd.Index(used_train_data["patent_number"]))
train_dataset = Subset(tech_dataset, train_idx)

used_test_data = pd.read_excel(os.path.join(result_dir, "[DATASET]"+analysis_date+".xlsx"), sheet_name="TEST_dataset")
test_idx = tech_dataset.data.index.astype(int).get_indexer(pd.Index(used_test_data["patent_number"]))
test_dataset = Subset(tech_dataset, test_idx)

# used_train_data = pd.read_excel(os.path.join(result_path, "[DATASET]"+analysis_date+".xlsx"), sheet_name="TRAIN_dataset")
# used_test_data = pd.read_excel(os.path.join(result_path, "[DATASET]"+analysis_date+".xlsx"), sheet_name="TEST_dataset")
# used_train_index = tech_dataset.data.index.get_indexer(pd.Index(used_train_data["number"]))
# used_test_index = tech_dataset.data.index.get_indexer(pd.Index(used_test_data["number"]))

# Inference

## Visualize

In [12]:
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

In [21]:
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True)

In [22]:
zs, ys, preds = [], [], []
newzs = []
for batch_data in tqdm(train_loader):
    batch_data = to_device(batch_data, final_model.device)
    y = batch_data["targets"].cpu().detach().numpy()
    
    enc_outputs, z, mu, logvar = final_model.encode(batch_data["text_inputs"])
    pred_outputs = final_model.predictor(z)
    torch.cuda.empty_cache()

    zs.append(z.cpu().detach().numpy())
    ys.append(y)
    preds.append(pred_outputs.argmax(1).cpu().detach().numpy())
    
    torch.cuda.empty_cache()
                                                
zs = np.concatenate(zs)
ys = np.concatenate(ys)
preds = np.concatenate(preds)

100%|██████████| 939/939 [42:54<00:00,  2.74s/it]


In [25]:
import datetime
from nltk.translate.bleu_score import sentence_bleu
col_years = ["<1976"] + np.arange(1976,2022).astype(str).tolist()
# latest_year = datetime.datetime.now().year - 1
latest_year = 2022
n_TC = configs.data.n_TC

visualize = True

## Validation analysis

In [13]:
ref_config_period = "[2006-2020]"
ref_data_file = "collection_" + "".join([config_keywords, config_area, ref_config_period, config_sampling_ratio]) + ".csv"
ref_configs = copy.deepcopy(configs)
ref_configs.data.update({"target_period": ref_config_period, "data_file": ref_data_file})

In [14]:
ref_dataset = TechDataset(ref_configs.data)




Tokenizer is trained and saved


In [270]:
ref_dataset

<data.TechDataset at 0x7f9d04d0e370>

In [100]:
ref_dataset

<data.TechDataset at 0x7f2844143970>

In [101]:
root_dir

'/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'

In [103]:
ref_dataset_path = os.path.join(root_dir, "data", "[DATASET]reference_[uspto_AI][2006-2020].pickle")

In [104]:
with open(ref_dataset_path, "wb") as f:
    ref_dataset.rawdata = None
    pickle.dump(ref_dataset, f)

In [None]:
with open(dataset_path, "rb") as f:
    ref_dataset = pickle.load(f)

In [15]:
used_rawdata = tech_dataset.rawdata.set_index("patent_number")
total_data = pd.concat([tech_dataset.data, ref_dataset.data], axis=0)
total_rawdata = pd.concat([tech_dataset.rawdata.set_index("patent_number"), ref_dataset.rawdata.set_index("patent_number")], axis=0)

In [16]:
total_data

Unnamed: 0_level_0,patent_number,main_class,sub_class,patent_classes,claims,TC5,TC5_digitized,class
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6983309,6983309,"[G06Q010, H04L063, H04L063, G06Q020, G06Q020, ...","[Y10S707, Y10S707, H04L051]","[G06Q010, H04L063, H04L063, G06Q020, G06Q020, ...",1. An electronic apparatus comprising: a trans...,8,0,1939
6982420,6982420,"[H01J037, G06V020]",[],"[H01J037, G06V020]",1. A sample observation method comprising a st...,1,0,2157
6983071,6983071,"[G06V030, G06V030, G06V010, G06V010]",[G06V030],"[G06V030, G06V030, G06V010, G06V010, G06V030]",1. A character segmentation device for removin...,0,0,1958
6982717,6982717,[G06T005],[],[G06T005],1. A game apparatus comprising: an image gener...,0,0,1948
6983073,6983073,"[G06T005, G06T005, G06T005]",[],"[G06T005, G06T005, G06T005]",1. A method for recovering an image defined as...,1,0,1948
...,...,...,...,...,...,...,...,...
10521802,10521802,"[G06Q050, G06Q010, G06F040, G06F040, G06Q030, ...",[G06F016],"[G06Q050, G06Q010, G06F040, G06F040, G06Q030, ...",1. A computer-implemented method for reporting...,2,0,2539
10517556,10517556,"[A61B006, A61B006, A61B006, A61B006, A61B006, ...","[G01R033, A61B005, G01R033, G01R033, A61B005, ...","[A61B006, A61B006, A61B006, A61B006, A61B006, ...",1. A computer-implemented method for increasin...,0,0,240
10523658,10523658,"[H04L009, H04L009, H04L063, H04L067, H04L063, ...",[],"[H04L009, H04L009, H04L063, H04L067, H04L063, ...",1. A method comprising: establishing a first s...,0,0,2983
10523420,10523420,"[H04L009, H04L009, H04L009, H04B010, H04B010]",[],"[H04L009, H04L009, H04L009, H04B010, H04B010]","1. An apparatus, comprising: a memory to store...",11,1,2983


- Used dataset

In [18]:
L1_criterion = tech_dataset.data["TC5"].quantile(0.9)

In [17]:
# used_test_data_TC = used_test_data[(used_test_data["TC5"]>0) & (used_test_data["TC5"]<L1_criterion)].reset_index()
used_test_data_TC = used_test_data[used_test_data["TC5"]!=0].reset_index()
used_test_index_TC = tech_dataset.data.index.get_indexer(pd.Index(used_test_data_TC["patent_number"].astype(str)))

In [28]:
def breakdown(ipcs):
    return ([ipc[0] for ipc in ipcs], [ipc[:3] for ipc in ipcs], [ipc[:4] for ipc in ipcs], ipcs)

In [26]:
def validate_reliability(model=None, idx=None, L1_threshold=0.5, n_iter=30, step_size=40):
    cnt_nonexist = 0
    cnt_noFC = 0
    cnt_diverge = 0
    cnt_same_ipcs = 0
    cnt_diff_ipcs = 0
    
    input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
    input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index_TC][idx])
    input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
    batch_input = {"class": torch.tensor(input_class), "claim": input_claim}
    input_inf = to_device(batch_input, model.device)

    output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
    batch_output = {"text_outputs": torch.tensor(output_class)}
    output_inf = to_device(batch_output, model.device)

    enc_outputs, z, mu, logvar = model.encode(input_inf)
    org_z = copy.deepcopy(z.view(1,-1).cpu().detach().numpy())
    pred_outputs = model.predict(z)
    org_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())
    dec_inputs = None

    if used_test_data_TC.iloc[idx]["TC5"] > 0:
        forward_refs = used_rawdata.loc[used_test_data_TC.iloc[idx]["patent_number"]]["forward_refs"].split(";")
        ref_info = total_data.loc[[ref for ref in forward_refs if ref in total_data.index]]
        if len(ref_info) == 0:
            cnt_nonexist += 1
            return (cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs), None
        else:
            ref_ipcs = ref_info["patent_classes"].apply(lambda x: set(x))
            ref_FCs = ref_info["TC"+str(n_TC)]

            tokenizer = tech_dataset.tokenizers["class_dec"]
            
            org_text = tokenizer.decode_batch(input_class.cpu().detach().numpy())[0]
            org_text = org_text[org_text.index(tokenizer.sos_token)+1:org_text.index(tokenizer.eos_token)]
            if set(org_text)==set(np.concatenate(ref_ipcs.apply(lambda x: list(x)).values)):
                cnt_same_ipcs += 1

            inclusions = [None, None, None, None]
            higher_impacts = [None, None, None, None]
            similar_refs_out = [None, None, None, None]
            unsimilar_refs_out = [None, None, None, None]
            optimised = False
            for i in range(n_iter):
                pred_outputs = model.predict(z)
                z.retain_grad()
                FC_estimated = np.round(np.exp(pred_outputs[0,1].item()), 4) # estimated forward citations
                FC_estimated_inv = np.round(np.exp(pred_outputs[0,0].item()), 4)
                
                L1_error = (1-torch.exp(pred_outputs[0,1]))
                L1_error.backward(retain_graph=True)

                grad_for_update = (step_size * z.grad)
                z_ = z - grad_for_update

                z.grad.zero_()
                dec_outputs = model.decode(z_, enc_outputs, dec_inputs=None)
                dec_outputs = dec_outputs.argmax(-1)

                tokenizer = tech_dataset.tokenizers["class_dec"]
                gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
                if tokenizer.eos_token in gen_text:
                    gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:gen_text.index(tokenizer.eos_token)]
                else:
                    gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:]
                if gen_text != []:
                    gen_text = [gen_text[0]] + list(np.array(gen_text[1:])[np.unique(gen_text[1:], return_index=True)[1]])                
                    gen_text = set(gen_text)
                else: continue
                
                if FC_estimated>=L1_threshold:
                    optimised = True
    
                    gen_text_breakdown = breakdown(gen_text)
                    ref_ipcs_breakdown = (ref_ipcs.apply(lambda x: breakdown(x)[0]), ref_ipcs.apply(lambda x: breakdown(x)[1]), ref_ipcs.apply(lambda x: breakdown(x)[2]), ref_ipcs)
            
                    for i in range(4):
                        if inclusions[i] is not None: continue
                        temp_gen_text = gen_text_breakdown[i]
                        temp_ref_ipcs = ref_ipcs_breakdown[i]
                    
                        hit_index = temp_ref_ipcs.apply(lambda x: 1 if set(x)==set(temp_gen_text) else 0)==1
                        similar_refs = temp_ref_ipcs[hit_index].index
                        similar_refs_out[i] = similar_refs
                        unsimilar_refs = temp_ref_ipcs[~hit_index].index
                        unsimilar_refs_out[i] = unsimilar_refs
                        if len(similar_refs) == 0:
                            inclusions[i] = 0
                            higher_impacts[i] = None
                        elif len(unsimilar_refs) == 0:
                            inclusions[i] = 1
                            similar_mean_FC = np.mean(ref_FCs.loc[similar_refs])
                            if similar_mean_FC <= 0:
                                higher_impacts[i] = 0
                            else:
                                higher_impacts[i] = 1
                        else:
                            inclusions[i] = 1
                            similar_mean_FC = np.mean(ref_FCs.loc[similar_refs])
                            unsimilar_mean_FC = np.mean(ref_FCs.loc[unsimilar_refs])
                            if similar_mean_FC >= unsimilar_mean_FC:
                                if similar_mean_FC <= 0:
                                    higher_impacts[i] = None
                                else:
                                    higher_impacts[i] = 1
                            else:
                                higher_impacts[i] = 0
                    if None not in inclusions:
                        break
                z = z_
                
            if optimised:
                cnt_diff_ipcs += 1
                return (cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs), {"index": idx, "patent_id": used_test_data_TC.iloc[idx]["patent_number"], 
                         "org_text": org_text, "gen_text": gen_text, "ref_ipcs": ref_ipcs, "ref_FCs": ref_FCs,
                         "inclusions": inclusions, "higher_impacts": higher_impacts, 
                         "FC_estimated": FC_estimated,
                         "similar_refs": similar_refs_out, "unsimilar_refs": unsimilar_refs_out}
            else:
                cnt_diverge += 1
                return (cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs), None
    else:
        pass
        cnt_noFC += 1
        return (cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs), None

In [290]:
# Temporary testing
idx = 0

cnt_nonexist = 0
cnt_noFC = 0
cnt_diverge = 0
cnt_same_ipcs = 0
cnt_diff_ipcs = 0

input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index_TC][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
batch_input = {"class": torch.tensor(input_class), "claim": input_claim}
input_inf = to_device(batch_input, model.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
batch_output = {"text_outputs": torch.tensor(output_class)}
output_inf = to_device(batch_output, model.device)

enc_outputs, z, mu, logvar = model.encode(input_inf)
org_z = copy.deepcopy(z.view(1,-1).cpu().detach().numpy())
pred_outputs = model.predict(z)
org_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())
dec_inputs = None


In [303]:
used_test_data_TC.iloc[idx]

index                                                              0
patent_number                                                8423759
patent_number.1                                              8423759
main_class         ['H04L009', 'H04L063', 'H04L009', 'H04L009', '...
sub_class                                                         []
patent_classes     ['H04L009', 'H04L063', 'H04L009', 'H04L009', '...
claims             1. A method of bootstrapping configuration for...
TC5                                                               39
TC5_digitized                                                      1
class                                                           2335
Name: 0, dtype: object

In [291]:
used_test_data_TC.iloc[idx]["TC5"]

39

In [292]:
forward_refs = used_rawdata.loc[used_test_data_TC.iloc[idx]["patent_number"]]["forward_refs"].split(";")
ref_info = total_data.loc[[ref for ref in forward_refs if ref in total_data.index]]

In [299]:
ref_info

Unnamed: 0_level_0,patent_number,main_class,sub_class,patent_classes,claims,TC5,TC5_digitized,class
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9954866,9954866,"[H04L009, H04L009, H04L009, H04L063, H04L063, ...","[H04L009, H04L009]","[H04L009, H04L009, H04L009, H04L063, H04L063, ...","1. A computer-implemented method, comprising: ...",2,0,2983
9215076,9215076,"[H04L009, H04L063, H04L009, H04L009, H04L063, ...",[H04L009],"[H04L009, H04L063, H04L009, H04L009, H04L063, ...","1. A computer-implemented method, comprising: ...",85,1,2335
9215076,9215076,"[H04L009, H04L063, H04L009, H04L009, H04L063, ...",[H04L009],"[H04L009, H04L063, H04L009, H04L009, H04L063, ...","1. A computer-implemented method, comprising: ...",85,1,2983
9819654,9819654,"[H04L067, H04L063, H04L063, H04L063, H04L063, ...",[H04L063],"[H04L067, H04L063, H04L063, H04L063, H04L063, ...","1. A system, comprising: one or more processor...",2,0,2998
9882900,9882900,"[H04L063, H04L009, H04L009, H04L009, H04L063, ...",[H04L009],"[H04L063, H04L009, H04L009, H04L009, H04L063, ...","1. A computer-implemented method, comprising: ...",0,0,2996
9420007,9420007,"[H04L063, H04L009, H04L063, H04L063, G06F021, ...","[H04L063, G06F021]","[H04L063, H04L009, H04L063, H04L063, G06F021, ...",1. A system of a virtual computing resource se...,2,0,2996
9197409,9197409,"[H04L009, H04L009, G06F021, H04L009, H04L009, ...","[H04L063, H04L063, H04L009]","[H04L009, H04L009, G06F021, H04L009, H04L009, ...",1. A computer-implemented method of authentica...,2,0,2335
9197409,9197409,"[H04L009, H04L009, G06F021, H04L009, H04L009, ...","[H04L063, H04L063, H04L009]","[H04L009, H04L009, G06F021, H04L009, H04L009, ...",1. A computer-implemented method of authentica...,2,0,2983
9311500,9311500,"[G06F021, H04L009, H04L009, G06F021, H04L063, ...","[H04L009, H04L009, H04L009]","[G06F021, H04L009, H04L009, G06F021, H04L063, ...","1. A computer-implemented method, comprising: ...",4,0,2513
10356062,10356062,"[H04L063, H04L063, G06F021, H04L009, H04L063, ...",[H04L009],"[H04L063, H04L063, G06F021, H04L009, H04L063, ...","1. A computer-implemented method, comprising: ...",0,0,2996


In [300]:
ref_ipcs = ref_info["patent_classes"].apply(lambda x: set(x))
ref_FCs = ref_info["TC"+str(n_TC)]

tokenizer = tech_dataset.tokenizers["class_dec"]

org_text = tokenizer.decode_batch(input_class.cpu().detach().numpy())[0]
org_text = org_text[org_text.index(tokenizer.sos_token)+1:org_text.index(tokenizer.eos_token)]
if set(org_text)==set(np.concatenate(ref_ipcs.apply(lambda x: list(x)).values)):
    cnt_same_ipcs += 1

In [310]:
ref_ipcs.apply(lambda x: list(x)).values

array([list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L063', 'H04L009', 'H04L067']),
       list(['H04L009', 'H04L063']),
       list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L063', 'H04L009', 'G06F021']),
       list(['H04L063', 'H04L009', 'G06F021']),
       list(['H04L063', 'H04L009', 'G06F021']),
       list(['G06F021', 'H04L063', 'H04L009']),
       list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L009', 'H04L063']),
       list(['H04L009', 'H04L063', 'G06F021']),
       list(['H04L063', 'G06F012', 'G06F021']),
       list(['H04L009', 'H04L063', 'H04L067']),
       list(['H04L009', 'H04L063']),
       list(['H04L009', 'H04L063', 'H04L067'])], dtype=object)

In [308]:
set(np.concatenate(ref_ipcs.apply(lambda x: list(x)).values))

{'G06F012', 'G06F021', 'H04L009', 'H04L063', 'H04L067'}

In [301]:
org_text

['H04L009',
 'H04L063',
 'H04L009',
 'H04L009',
 'H04L063',
 'H04L063',
 'H04L063',
 'H04L063']

In [312]:
inclusions = [None, None, None, None]
higher_impacts = [None, None, None, None]
similar_refs_out = [None, None, None, None]
unsimilar_refs_out = [None, None, None, None]
optimised = False

In [313]:
pred_outputs = model.predict(z)
z.retain_grad()
FC_estimated = np.round(np.exp(pred_outputs[0,1].item()), 4) # estimated forward citations
FC_estimated_inv = np.round(np.exp(pred_outputs[0,0].item()), 4)

L1_error = (1-torch.exp(pred_outputs[0,1]))
L1_error.backward(retain_graph=True)

grad_for_update = (step_size * z.grad)
z_ = z - grad_for_update

z.grad.zero_()
dec_outputs = model.decode(z_, enc_outputs, dec_inputs=None)
dec_outputs = dec_outputs.argmax(-1)

In [314]:
dec_outputs

tensor([[   0, 3183, 3701, 3701, 3701, 3701, 3183, 3183, 3183, 3183, 3183, 3183,
         3182, 3701, 3152, 3152, 3152, 3701, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152, 3152,
         3152, 3152, 3152, 3

In [317]:
tokenizer = tech_dataset.tokenizers["class_dec"]
gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
gen_text

['<SOS>',
 'G06Q020',
 'H04L063',
 'H04L063',
 'H04L063',
 'H04L063',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q010',
 'H04L063',
 'G06F016',
 'G06F016',
 'G06F016',
 'H04L063',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F

In [318]:
if tokenizer.eos_token in gen_text:
    gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:gen_text.index(tokenizer.eos_token)]
else:
    gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:]

In [319]:
gen_text

['G06Q020',
 'H04L063',
 'H04L063',
 'H04L063',
 'H04L063',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q020',
 'G06Q010',
 'H04L063',
 'G06F016',
 'G06F016',
 'G06F016',
 'H04L063',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G06F016',
 'G0

In [321]:
if gen_text != []:
    gen_text = [gen_text[0]] + list(np.array(gen_text[1:])[np.unique(gen_text[1:], return_index=True)[1]])                
    gen_text = set(gen_text)

In [322]:
gen_text

{'G06F016', 'G06Q010', 'G06Q020', 'H04L063'}

In [323]:
FC_estimated

0.618

In [324]:
breakdown(gen_text)

(['G', 'G', 'G', 'H'],
 ['G06', 'G06', 'G06', 'H04'],
 ['G06Q', 'G06F', 'G06Q', 'H04L'],
 {'G06F016', 'G06Q010', 'G06Q020', 'H04L063'})

In [325]:
gen_text_breakdown = breakdown(gen_text)
ref_ipcs_breakdown = (ref_ipcs.apply(lambda x: breakdown(x)[0]), ref_ipcs.apply(lambda x: breakdown(x)[1]), ref_ipcs.apply(lambda x: breakdown(x)[2]), ref_ipcs)

In [326]:
ref_ipcs_breakdown

(patent_number
 9954866     [H, H, G]
 9215076     [H, H, G]
 9215076     [H, H, G]
 9819654     [H, H, H]
 9882900        [H, H]
 9420007     [H, H, G]
 9197409     [H, H, G]
 9197409     [H, H, G]
 9311500     [H, H, G]
 10356062    [G, H, H]
 9906564     [H, H, G]
 10375067       [H, H]
 10044503    [H, H, G]
 9292711     [H, G, G]
 10412059    [H, H, H]
 9258117        [H, H]
 9237019     [H, H, H]
 Name: patent_classes, dtype: object,
 patent_number
 9954866     [H04, H04, G06]
 9215076     [H04, H04, G06]
 9215076     [H04, H04, G06]
 9819654     [H04, H04, H04]
 9882900          [H04, H04]
 9420007     [H04, H04, G06]
 9197409     [H04, H04, G06]
 9197409     [H04, H04, G06]
 9311500     [H04, H04, G06]
 10356062    [G06, H04, H04]
 9906564     [H04, H04, G06]
 10375067         [H04, H04]
 10044503    [H04, H04, G06]
 9292711     [H04, G06, G06]
 10412059    [H04, H04, H04]
 9258117          [H04, H04]
 9237019     [H04, H04, H04]
 Name: patent_classes, dtype: object,
 patent_nu

In [333]:
for i in range(1):
    if inclusions[i] is not None: continue
    temp_gen_text = gen_text_breakdown[i]
    temp_ref_ipcs = ref_ipcs_breakdown[i]

    hit_index = temp_ref_ipcs.apply(lambda x: 1 if set(x)==set(temp_gen_text) else 0)==1
    similar_refs = temp_ref_ipcs[hit_index].index
    similar_refs_out[i] = similar_refs
    unsimilar_refs = temp_ref_ipcs[~hit_index].index
    unsimilar_refs_out[i] = unsimilar_refs
    if len(similar_refs) == 0:
        inclusions[i] = 0
        higher_impacts[i] = None
    elif len(unsimilar_refs) == 0:
        inclusions[i] = 1
        similar_mean_FC = np.mean(ref_FCs.loc[similar_refs])
        if similar_mean_FC <= 0:
            higher_impacts[i] = 0
        else:
            higher_impacts[i] = 1
    else:
        inclusions[i] = 1
        similar_mean_FC = np.mean(ref_FCs.loc[similar_refs])
        unsimilar_mean_FC = np.mean(ref_FCs.loc[unsimilar_refs])
        if similar_mean_FC >= unsimilar_mean_FC:
            if similar_mean_FC <= 0:
                higher_impacts[i] = None
            else:
                higher_impacts[i] = 1
        else:
            higher_impacts[i] = 0

In [341]:
higher_impacts

[1, 1, None, None]

In [339]:
unsimilar_refs

Index(['9819654', '9882900', '10375067', '10412059', '9258117', '9237019'], dtype='object', name='patent_number')

In [335]:
temp_gen_text

['G', 'G', 'G', 'H']

In [336]:
temp_ref_ipcs

patent_number
9954866     [H, H, G]
9215076     [H, H, G]
9215076     [H, H, G]
9819654     [H, H, H]
9882900        [H, H]
9420007     [H, H, G]
9197409     [H, H, G]
9197409     [H, H, G]
9311500     [H, H, G]
10356062    [G, H, H]
9906564     [H, H, G]
10375067       [H, H]
10044503    [H, H, G]
9292711     [H, G, G]
10412059    [H, H, H]
9258117        [H, H]
9237019     [H, H, H]
Name: patent_classes, dtype: object

In [337]:
hit_index

patent_number
9954866      True
9215076      True
9215076      True
9819654     False
9882900     False
9420007      True
9197409      True
9197409      True
9311500      True
10356062     True
9906564      True
10375067    False
10044503     True
9292711      True
10412059    False
9258117     False
9237019     False
Name: patent_classes, dtype: bool

In [334]:
inclusions

[1, None, None, None]

- Load computed dict_out (validation)

In [84]:
load_dict_out = True
save_dict_out = False
L1_threshold = 0.9 # or None
n_iter = 30
step_size = 40
analysis_config = analysis_date + "_thre" + str(L1_threshold)
print(analysis_config)

2025-06-07_1732_thre0.9


In [85]:
if load_dict_out:
    with open("../results/validation/"+analysis_config+"/dict_out.pickle", "rb") as f:
        dict_out = pickle.load(f)
    print("dict_out loaded")
else:
    model = final_model.module if torch.cuda.is_available() else final_model
    cnt_nonexist, cnt_noFC, cnt_diverge, cnt_same_ipcs, cnt_diff_ipcs = 0, 0, 0, 0, 0
    dict_out = {"index": [], "patent_id": [], "org_text": [], "gen_text": [], "ref_ipcs": [], "ref_FCs": [],
                "inclusions": [], "higher_impacts": [], "FC_estimated": [], "similar_refs": [], "unsimilar_refs": []}
    for idx in tqdm(range(len(used_test_index_TC))):
#     for idx in tqdm(range(500)):
        cnts, results = validate_reliability(model=model, idx=idx, L1_threshold=L1_threshold, n_iter=n_iter, step_size=step_size)
        cnt_nonexist += cnts[0]
        cnt_noFC += cnts[1]
        cnt_diverge += cnts[2]
        cnt_same_ipcs += cnts[3]
        cnt_diff_ipcs += cnts[4]
        if results is not None:
            for k,v in results.items():
                dict_out[k].append(v)
    for k, v in dict_out.items():
        dict_out[k] = np.array(v)
    dict_out["cnts"] = {"cnt_nonexist": cnt_nonexist, "cnt_noFC": cnt_noFC, 
                "cnt_diverge": cnt_diverge, "cnt_same_ipcs": cnt_same_ipcs, "cnt_diff_ipcs": cnt_diff_ipcs}
    
    if save_dict_out:
        save_dir = "../results/validation/"+analysis_config
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        with open(save_dir+"/dict_out.pickle", "wb") as f:
            pickle.dump(dict_out, f)

dict_out loaded


In [86]:
dict_out["cnts"]

{'cnt_nonexist': 2983,
 'cnt_noFC': 0,
 'cnt_diverge': 10,
 'cnt_same_ipcs': 618,
 'cnt_diff_ipcs': 6927}

In [87]:
ipcs_comparison = pd.concat([pd.Series(dict_out["org_text"]).apply(lambda x: set(x)), pd.Series(dict_out["gen_text"]).apply(lambda x: set(x))], axis=1)

In [88]:
n_differentiated = ipcs_comparison.apply(lambda x: 1 if x[0]!=x[1] else 0, axis=1).sum()

In [89]:
n_differentiated / len(ipcs_comparison)

0.9411000433087917

In [90]:
print("#total:",len(used_test_index_TC))
print("#valid data:",len(dict_out["index"]))
print("#nonexist data:",dict_out["cnts"]["cnt_nonexist"])
print("#no forward citations:",dict_out["cnts"]["cnt_noFC"])
print("#diverged:",dict_out["cnts"]["cnt_diverge"])
print("#same ipcs:",dict_out["cnts"]["cnt_same_ipcs"])
print("\n")
for i in range(1,4):
    inclusions = np.array(dict_out["inclusions"])[:,i]
    ratio_included = np.round(len(inclusions[inclusions==1])/len(inclusions), 4)
    higher_impacts = np.array(dict_out["higher_impacts"])[:,i][np.array(dict_out["higher_impacts"])[:,i] != None]
    ratio_higher_impact = np.round(len(higher_impacts[higher_impacts==1])/len(higher_impacts), 4)
    print(f"for level {i+1}, Ratio generated IPCs are included in citing patents: {ratio_included}")
    print(f"for level {i+1}, Ratio generated IPCs have higher impact than other citing patents: {ratio_higher_impact}")
    print("\n")

#total: 9920
#valid data: 6927
#nonexist data: 2983
#no forward citations: 0
#diverged: 10
#same ipcs: 618


for level 2, Ratio generated IPCs are included in citing patents: 0.34
for level 2, Ratio generated IPCs have higher impact than other citing patents: 0.5248


for level 3, Ratio generated IPCs are included in citing patents: 0.1318
for level 3, Ratio generated IPCs have higher impact than other citing patents: 0.4989


for level 4, Ratio generated IPCs are included in citing patents: 0.037
for level 4, Ratio generated IPCs have higher impact than other citing patents: 0.4758




In [91]:
temp_inclusions = np.array(dict_out["inclusions"])[np.array(dict_out["inclusions"])[:,-1]==0]

In [92]:
temp_inclusions = np.array(dict_out["inclusions"])[np.array(dict_out["inclusions"])[:,-1]==0]
for i in range(3):
    temp_ratio = len(temp_inclusions[temp_inclusions[:,i]==0]) / len(temp_inclusions)
    print(f"for level {i+1}, Hit ratio when level 4 is not hit: {np.round(temp_ratio,4)}")

for level 1, Hit ratio when level 4 is not hit: 0.5128
for level 2, Hit ratio when level 4 is not hit: 0.6854
for level 3, Hit ratio when level 4 is not hit: 0.9015


In [93]:
tech_dataset.data.shape

(133654, 8)

In [94]:
from scipy.stats import ttest_ind
whole_patent_classes = tech_dataset.data["patent_classes"].apply(lambda x: set(x))

hit_similar_FCs, hit_unsimilar_FCs, hit_diff_FCs = [], [], []
hit_similar_FCs_mean, hit_unsimilar_FCs_mean = [], []
hit_similar_FCs_rank = []
whole_FC_ttest = {"statistic": [], "pvalue": []}
whole_FCs_diff = []
hit_samples_index = dict_out["inclusions"][:,-1]==1

hit_patent_ids = dict_out["patent_id"][hit_samples_index]
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]
hit_unsimilar_refs = dict_out["unsimilar_refs"][hit_samples_index][:,-1]

for i in range(len(hit_patent_ids)):
    hit_FCs = dict_out["ref_FCs"][hit_samples_index][i]
    hit_FCs = hit_FCs.loc[~hit_FCs.index.duplicated(keep="first")]
    hit_similar_FC = hit_FCs.loc[hit_similar_refs[i].drop_duplicates()]
    hit_similar_FC_rank = hit_FCs.rank(pct=True).loc[hit_similar_refs[i].drop_duplicates()]
    hit_unsimilar_FC = hit_FCs.loc[hit_unsimilar_refs[i].drop_duplicates()]
    if len(hit_unsimilar_FC)>0:
        hit_diff_FC = hit_similar_FC.mean() - hit_unsimilar_FC.mean()
    else:
        hit_diff_FC = hit_similar_FC.mean()
    
    org_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["org_text"][hit_samples_index][i])].index]["TC5"]
    gen_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["gen_text"][hit_samples_index][i])].index]["TC5"]
    if len(org_whole_FC)>0:
        whole_FC_diff = gen_whole_FC.mean() - org_whole_FC.mean()
    else:
        whole_FC_diff = gen_whole_FC.mean()
    
    hit_similar_FCs.append(hit_similar_FC)
    hit_similar_FCs_rank.append(hit_similar_FC_rank)
    hit_similar_FCs_mean.append(hit_similar_FC.mean())
    hit_unsimilar_FCs.append(hit_unsimilar_FC)
    if len(hit_unsimilar_FC)>0:
        hit_unsimilar_FCs_mean.append(hit_unsimilar_FC.mean())
    else:
        hit_unsimilar_FCs_mean.append(0)
    hit_diff_FCs.append(hit_diff_FC)
    
    ttest_res = ttest_ind(gen_whole_FC, org_whole_FC, equal_var=False)
    
    whole_FC_ttest["statistic"].append(ttest_res.statistic)
    whole_FC_ttest["pvalue"].append(ttest_res.pvalue)
    if set(dict_out["org_text"][hit_samples_index][i]) != set(dict_out["gen_text"][hit_samples_index][i]):
        whole_FCs_diff.append(whole_FC_diff)
    
hit_similar_FCs = np.concatenate(hit_similar_FCs)
hit_similar_FCs_rank = np.concatenate(hit_similar_FCs_rank)
hit_similar_FCs_mean = np.array(hit_similar_FCs_mean)
hit_unsimilar_FCs = np.concatenate(hit_unsimilar_FCs)
hit_unsimilar_FCs_mean = np.array(hit_unsimilar_FCs_mean)
hit_diff_FCs = np.array(hit_diff_FCs)

print("Distribution of forward citations (hit)\n",pd.Series(hit_similar_FCs).describe(),"\n")
print("Distribution of forward citations (not hit)\n",pd.Series(hit_unsimilar_FCs).describe(),"\n")
print("Distribution of mean forward citations (hit)\n",pd.Series(hit_similar_FCs_mean).describe(),"\n")
print("Distribution of mean forward citations (not hit)\n",pd.Series(hit_unsimilar_FCs_mean).describe(),"\n")
print("Distribution of difference of forward citations\n",pd.Series(hit_diff_FCs).describe(),"\n")

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Distribution of forward citations (hit)
 count    682.000000
mean      27.520528
std       67.112002
min        0.000000
25%        1.000000
50%        4.000000
75%       15.000000
max      843.000000
dtype: float64 

Distribution of forward citations (not hit)
 count    3492.000000
mean       22.391753
std        46.206764
min         0.000000
25%         1.000000
50%         4.000000
75%        19.000000
max       481.000000
dtype: float64 

Distribution of mean forward citations (hit)
 count    256.000000
mean      13.594364
std       39.055363
min        0.000000
25%        0.666667
50%        3.000000
75%        9.125000
max      421.000000
dtype: float64 

Distribution of mean forward citations (not hit)
 count    256.000000
mean       9.779638
std       20.097793
min        0.000000
25%        0.500000
50%        3.645833
75%       10.401316
max      137.148148
dtype: float64 

Distribution of difference of forward citations
 count    256.000000
mean       3.814726
std       32.

In [342]:
whole_patent_classes = tech_dataset.data["patent_classes"].apply(lambda x: set(x))

In [344]:
hit_similar_FCs, hit_unsimilar_FCs, hit_diff_FCs = [], [], []
hit_similar_FCs_mean, hit_unsimilar_FCs_mean = [], []
hit_similar_FCs_rank = []
whole_FC_ttest = {"statistic": [], "pvalue": []}
whole_FCs_diff = []
hit_samples_index = dict_out["inclusions"][:,-1]==1

In [345]:
hit_patent_ids = dict_out["patent_id"][hit_samples_index]
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]
hit_unsimilar_refs = dict_out["unsimilar_refs"][hit_samples_index][:,-1]

In [353]:
i = 0

In [354]:
hit_FCs = dict_out["ref_FCs"][hit_samples_index][i]
hit_FCs = hit_FCs.loc[~hit_FCs.index.duplicated(keep="first")]
hit_similar_FC = hit_FCs.loc[hit_similar_refs[i].drop_duplicates()]
hit_similar_FC_rank = hit_FCs.rank(pct=True).loc[hit_similar_refs[i].drop_duplicates()]
hit_unsimilar_FC = hit_FCs.loc[hit_unsimilar_refs[i].drop_duplicates()]

In [359]:
hit_unsimilar_FC

patent_number
10236013      0
8135700       4
9959028       6
10236011      0
10236012      0
10311887      0
9509269     111
10133460      5
10391361      7
10410649      0
10297265      0
8135736       3
10416666      4
Name: TC5, dtype: int64

In [360]:
if len(hit_unsimilar_FC)>0:
    hit_diff_FC = hit_similar_FC.mean() - hit_unsimilar_FC.mean()
else:
    hit_diff_FC = hit_similar_FC.mean()

In [361]:
hit_diff_FC

-4.76923076923077

In [362]:
org_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["org_text"][hit_samples_index][i])].index]["TC5"]
gen_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["gen_text"][hit_samples_index][i])].index]["TC5"]

In [364]:
gen_whole_FC

patent_number
6990503    23
7020612     5
7039873     2
7065741     1
7093012    20
           ..
9223872     1
9223853     1
9223861     0
9223782     0
9223902    11
Name: TC5, Length: 348, dtype: int64

In [365]:
if len(org_whole_FC)>0:
    whole_FC_diff = gen_whole_FC.mean() - org_whole_FC.mean()
else:
    whole_FC_diff = gen_whole_FC.mean()

In [366]:
whole_FC_diff

-2.43103448275862

In [367]:
hit_similar_FCs.append(hit_similar_FC)
hit_similar_FCs_rank.append(hit_similar_FC_rank)
hit_similar_FCs_mean.append(hit_similar_FC.mean())
hit_unsimilar_FCs.append(hit_unsimilar_FC)

In [None]:
if len(hit_unsimilar_FC)>0:
    hit_unsimilar_FCs_mean.append(hit_unsimilar_FC.mean())
else:
    hit_unsimilar_FCs_mean.append(0)
hit_diff_FCs.append(hit_diff_FC)

ttest_res = ttest_ind(gen_whole_FC, org_whole_FC, equal_var=False)

whole_FC_ttest["statistic"].append(ttest_res.statistic)
whole_FC_ttest["pvalue"].append(ttest_res.pvalue)
if set(dict_out["org_text"][hit_samples_index][i]) != set(dict_out["gen_text"][hit_samples_index][i]):
    whole_FCs_diff.append(whole_FC_diff)

In [264]:
dict_out["patent_id"][hit_samples_index]

array([8108343, 8032383, 8024799, 8819826, 8355610, 7010095, 8977643,
       7242681, 7085753, 7076481, 7181459, 8059265, 8571618, 8849670,
       7185000, 7062083, 7243093, 7401069, 8886540, 8797295, 7440981])

In [368]:
cnt_same, cnt_diff = 0, 0
ranks_same, ranks_diff = [], []
value_same, value_diff = [], []
org_patents_same, org_patents_diff = [], []
cols_val = ["patent_id", "org_ipcs", "org_FC", "gen_ipcs", "is_same", 
            "forward_ref", "ref_ipcs", "ref_FC", "ref_FC_rank"]
df_val = pd.DataFrame(columns=cols_val)
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]

In [376]:
dict_out["ref_ipcs"][hit_samples_index][0]

patent_number
10236013    {G10L013, H04W004, G06F003, H04M001, G10L025, ...
8135700                                    {G06F016, Y10S715}
8135700                                    {G06F016, Y10S715}
9959028     {E21B044, G06F003, E21B047, E21B049, H04L067, ...
10236011    {H04R001, G10L013, H04W004, G06F003, H04M001, ...
10236012    {H04R001, G10L013, H04W004, G06F003, H04M001, ...
10311887    {H04R001, G10L013, G06F003, H04W004, H04M001, ...
9509269                           {G11B020, G11B027, H03G003}
10133460                 {E21B047, H04L067, G06F003, H04L069}
10391361                 {G06F016, A63B022, A63B024, A63B071}
10410649    {H04R001, G10L013, H04W004, G06F003, H04M001, ...
10297265    {H04R001, G10L013, G06F003, H04W004, G10L025, ...
8135736                                    {G06F016, Y10S715}
8135736                                    {G06F016, Y10S715}
10416666        {B64C039, G06F003, H04L065, G05D001, G06Q010}
9576050                                             {G06

In [377]:
dict_out["patent_id"][hit_samples_index][i]

7542816

In [380]:
tech_dataset.data.index

Index(['6983309', '6982420', '6983071', '6982717', '6983073', '6983051',
       '6983229', '6983242', '6983320', '6982659',
       ...
       '9226105', '9223299', '9226051', '9225639', '9225549', '9223620',
       '9223405', '9225764', '9226068', '9223929'],
      dtype='object', name='patent_number', length=133654)

In [396]:
dict_out["patent_id"][hit_samples_index].shape

(381,)

In [95]:
cnt_same, cnt_diff = 0, 0
ranks_same, ranks_diff = [], []
value_same, value_diff = [], []
org_patents_same, org_patents_diff = [], []
cols_val = ["patent_id", "org_ipcs", "org_FC", "gen_ipcs", "is_same", 
            "forward_ref", "ref_ipcs", "ref_FC", "ref_FC_rank"]
df_val = pd.DataFrame(columns=cols_val)
hit_similar_refs = dict_out["similar_refs"][hit_samples_index][:,-1]

for i in tqdm(range(len(dict_out["ref_ipcs"][hit_samples_index]))):
    pid = dict_out["patent_id"][hit_samples_index][i]
    pid = str(pid)
    org_FC = tech_dataset.data.loc[pid]["TC5"]
    orgs = set(dict_out["org_text"][dict_out["inclusions"][:,-1]==1][i])
    gens = set(dict_out["gen_text"][dict_out["inclusions"][:,-1]==1][i])
    is_same = 1 if orgs==gens or orgs.union(gens)==orgs else 0
    hit_FCs = dict_out["ref_FCs"][hit_samples_index][i]
    hit_FCs = hit_FCs.loc[~hit_FCs.index.duplicated(keep="first")]
    hit_similar_FC = hit_FCs.loc[hit_similar_refs[i][hit_similar_refs[i].duplicated()]]
#     hit_similar_FC = hit_similar_FC.drop_duplicates()
    hit_similar_FC_rank = hit_FCs.rank(pct=True).loc[hit_similar_refs[i][hit_similar_refs[i].duplicated()]]
#     hit_similar_FC_rank = hit_similar_FC_rank.drop_duplicates()
    
    for ref in hit_similar_refs[i].drop_duplicates():
        ref_ipcs = dict_out["ref_ipcs"][hit_samples_index][i].loc[ref]
        if isinstance(ref_ipcs, pd.Series): ref_ipcs = ref_ipcs[0]
        hit_similar_FC = hit_FCs.loc[ref]
        if isinstance(hit_similar_FC, pd.Series): hit_similar_FC = hit_similar_FC[0]
        hit_similar_FC_rank = hit_FCs.rank(pct=True).loc[ref]
        if isinstance(hit_similar_FC_rank, pd.Series): hit_similar_FC_rank = hit_similar_FC_rank[0]
        
        df_container = pd.DataFrame([[pid, orgs, org_FC, gens, is_same, ref, ref_ipcs, hit_similar_FC, hit_similar_FC_rank]], columns=cols_val)
        df_val = pd.concat([df_val, df_container])

df_val = df_val.set_index("patent_id")

100%|██████████| 256/256 [00:03<00:00, 76.96it/s] 


In [96]:
df_val.loc[:,"ref_FC_new"] = df_val.apply(lambda x: str(x["ref_FC"])+" ("+str(np.round(x["ref_FC_rank"],2))+")", axis=1)

In [97]:
df_val

Unnamed: 0_level_0,org_ipcs,org_FC,gen_ipcs,is_same,forward_ref,ref_ipcs,ref_FC,ref_FC_rank,ref_FC_new
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7542816,"{Y10S707, G11B027, G06F016}",18,{G06F016},1,9576050,{G06F016},6,0.821429,6 (0.82)
8108343,{G06F016},11,{G06F016},1,10331657,{G06F016},2,0.500000,2 (0.5)
8032383,{G10L015},51,{G10L015},1,9858925,{G10L015},35,0.822222,35 (0.82)
8032383,{G10L015},51,{G10L015},1,10395654,{G10L015},3,0.400000,3 (0.4)
8032383,{G10L015},51,{G10L015},1,10079014,{G10L015},0,0.107407,0 (0.11)
...,...,...,...,...,...,...,...,...,...
7080090,"{G06F021, Y10S707, G06F016}",3,"{G06F021, G06F016}",1,10346429,"{G06F021, G06F016}",12,1.000000,12 (1.0)
8904496,"{H04L009, H04W012, G06F021}",6,"{H04L063, G06F021, H04W012}",0,10284538,"{H04L063, G06F021, H04W012}",0,0.750000,0 (0.75)
9043922,{G06F021},16,"{H04L063, G06F021, G06F016}",0,9977920,"{H04L063, G06F021, G06F016}",1,0.583333,1 (0.58)
7475425,{H04L063},17,{H04L063},1,9356941,{H04L063},15,1.000000,15 (1.0)


In [98]:
L1_criterion = tech_dataset.data["TC5"].quantile(0.9)
print("total hit:", len(df_val))
print("same:",len(df_val[df_val["is_same"]==1]))
print("diff:",len(df_val[df_val["is_same"]==0]))
print("over L1 criterion:", len(df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion]))
print("ratio:",len(df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion]) / len(df_val[df_val["is_same"]==0]))

total hit: 682
same: 634
diff: 48
over L1 criterion: 8
ratio: 0.16666666666666666


In [99]:
## 인용 IPC 중 생성 IPC와 동일한 게 있으면서, 입력 IPC와 생성 IPC가 다르고, 피인용수가 전체 데이터셋의 L1 기준 이상인 샘플
df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion].drop(labels=["is_same", "ref_FC", "ref_FC_rank"], axis=1)

Unnamed: 0_level_0,org_ipcs,org_FC,gen_ipcs,forward_ref,ref_ipcs,ref_FC_new
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7242681,{H04L063},10,"{H04L063, G06F021}",9875344,"{H04L063, G06F021}",20 (0.8)
8307177,"{G06F011, G06F009, G06F016}",165,"{G06F011, G06F003, G06F016}",9588849,"{G06F011, G06F003, G06F016}",24 (0.84)
7328155,{G10L015},29,"{G10L015, G01C021}",8219399,"{G10L015, G01C021}",29 (0.75)
7363549,{G06F011},28,"{G06F011, G06F016}",7979441,"{G06F011, G06F016}",18 (0.77)
7363549,{G06F011},28,"{G06F011, G06F016}",8108429,"{G06F011, G06F016}",40 (1.0)
7363549,{G06F011},28,"{G06F011, G06F016}",7904913,"{G06F011, G06F016}",24 (0.86)
7363549,{G06F011},28,"{G06F011, G06F016}",7689602,"{G06F011, G06F016}",17 (0.73)
7363549,{G06F011},28,"{G06F011, G06F016}",8131723,"{G06F011, G06F016}",29 (0.95)


In [82]:
## 인용 IPC 중 생성 IPC와 동일하면서, 입력 IPC와는 다르고, 피인용수가 입력 특허보다 많은 샘플
df_val[df_val["is_same"]==0].drop(labels=["is_same", "ref_FC", "ref_FC_rank"], axis=1).apply(lambda x: x.name if int(x["org_FC"]) <= int(x["ref_FC_new"].split("(")[0]) else np.nan, axis=1).dropna()

patent_id
7242681    7242681
8621259    8621259
8189878    8189878
7917869    7917869
7685179    7685179
7409390    7409390
7328155    7328155
7363549    7363549
7363549    7363549
7505979    7505979
dtype: object

In [391]:
df_val.loc["7188306"]

org_ipcs                {Y10S707, G06F040}
org_FC                                   3
gen_ipcs       {Y10S707, G06F040, G06F016}
is_same                                  0
forward_ref                        8327260
ref_ipcs       {G06F040, Y10S707, G06F016}
ref_FC                                  12
ref_FC_rank                            1.0
ref_FC_new                        12 (1.0)
Name: 7188306, dtype: object

**5.1 in-depth 사례를 4.3으로 옮기기 위해, iteration 과정 추출 (231018)**

7600135

In [None]:
idx = dict_out["index"][np.where(dict_out["patent_id"]=="7600135")[0][0]]

In [None]:
tech_dataset.data.iloc[used_test_index_TC[idx]]

In [None]:
model = final_model.module if torch.cuda.is_available() else final_model

In [None]:
input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index_TC][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
batch_input = {"class": torch.tensor(input_class), "claim": input_claim}
input_inf = to_device(batch_input, model.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
batch_output = {"text_outputs": torch.tensor(output_class)}
output_inf = to_device(batch_output, model.device)

In [None]:
forward_refs = used_rawdata.loc[used_test_data_TC.iloc[idx]["number"]]["forward_refs"].split(";")
ref_info = total_data.loc[[ref for ref in forward_refs if ref in total_data.index]]
if len(ref_info) == 0:
    pass
else:
    ref_ipcs = ref_info["ipcs"].apply(lambda x: set(x))
    ref_FCs = ref_info["TC"+str(n_TC)]

In [None]:
idx = dict_out["index"][np.where(dict_out["patent_id"]=="7636945")[0][0]]

In [None]:
tech_dataset.data.iloc[used_test_index_TC[idx]]

7636945

In [None]:
model = final_model.module if torch.cuda.is_available() else final_model

In [None]:
input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index_TC][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
batch_input = {"class": torch.tensor(input_class), "claim": input_claim}
input_inf = to_device(batch_input, model.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(tech_dataset.X_class[used_test_index_TC][idx])).unsqueeze(0)
batch_output = {"text_outputs": torch.tensor(output_class)}
output_inf = to_device(batch_output, model.device)

In [None]:
forward_refs = used_rawdata.loc[used_test_data_TC.iloc[idx]["number"]]["forward_refs"].split(";")
ref_info = total_data.loc[[ref for ref in forward_refs if ref in total_data.index]]
if len(ref_info) == 0:
    pass
else:
    ref_ipcs = ref_info["ipcs"].apply(lambda x: set(x))
    ref_FCs = ref_info["TC"+str(n_TC)]

In [None]:
while 1:
    i = 0
    enc_outputs, z, mu, logvar = model.encode(input_inf)
    pred_outputs = model.predict(z)

    dec_outputs = model.decode(z, enc_outputs, dec_inputs=None)
    dec_outputs = dec_outputs.argmax(-1)

    tokenizer = tech_dataset.tokenizers["class_dec"]
    gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]

    if tokenizer.eos_token in gen_text:
        gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:gen_text.index(tokenizer.eos_token)]
    else:
        gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:]
    if gen_text != []:
        gen_text = [gen_text[0]] + list(np.array(gen_text[1:])[np.unique(gen_text[1:], return_index=True)[1]])                
        gen_text = set(gen_text)

    FC_estimated = np.round(np.exp(pred_outputs[0,1].item()), 4)
    if FC_estimated > 0.5: continue

    print(f"Iteration 0, Generated IPC {gen_text}, L1 prob {FC_estimated}")

    inclusions = [None, None, None, None]
    higher_impacts = [None, None, None, None]
    similar_refs_out = [None, None, None, None]
    unsimilar_refs_out = [None, None, None, None]

    optimised = False
    for i in range(1,n_iter+1):
        pred_outputs = model.predict(z)
        z.retain_grad()
        L1_error = (1-torch.exp(pred_outputs[0,1]))
        L1_error.backward(retain_graph=True)
        grad_for_update = (step_size * z.grad)
        z_ = z - grad_for_update

        z.grad.zero_()
        dec_outputs = model.decode(z_, enc_outputs, dec_inputs=None)
        dec_outputs = dec_outputs.argmax(-1)

        pred_outputs_ = model.predict(z_)
        FC_estimated = np.round(np.exp(pred_outputs_[0,1].item()), 4) # estimated forward citations
        FC_estimated_inv = np.round(np.exp(pred_outputs_[0,0].item()), 4)

        tokenizer = tech_dataset.tokenizers["class_dec"]
        gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
        if tokenizer.eos_token in gen_text:
            gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:gen_text.index(tokenizer.eos_token)]
        else:
            gen_text = gen_text[gen_text.index(tokenizer.sos_token)+1:]
        if gen_text != []:
            gen_text = [gen_text[0]] + list(np.array(gen_text[1:])[np.unique(gen_text[1:], return_index=True)[1]])                
            gen_text = set(gen_text)
        else: continue

        print(f"Iteration {i}, Generated IPC {gen_text}, L1 prob {FC_estimated}")

        z = z_

        if FC_estimated>=L1_threshold:
            optimised = True
            
    break

In [None]:
df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion]["ref_FC"].astype(int).describe()

In [None]:
df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion]["ref_FC_rank"].describe()

In [None]:
# ## 인용 IPC 중 생성 IPC와 동일한 게 있으면서, 입력 IPC와 생성 IPC가 다르고, 피인용수가 전체 데이터셋의 L1 기준인 9 이상인 샘플
# df_val[df_val["is_same"]==0][df_val[df_val["is_same"]==0]["ref_FC"]>=L1_criterion]

- Patent_matched 전체

In [None]:
df_val["ref_FC"].astype(float).describe()

In [None]:
df_val["ref_FC_rank"].astype(float).describe()

In [None]:
df_val["ref_FC"].astype(float).hist(bins=100)

In [None]:
df_val["ref_FC_rank"].astype(float).hist(bins=30)

- IPC_original == IPC_identified

In [None]:
df_val[df_val["is_same"]==1]["ref_FC"].astype(int).describe()

In [None]:
df_val[df_val["is_same"]==1]["ref_FC_rank"].astype(float).describe()

In [None]:
df_val[df_val["is_same"]==1]["ref_FC"].astype(float).hist(bins=100)

In [None]:
df_val[df_val["is_same"]==1]["ref_FC_rank"].astype(float).hist(bins=30)

- IPC_original != IPC_identified

In [None]:
df_val[df_val["is_same"]==0]["ref_FC"].astype(int).describe()

In [None]:
df_val[df_val["is_same"]==0]["ref_FC_rank"].astype(float).describe()

In [None]:
df_val[df_val["is_same"]==0]["ref_FC"].astype(float).hist(bins=30)

In [None]:
df_val[df_val["is_same"]==0]["ref_FC_rank"].astype(float).hist(bins=30)

In [None]:
df_val_adv = df_val[df_val.apply(lambda x: True if x["org_FC"]<=x["ref_FC"] else False, axis=1)]

In [None]:
L1_criterion = tech_dataset.data["TC5"].quantile(0.9)
print("total hit:", len(df_val_adv))
print("same:",len(df_val_adv[df_val_adv["is_same"]==1]))
print("diff:",len(df_val_adv[df_val_adv["is_same"]==0]))
print("over L1 criterion:", len(df_val_adv[df_val_adv["is_same"]==0][df_val_adv[df_val_adv["is_same"]==0]["ref_FC"]>=L1_criterion]))
print("ratio:",len(df_val_adv[df_val_adv["is_same"]==0][df_val_adv[df_val_adv["is_same"]==0]["ref_FC"]>=L1_criterion]) / len(df_val_adv[df_val_adv["is_same"]==0]))

In [None]:
## 인용 IPC 중 생성 IPC와 동일한 게 있으면서, 입력 IPC와 생성 IPC가 다르고, 피인용수가 전체 데이터셋의 L1 기준인 9 이상이며, 인용 특허의 피인용 수가 입력 특허보다 많은 샘플
df_val_adv[df_val_adv["is_same"]==0][df_val_adv[df_val_adv["is_same"]==0]["ref_FC"]>=L1_criterion]

In [None]:
analysis_config

In [None]:
if not os.path.exists("../results/validation/"+analysis_config):
    os.mkdir("../results/validation/"+analysis_config)
with open("../results/validation/"+analysis_config+"/dict_out.pickle", "wb") as f:
    pickle.dump(dict_out, f)
with open("../results/validation/"+analysis_config+"/df_val.pickle", "wb") as f:
    pickle.dump(df_val, f)

In [None]:
from scipy.stats import ttest_ind
whole_FC_ttest = {"statistic": [], "pvalue": []}
whole_FCs_diff = []
for i in tqdm(range(len(dict_out["patent_id"]))):
    org_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["org_text"][i])].index]["TC5"]
    gen_whole_FC = tech_dataset.data.loc[whole_patent_classes[whole_patent_classes==set(dict_out["gen_text"][i])].index]["TC5"]
    
    if len(org_whole_FC)>0 and len(gen_whole_FC)>0:
        whole_FC_diff = gen_whole_FC.mean() - org_whole_FC.mean()
    elif len(org_whole_FC)==0 and len(gen_whole_FC)>0:
        whole_FC_diff = gen_whole_FC.mean()
    elif len(org_whole_FC)>0 and len(gen_whole_FC)==0:
        whole_FC_diff = org_whole_FC.mean()
    else:
        whole_FC_diff = 0.0
    
    ttest_res = ttest_ind(gen_whole_FC, org_whole_FC, equal_var=False)
    
    if set(dict_out["org_text"][i]) != set(dict_out["gen_text"][i]):    
        whole_FC_ttest["statistic"].append(ttest_res.statistic)
        whole_FC_ttest["pvalue"].append(ttest_res.pvalue)
        if set(dict_out["org_text"][i]) != set(dict_out["gen_text"][i]):
            whole_FCs_diff.append(whole_FC_diff)

In [None]:
pd.Series(whole_FCs_diff).describe()

In [None]:
pd.Series(whole_FC_ttest["statistic"])[~pd.Series(whole_FC_ttest["statistic"]).isna()].describe()

In [None]:
pd.Series(whole_FC_ttest["statistic"]).loc[pd.Series(whole_FC_ttest["pvalue"]).dropna()[pd.Series(whole_FC_ttest["pvalue"]).dropna()<0.05].index].describe()