In [2]:
# Project roots: root_dir is the base for the configs/models/results folders used below,
# master_dir is the base for the data folder.
# NOTE(review): absolute, user-specific paths — the notebook only runs where they exist.
root_dir = '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'
master_dir = '/home2/glee/dissertation/1_tech_gen_impact/master/Tech_Gen/'
import sys
# Put root_dir on the import path (presumably where data/models/train_utils/utils live — confirm).
sys.path.append(root_dir)

import copy
import gc
import os
import argparse
import math
import time
import pickle
import re
import multiprocess as mp
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
sys.path.append("/share/tml_package")
from tml import utils
from scipy import io
from tqdm import tqdm
from collections import OrderedDict

import torch
from torch.nn import functional as F
from torch.nn import DataParallel as DP
from torch.utils.data import TensorDataset, DataLoader, Subset, Dataset
from accelerate import Accelerator
import pytorch_model_summary

import optuna
from optuna.samplers import RandomSampler, TPESampler
from optuna.integration import SkoptSampler

import numpy as np
import pandas as pd
import scipy.stats
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from data import TechDataset, CVSampler
from models import Transformer, Predictor
from train_utils import EarlyStopping, perf_eval, objective_cv, build_model, train_model, validate_model_mp
from utils import token2class, DotDict, to_device

from cleantext.sklearn import CleanTransformer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Process-wide environment switches.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): presumably forces synchronous CUDA kernel launches for debugging, which
# slows GPU work — confirm it is still wanted for inference.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# PART 1: Configuration

In [6]:
# Show the target IPC setting currently stored in the data configuration.
# NOTE(review): `configs` is only created by the configuration cell further down, so this
# cell fails when the notebook is run top-to-bottom on a fresh kernel.
configs.data.target_ipc

['G06F']

In [19]:
# Tag of the run to inspect: it selects the saved "[CONFIGS]<tag>.json" file here and the
# "[DATASET]<tag>.xlsx" workbook read later in the notebook.
# analysis_date = "2023-04-22_15:39"
analysis_date = "2023-05-09_0331"

# Stand-in for the command-line arguments of the training script.
args = argparse.Namespace(
    do_eval=True,
    do_save=False,
    config_file=os.path.join(root_dir, "configs", "USED_configs", f"[CONFIGS]{analysis_date}.json"),
    eval_train_set=False,
)

# args = argparse.Namespace(
#     data_type="class",
#     data_file=None,
#     target_ipc=None,
#     pred_type="classification",
#     n_TC = 5,
#     use_pretrained_tokenizer=False,
#     do_train=None,
#     do_tune=None,
#     n_folds=None,
#     batch_size=512,
#     max_epochs=20,
#     use_accelerator=None,
#     do_save=False,
#     n_gpus=4,
#     light=True,
# #     config_file=os.path.join(root_dir, "configs", "USED_configs", "[CONFIGS]"+analysis_date+".json"),
#     config_file=None,
#     eval_train_set=False)

# Directory layout: data comes from the master checkout, everything else from root_dir.
data_dir = os.path.join(master_dir, "data")
model_dir = os.path.join(root_dir, "models")
result_dir = os.path.join(root_dir, "results")
config_dir = os.path.join(root_dir, "configs")

# args = parser.parse_args()
# Two sources for the configuration:
#   * an explicit config file (args.config_file) is loaded as-is;
#   * otherwise a default file is loaded and overridden with every non-None entry of `args`.
use_saved_config = args.config_file is not None
if use_saved_config:
    config_file = args.config_file
else:
    # `light` is optional on `args` (the Namespace built in this notebook does not define
    # it); fall back to the full configuration instead of raising AttributeError.
    default_config_name = "configs_light.json" if getattr(args, "light", False) else "configs.json"
    config_file = os.path.join(config_dir, default_config_name)

configs = DotDict().load(config_file)
# Snapshot of the keys present in the file, per section, taken before any key is added below.
org_config_keys = {key: list(configs[key].keys()) for key in configs.keys()}

if not use_saved_config:
    instant_configs = {key: value for (key, value) in vars(args).items() if value is not None} # if any argument passed when main.py executed
    # Route each override to the config section(s) that already contain a key of that name.
    instant_configs_for_update = {configkey: {key: value for (key,value) in instant_configs.items() if key in org_config_keys[configkey]} for configkey in org_config_keys.keys()}
    for key, value in configs.items():
        value.update(instant_configs_for_update[key])

# Normalise the target IPC. The pattern accepts text that starts with an upper-case letter
# which is not immediately followed by a non-digit character.
# NOTE(review): the match runs on str(target_ipc); a list value such as ['G06F'] stringifies
# to "['G06F']", fails the match and is replaced by "ALL" — confirm this is intended.
regex_ipc = re.compile('[A-Z](?![\\D])')
if regex_ipc.match(str(configs.data.target_ipc)) is None:
    configs.data.update({"target_ipc": "ALL"})
elif len(configs.data.target_ipc) > 5:
    # Values longer than five items are cut down to their first four.
    configs.data.update({"target_ipc": configs.data.target_ipc[:4]})

# Loss weighting per model type: the two-part models use fixed weights, the full
# encoder-predictor-decoder model rescales the configured reconstruction weight so that
# the "recon" and "y" weights add up to one.
_fixed_loss_weights = {
    "enc-pred": {"recon": 0, "y": 1},
    "enc-dec": {"recon": 1, "y": 0},
}
if configs.model.model_type == "enc-pred-dec":
    recon_share = configs.train.loss_weights["recon"] / sum(configs.train.loss_weights.values())
    configs.train.loss_weights["recon"] = recon_share
    configs.train.loss_weights["y"] = 1 - configs.train.loss_weights["recon"]
elif configs.model.model_type in _fixed_loss_weights:
    configs.train.loss_weights = _fixed_loss_weights[configs.model.model_type]

def _gpu_memory_in_use(gpu_index):
    """Add up the per-process figures (the numbers printed before "MB") that
    torch.cuda.list_gpu_processes reports for one GPU."""
    report = torch.cuda.list_gpu_processes(gpu_index)
    per_process = []
    for chunk in report.split("GPU memory"):
        if chunk == "" or "no processes are running" in chunk:
            continue
        per_process.append(float(chunk.split("uses")[-1].replace(" ", "").replace("MB", "")))
    return np.sum(per_process)


# Pick the compute device(s): delegate to Accelerate when requested, otherwise take the
# configs.train.n_gpus GPUs with the lowest reported usage, falling back to the CPU.
if configs.train.use_accelerator:
    accelerator = Accelerator()
    device_ids = list(range(torch.cuda.device_count()))
    device = accelerator.device
    configs.train.update({"accelerator": accelerator})
elif torch.cuda.is_available():
    device_ids = list(range(torch.cuda.device_count()))
    gpu_usages = [_gpu_memory_in_use(gpu_index) for gpu_index in device_ids]
    device_ids = np.argsort(gpu_usages)[:configs.train.n_gpus]
    device_ids = [torch.device('cuda', gpu_index) for gpu_index in list(device_ids)]
    device = device_ids[0] # main device
    torch.cuda.set_device(device)
else:
    device = torch.device('cpu')
    device_ids = []

# Push the derived runtime values (paths, devices, convenience flags) into each section.
configs.data.update(dict(
    root_dir=root_dir,
    data_dir=data_dir,
    model_dir=model_dir,
    result_dir=result_dir,
    pretrained_enc=configs.model.pretrained_enc,
    pretrained_dec=configs.model.pretrained_dec,
    data_nrows=None,
))
configs.train.update(dict(
    device=device,
    device_ids=device_ids,
    root_dir=root_dir,
    data_dir=data_dir,
    model_dir=model_dir,
    use_keywords=configs.data.use_keywords,
    # early-stopping patience is 30% of the epoch budget, truncated to an int
    early_stop_patience=int(0.3*configs.train.max_epochs),
))
configs.model.update(dict(
    device=device,
    device_ids=device_ids,
    n_directions=2 if configs.model.bidirec else 1,
    use_accelerator=configs.train.use_accelerator,
))

## Set hyperparameters for model training (To be TUNED)
if configs.train.do_train and configs.train.do_tune:
    # Tuning run: blank out the tunable settings and use a placeholder name.
    configs.model.n_layers = None
    configs.model.d_embedding = None
    configs.model.d_enc_hidden = None
    configs.model.d_pred_hidden = None
    configs.train.learning_rate = None
    configs.train.batch_size = None
    n_layers = d_embedding = d_enc_hidden = d_pred_hidden = None
    learning_rate = batch_size = None
    config_name = "HPARAM_TUNING"
    final_model_path = None
else:
    n_layers, d_embedding = configs.model.n_layers, configs.model.d_embedding
    d_enc_hidden, d_pred_hidden = configs.model.d_enc_hidden, configs.model.d_pred_hidden
    d_latent = configs.model.d_latent

    # The checkpoint file name is assembled from these settings as "[<value><name>]" tags.
    key_components = {
        "data": ["target_ipc", "vocab_size"],
        "model": ["n_layers", "d_enc_hidden", "d_pred_hidden", "d_latent", "d_embedding", "d_ff", "n_head", "d_head"],
        "train": ["learning_rate", "batch_size", "max_epochs"],
    }
    name_parts = []
    for key, components in key_components.items():
        for component in components:
            name_parts.append("[" + str(configs[key][component]) + component + "]")
    config_name = "".join(name_parts)
    final_model_path = os.path.join(model_dir, "[Final_model]" + config_name + ".ckpt")

configs.train.update(dict(config_name=config_name, final_model_path=final_model_path))

In [21]:
# Inspect the data section after the path entries and data_nrows were added above.
configs.data

DotDict({'data_type': 'class+claim', 'pred_type': 'classification', 'target_period': [2007, 2012], 'target_ipc': 'ALL', 'target_keywords': '', 'pred_target': 'citation', 'ipc_level': 3, 'claim_level': 1, 'class_level': 3, 'n_TC': 5, 'use_keywords': True, 'max_seq_len_class': 30, 'max_seq_len_claim': 200, 'vocab_size': 1000, 'use_pretrained_tokenizer': False, 'root_dir': '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/', 'data_dir': '/home2/glee/dissertation/1_tech_gen_impact/master/Tech_Gen/data', 'model_dir': '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/models', 'result_dir': '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/results', 'pretrained_enc': False, 'pretrained_dec': False, 'data_nrows': None})

# PART 2: Dataset setting

In [22]:
# Inspect the per-section key lists captured right after the config file was loaded.
org_config_keys

{'data': ['data_type',
  'pred_type',
  'target_period',
  'target_ipc',
  'target_keywords',
  'pred_target',
  'ipc_level',
  'claim_level',
  'class_level',
  'n_TC',
  'use_keywords',
  'max_seq_len_class',
  'max_seq_len_claim',
  'vocab_size',
  'use_pretrained_tokenizer'],
 'train': ['do_train',
  'do_tune',
  'use_accelerator',
  'n_folds',
  'n_trials',
  'learning_rate',
  'batch_size',
  'max_epochs',
  'n_gpus',
  'use_early_stopping',
  'weight_decay',
  'adam_epsilon',
  'warmup_steps',
  'loss_weights',
  'alternate_train',
  'teach_force_ratio',
  'max_epochs_for_tune',
  'early_stop_patience_for_tune',
  'mem_verbose'],
 'model': ['is_pretrained',
  'pretrained_enc',
  'pretrained_dec',
  'model_type',
  'model_name',
  'n_enc_vocab',
  'n_dec_vocab',
  'n_enc_seq_class',
  'n_dec_seq_class',
  'n_enc_seq_claim',
  'n_dec_seq_claim',
  'n_layers',
  'd_hidden',
  'd_enc_hidden_pretrained',
  'd_enc_hidden',
  'd_dec_hidden',
  'd_pred_hidden',
  'd_embedding',
  'd_lat

In [12]:
tstart = time.time()

# The cache file name is built from the data settings that were present in the config file,
# leaving out the three keys below. Filtering (instead of list.pop(list.index(...))) keeps
# this working when one of those keys is absent from the loaded config.
keys_left_out_of_name = {"data_file", "max_seq_len_claim", "max_seq_len_class"}
org_config_keys_temp = [key for key in org_config_keys["data"] if key not in keys_left_out_of_name]
dataset_config_name = "-".join([str(key)+"="+str(value) for (key,value) in configs.data.items() if key in org_config_keys_temp])
dataset_path = os.path.join(data_dir, "pickled_dataset", "[tech_dataset]"+dataset_config_name+".pickle")

if os.path.exists(dataset_path) and args.do_save is False:
    print("Load pickled dataset...")
    # NOTE: pickle.load can execute arbitrary code — only load cache files written by this project.
    with open(dataset_path, "rb") as f:
        tech_dataset = pickle.load(f)   # Load pickled dataset if dataset with same configuration already saved
    # Rebuild the tokenizers when the cached dataset was created with other pretrained settings.
    if tech_dataset.pretrained_enc != configs.data.pretrained_enc or tech_dataset.pretrained_dec != configs.data.pretrained_dec:
        tech_dataset.pretrained_enc = configs.data.pretrained_enc
        tech_dataset.pretrained_dec = configs.data.pretrained_dec
        tech_dataset.tokenizers = tech_dataset.get_tokenizers()
    # Make sure every tokenizer exposes a `vocab_size` attribute.
    for tk in tech_dataset.tokenizers.values():
        if "vocab_size" not in dir(tk):
            tk.vocab_size = tk.get_vocab_size()
    print("Pickled dataset loaded")
else:
    print("Make dataset...")
    tech_dataset = TechDataset(configs.data)
    # `debug` is optional on `args`; treat a missing flag as "not debugging".
    if not getattr(args, "debug", False):
        os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
        # The raw data is left out of the pickle: detach it while dumping and always put it
        # back, even when the dump fails. Keeping a reference avoids copying the raw data.
        rawdata_for_save = tech_dataset.rawdata
        tech_dataset.rawdata = None
        try:
            with open(dataset_path, "wb") as f:
                pickle.dump(tech_dataset, f)
        finally:
            tech_dataset.rawdata = rawdata_for_save
tend = time.time()
print(f"{np.round(tend-tstart,4)} sec elapsed for loading patents for class [{configs.data.target_ipc}]")

Make dataset...


AttributeError: data_file

In [4]:
# Feed the dataset-derived sizes and tokenizers into the model configuration.
configs.model.update(dict(
    tokenizers=tech_dataset.tokenizers,
    # encoder and decoder share the dataset's maximum sequence lengths
    n_enc_seq_claim=tech_dataset.max_seq_len_claim,
    n_dec_seq_claim=tech_dataset.max_seq_len_claim,
    n_enc_seq_class=tech_dataset.max_seq_len_class,
    n_dec_seq_class=tech_dataset.max_seq_len_class,
    # a single output for regression, otherwise the dataset's n_outputs
    n_outputs=1 if configs.data.pred_type=="regression" else tech_dataset.n_outputs,
    i_padding=tech_dataset.tokenizers["class_enc"].pad_id,
))

## Inference

In [17]:
# Show the checkpoint path derived from the configuration (None for a tuning run).
final_model_path

'/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/models/[Final_model][ALLtarget_ipc][1000vocab_size][4n_layers][16d_enc_hidden][8d_pred_hidden][32d_latent][128d_embedding][16d_ff][4n_head][16d_head][0.0005learning_rate][512batch_size][20max_epochs].ckpt'

In [18]:
# Build the network and restore the trained weights from the final checkpoint.
final_model = build_model(configs.model, tokenizers=tech_dataset.tokenizers)
if final_model_path is None or not os.path.exists(final_model_path):
    # final_model_path is None for a tuning run; treat that like a missing checkpoint.
    raise FileNotFoundError("Model need to be trained first")

# Load onto the CPU first so the checkpoint does not depend on the GPU it was saved from;
# load_state_dict then copies the values onto the model's own device.
best_states = torch.load(final_model_path, map_location="cpu")

# Align the checkpoint keys with the "module."-prefixed names that are loaded below
# (presumably because build_model returns a wrapped model — later cells use final_model.module).
converted_states = OrderedDict()
for k, v in best_states.items():
    if 'module' not in k:
        k = 'module.'+k
    else:
        k = k.replace('features.module.', 'module.features.')
    converted_states[k] = v
final_model.load_state_dict(converted_states)
# NOTE(review): the model is not switched to eval() here; confirm whether dropout or
# sampling behaviour should be disabled for the inference cells below.

del best_states
del converted_states
torch.cuda.empty_cache()
print("Model successfully loaded")

NameError: name 'tech_dataset' is not defined

In [None]:
# Copy predictor
# A stand-alone Predictor gets the trained predictor's weights via a temporary checkpoint.
temp_path = os.path.join(model_dir, "temp", "temp.ckpt")
# torch.save does not create missing folders; make sure the temp directory exists.
os.makedirs(os.path.dirname(temp_path), exist_ok=True)
predictor = Predictor(final_model.module.config).to(final_model.module.device)
torch.save(final_model.module.predictor.state_dict(), temp_path)
predictor.load_state_dict(torch.load(temp_path, map_location=final_model.module.device))

In [8]:
# Recover the train/test split of the analysed run from its "[DATASET]<tag>.xlsx" workbook
# and translate the patent numbers into positions within tech_dataset.data.
result_path = os.path.join(root_dir, "results")

split_workbook = os.path.join(result_path, "[DATASET]"+analysis_date+".xlsx")
# One read_excel call with a list of sheet names returns {sheet_name: DataFrame}, so the
# workbook is parsed once instead of twice.
split_sheets = pd.read_excel(split_workbook, sheet_name=["TRAIN_dataset", "TEST_dataset"])
used_train_data = split_sheets["TRAIN_dataset"]
used_test_data = split_sheets["TEST_dataset"]
used_train_index = tech_dataset.data.index.get_indexer(pd.Index(used_train_data["number"]))
used_test_index = tech_dataset.data.index.get_indexer(pd.Index(used_test_data["number"]))

# get_indexer marks unknown labels with -1, which would silently select the LAST row when
# used as a position. Report it loudly (RuntimeWarning is not silenced by the filters
# installed at the top of the notebook).
for split_name, split_index in (("TRAIN", used_train_index), ("TEST", used_test_index)):
    n_missing = int((split_index == -1).sum())
    if n_missing:
        warnings.warn(
            f"{n_missing} patent number(s) of the {split_name} split are not in tech_dataset.data; "
            "their position is -1",
            RuntimeWarning,
        )

In [9]:
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

In [11]:
# Mini-batch size for the latent-extraction loader built below; a literal that is
# independent of configs.train.batch_size.
batch_size = 16

In [12]:
# Restrict the dataset to the samples of the analysed run's training split and batch them.
# drop_last=True discards a final incomplete batch, so up to batch_size-1 samples are skipped.
used_train_dataset = Subset(dataset=tech_dataset, indices=used_train_index)
train_loader = DataLoader(
    dataset=used_train_dataset,
    batch_size=batch_size,
    drop_last=True,
)

In [24]:
# Encode every training batch and collect latent vectors, targets and predicted classes.
zs, ys, preds = [], [], []
newzs = []  # not used by the visible cells; kept so that the name stays defined

# Pure inference: no_grad avoids building an autograd graph for every batch, so there is
# nothing to release between iterations.
with torch.no_grad():
    for batch_data in tqdm(train_loader):
        batch_data = to_device(batch_data, final_model.module.device)
        y = batch_data["targets"].cpu().detach().numpy()

        enc_outputs, z, mu, logvar = final_model.module.encode(batch_data["text_inputs"])
        pred_outputs = final_model.module.predictor(z)

        zs.append(z.cpu().detach().numpy())
        ys.append(y)
        preds.append(pred_outputs.argmax(1).cpu().detach().numpy())

torch.cuda.empty_cache()

zs = np.concatenate(zs)
ys = np.concatenate(ys)
preds = np.concatenate(preds)

100%|██████████| 2542/2542 [01:13<00:00, 34.46it/s]


In [76]:
## TRAIN set
# Build a single-sample (batch of one) model input from the idx-th training patent.
idx = 99

sample_class = tech_dataset.X_class[used_train_index][idx]

input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(sample_class)).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_train_index][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
# clone().detach() is the supported way to copy an existing tensor; wrapping a tensor in
# torch.tensor(...) makes the same copy but emits a warning.
batch_input = {"class": input_class.clone().detach(), "claim": input_claim}
input_inf = to_device(batch_input, final_model.module.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(sample_class)).unsqueeze(0)
batch_output = {"text_outputs": output_class.clone().detach()}
output_inf = to_device(batch_output, final_model.module.device)

In [138]:
## TEST set
# Build a single-sample (batch of one) model input from the idx-th test patent.
idx = 11

sample_class = tech_dataset.X_class[used_test_index][idx]

input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(sample_class)).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
# clone().detach() is the supported way to copy an existing tensor; wrapping a tensor in
# torch.tensor(...) makes the same copy but emits a warning.
batch_input = {"class": input_class.clone().detach(), "claim": input_claim}
input_inf = to_device(batch_input, final_model.module.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(sample_class)).unsqueeze(0)
batch_output = {"text_outputs": output_class.clone().detach()}
output_inf = to_device(batch_output, final_model.module.device)

In [117]:
# Switch for the t-SNE scatter plots drawn in the cells below.
visualize = False

In [139]:
# Background points for the plot: the 2500 collected latent vectors with the smallest
# summed absolute distance to the mean latent vector ...
near_mean_idx = np.argsort(np.sum(abs(zs - np.mean(zs, axis=0)), axis=1))[:2500]
# ... plus 50 randomly drawn samples whose target is 1.
# NOTE(review): no random seed is set in the visible cells, so this draw differs per run.
near_mean_idx_ = np.union1d(near_mean_idx, np.random.choice(np.where(ys==1)[0], 50))
# Encode the selected sample; keep its flattened latent vector and predicted class as the
# "origin" reference.
enc_outputs, z, mu, logvar = final_model.module.encode(input_inf)
org_z = copy.deepcopy(z.view(1,-1).cpu().detach().numpy())
pred_outputs = final_model.module.predict(z)
org_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())
# Not referenced again in this cell; the decode calls pass dec_inputs=None literally.
dec_inputs = None

if visualize:
    # Project the background points together with the origin (appended as the last row) to 2-D.
    zs_for_tsne = np.concatenate([zs[near_mean_idx_], org_z])
    ys_for_tsne = np.concatenate([ys[near_mean_idx_], org_y])
    # NOTE(review): `n_iter` and `square_distances` depend on the installed scikit-learn
    # version — confirm they are still accepted.
    tsne = TSNE(early_exaggeration=10, learning_rate="auto", n_iter=500, init="random", verbose=0, metric="cosine", square_distances=True)
    z_tsne = tsne.fit_transform(zs_for_tsne)
    # Background points are coloured by target; the origin is drawn as a black "X".
    plt.scatter(z_tsne[:-1,0], z_tsne[:-1,1], c=ys_for_tsne[:-1], cmap="bwr")
    plt.scatter(z_tsne[-1,0], z_tsne[-1,1], c="k", marker="X")
    plt.text(z_tsne[-1,0]+0.5, z_tsne[-1,1]+0.5, "origin", weight="bold")
    plt.show()

# Decode the original class sequence and the sequence generated from the unmodified latent
# vector, printing only the tokens between the start and end markers.
tokenizer = tech_dataset.tokenizers["class_dec"]
sos_token, eos_token = tokenizer.sos_token, tokenizer.eos_token

org_text = tokenizer.decode_batch(input_class.cpu().detach().numpy())[0]
# A sequence without a start/end marker is kept from its beginning / up to its end instead
# of raising ValueError from .index().
org_start = org_text.index(sos_token) + 1 if sos_token in org_text else 0
org_stop = org_text.index(eos_token) if eos_token in org_text else len(org_text)
org_text = org_text[org_start:org_stop]
print("Original class:\n",org_text,"\n")

dec_outputs = final_model.module.decode(z, enc_outputs, dec_inputs=None)
dec_outputs = dec_outputs.argmax(-1)

gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
# The decoder may never emit an end marker; handle that the same way as above.
gen_start = gen_text.index(sos_token) + 1 if sos_token in gen_text else 0
gen_stop = gen_text.index(eos_token) if eos_token in gen_text else len(gen_text)
gen_text = gen_text[gen_start:gen_stop]
print("Generated class:\n", gen_text,"\n")

Original class:
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Generated class:
 ['H04N9/00', 'H01L25/00', 'H01L27/00', 'H04N3/00', 'H04N5/00'] 



In [140]:
# Gradient ascent in latent space: repeatedly move z in the direction that increases the
# predictor output for class 1 and decode the moved latent vector into a class sequence.
n_iter = 20
step_size = 20

tokenizer = tech_dataset.tokenizers["class_dec"]
sos_token, eos_token = tokenizer.sos_token, tokenizer.eos_token

for i in range(n_iter):
    print(f"Iteration {i}")
    # Start every step from a detached copy of z. The gradient of the prediction w.r.t. the
    # current z does not depend on how z was produced, so this gives the same update while
    # keeping the graph limited to one predictor pass (no back-propagation through earlier
    # steps or the encoder, and no gradient accumulation in the model parameters).
    z = z.detach().requires_grad_(True)
    pred_outputs = final_model.module.predict(z)
    FC_estimated = pred_outputs[0,1] # estimated forward citations
    FC_estimated_inv = pred_outputs[0,0].item()
    # The outputs are exponentiated before printing (presumably log-probabilities — confirm).
    print(f"Estimated prob. for L1 forward citations (Iter {i}): {np.round(np.exp(FC_estimated.item()), 4)} ({np.round(np.exp(FC_estimated_inv),4)})")

    (z_grad,) = torch.autograd.grad(FC_estimated, z)
    grad_for_update = step_size * z_grad
    z_ = (z + grad_for_update).detach()

    if visualize:
        # `tsne` is the estimator created by the previous cell when visualize is True.
        curr_z = copy.deepcopy(z_.view(1,-1).cpu().detach().numpy())
        curr_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())

        zs_for_tsne = np.concatenate([zs[near_mean_idx_], org_z, curr_z])
        ys_for_tsne = np.concatenate([ys[near_mean_idx_], org_y, curr_y])

        z_tsne = tsne.fit_transform(zs_for_tsne)
        plt.scatter(z_tsne[:-1,0], z_tsne[:-1,1], c=ys_for_tsne[:-1], cmap="bwr")
        plt.scatter(z_tsne[-2,0], z_tsne[-2,1], c="k", marker="X")
        plt.text(z_tsne[-2,0]+0.5, z_tsne[-2,1]+0.5, "origin", weight="bold")
        plt.scatter(z_tsne[-1,0], z_tsne[-1,1], c="c", marker="x")
        plt.text(z_tsne[-1,0]-4, z_tsne[-1,1]-2.5, f"Iter_{i}", c="c")
        plt.show()

    # Decoding is inference only.
    with torch.no_grad():
        dec_outputs = final_model.module.decode(z_, enc_outputs, dec_inputs=None)
    dec_outputs = dec_outputs.argmax(-1)

    gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
    # The decoder may never emit an end marker; fall back to the sequence bounds instead of
    # raising ValueError from .index().
    gen_start = gen_text.index(sos_token) + 1 if sos_token in gen_text else 0
    gen_stop = gen_text.index(eos_token) if eos_token in gen_text else len(gen_text)
    gen_text = gen_text[gen_start:gen_stop]
    print("Original class:\n",org_text,"\n")
    print(f"Generated class (Iter {i}):\n", gen_text,"\n")

    z = z_

Iteration 0
Estimated prob. for L1 forward citations (Iter 0): 0.4325 (0.5675)
Original class:
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Generated class (Iter 0):
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Iteration 1
Estimated prob. for L1 forward citations (Iter 1): 0.5071 (0.4929)
Original class:
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Generated class (Iter 1):
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Iteration 2
Estimated prob. for L1 forward citations (Iter 2): 0.5283 (0.4717)
Original class:
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Generated class (Iter 2):
 ['H04N3/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Iteration 3
Estimated prob. for L1 forward citations (Iter 3): 0.8297 (0.1703)
Original class:
 ['H04N9/00', 'H01L27/00', 'H04N3/00', 'H04N5/00', 'H04N9/00'] 

Generated class (Iter 3):
 ['H04N3/00', 'G06T3/00', 'H01L27/00', 'H04N3/00', 'H04N5/00'

In [263]:
# End-to-end latent search for one TEST-set patent: build the input, encode it, show the
# original and reconstructed class codes, then run gradient ascent on the latent vector and
# print the decoded class codes after every step.
idx = 63


def _strip_special_tokens(tokens, sos_token, eos_token):
    """Return the tokens between the start and end markers.

    A missing start marker keeps the sequence from its beginning and a missing end marker
    keeps it up to its end, instead of raising ValueError from .index().
    """
    start = tokens.index(sos_token) + 1 if sos_token in tokens else 0
    stop = tokens.index(eos_token) if eos_token in tokens else len(tokens)
    return tokens[start:stop]


def _dedupe_after_first(tokens):
    """Keep the first token, followed by the distinct remaining tokens in sorted order
    (np.unique sorts its result). An empty sequence stays empty."""
    if len(tokens) == 0:
        return []
    tail = tokens[1:]
    return [tokens[0]] + list(np.array(tail)[np.unique(tail, return_index=True)[1]])


def _to_main_group(tokens):
    """Drop everything from "/" on and pad a one-character remainder after the first four
    characters with a zero, e.g. 'H04N9/00' becomes 'H04N09'."""
    main_groups = [t.split("/")[0] for t in tokens]
    return [t[:4]+"0"+t[4:] if (len(t[4:])<2) else t for t in main_groups]


sample_class = tech_dataset.X_class[used_test_index][idx]

input_class = torch.tensor(tech_dataset.tokenizers["class_enc"].encode(sample_class)).unsqueeze(0)
input_claim = tech_dataset.tokenize(tech_dataset.tokenizers["claim_enc"], tech_dataset.X_claim[used_test_index][idx])
input_claim = {k: v.unsqueeze(0) for k, v in input_claim.items()}
# clone().detach() is the supported way to copy an existing tensor; wrapping a tensor in
# torch.tensor(...) makes the same copy but emits a warning.
batch_input = {"class": input_class.clone().detach(), "claim": input_claim}
input_inf = to_device(batch_input, final_model.module.device)

output_class = torch.tensor(tech_dataset.tokenizers["class_dec"].encode(sample_class)).unsqueeze(0)
batch_output = {"text_outputs": output_class.clone().detach()}
output_inf = to_device(batch_output, final_model.module.device)

# Background points for the plots: the 2500 latent vectors closest (summed absolute
# distance) to the mean latent vector plus 50 randomly drawn samples with target 1.
near_mean_idx = np.argsort(np.sum(abs(zs - np.mean(zs, axis=0)), axis=1))[:2500]
near_mean_idx_ = np.union1d(near_mean_idx, np.random.choice(np.where(ys==1)[0], 50))
enc_outputs, z, mu, logvar = final_model.module.encode(input_inf)
org_z = copy.deepcopy(z.view(1,-1).cpu().detach().numpy())
pred_outputs = final_model.module.predict(z)
org_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())
dec_inputs = None

if visualize:
    zs_for_tsne = np.concatenate([zs[near_mean_idx_], org_z])
    ys_for_tsne = np.concatenate([ys[near_mean_idx_], org_y])
    # NOTE(review): `n_iter` and `square_distances` depend on the installed scikit-learn
    # version — confirm they are still accepted.
    tsne = TSNE(early_exaggeration=10, learning_rate="auto", n_iter=500, init="random", verbose=0, metric="cosine", square_distances=True)
    z_tsne = tsne.fit_transform(zs_for_tsne)
    plt.scatter(z_tsne[:-1,0], z_tsne[:-1,1], c=ys_for_tsne[:-1], cmap="bwr")
    plt.scatter(z_tsne[-1,0], z_tsne[-1,1], c="k", marker="X")
    plt.text(z_tsne[-1,0]+0.5, z_tsne[-1,1]+0.5, "origin", weight="bold")
    plt.show()

tokenizer = tech_dataset.tokenizers["class_dec"]

print("<< Iteration 0 >>")

org_text = tokenizer.decode_batch(input_class.cpu().detach().numpy())[0]
org_text = _to_main_group(_strip_special_tokens(org_text, tokenizer.sos_token, tokenizer.eos_token))

dec_outputs = final_model.module.decode(z, enc_outputs, dec_inputs=None)
dec_outputs = dec_outputs.argmax(-1)

gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
gen_text = _strip_special_tokens(gen_text, tokenizer.sos_token, tokenizer.eos_token)
# Duplicates are removed on the full codes, i.e. before they are shortened to main groups.
gen_text = _to_main_group(_dedupe_after_first(gen_text))
print("Original class:\n",org_text,"\nGenerated class:\n", gen_text,"\n")

n_iter = 20
step_size = 20

for i in range(n_iter):
    print(f"<< Iteration {i+1} >>")
    # Start every step from a detached copy of z. The gradient of the prediction w.r.t. the
    # current z does not depend on how z was produced, so this gives the same update while
    # keeping the graph limited to one predictor pass (no back-propagation through earlier
    # steps or the encoder, and no gradient accumulation in the model parameters).
    z = z.detach().requires_grad_(True)
    pred_outputs = final_model.module.predict(z)
    FC_estimated = pred_outputs[0,1] # estimated forward citations
    FC_estimated_inv = pred_outputs[0,0].item()
    # The outputs are exponentiated before printing (presumably log-probabilities — confirm).
    print(f"Estimated prob. for L1 (L2): {np.round(np.exp(FC_estimated.item()), 4)} ({np.round(np.exp(FC_estimated_inv),4)})")

    (z_grad,) = torch.autograd.grad(FC_estimated, z)
    grad_for_update = step_size * z_grad
    z_ = (z + grad_for_update).detach()

    if visualize:
        curr_z = copy.deepcopy(z_.view(1,-1).cpu().detach().numpy())
        curr_y = copy.deepcopy(pred_outputs.argmax(1).cpu().detach().numpy())

        zs_for_tsne = np.concatenate([zs[near_mean_idx_], org_z, curr_z])
        ys_for_tsne = np.concatenate([ys[near_mean_idx_], org_y, curr_y])

        z_tsne = tsne.fit_transform(zs_for_tsne)
        plt.scatter(z_tsne[:-1,0], z_tsne[:-1,1], c=ys_for_tsne[:-1], cmap="bwr")
        plt.scatter(z_tsne[-2,0], z_tsne[-2,1], c="k", marker="X")
        plt.text(z_tsne[-2,0]+0.5, z_tsne[-2,1]+0.5, "origin", weight="bold")
        plt.scatter(z_tsne[-1,0], z_tsne[-1,1], c="c", marker="x")
        plt.text(z_tsne[-1,0]-4, z_tsne[-1,1]-2.5, f"Iter_{i}", c="c")
        plt.show()

    # Decoding is inference only.
    with torch.no_grad():
        dec_outputs = final_model.module.decode(z_, enc_outputs, dec_inputs=None)
    dec_outputs = dec_outputs.argmax(-1)

    gen_text = tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())[0]
    gen_text = _strip_special_tokens(gen_text, tokenizer.sos_token, tokenizer.eos_token)
    gen_text = _to_main_group(_dedupe_after_first(gen_text))
    print("Original class:\n",org_text,"\nGenerated class:\n", gen_text,"\n")

    z = z_

<< Iteration 0 >>
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 

<< Iteration 1 >>
Estimated prob. for L1 (L2): 0.356 (0.644)
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 

<< Iteration 2 >>
Estimated prob. for L1 (L2): 0.6018 (0.3982)
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated class:
 ['H01L29', 'G11C11', 'H01L27', 'H01L29'] 

<< Iteration 3 >>
Estimated prob. for L1 (L2): 0.7649 (0.2351)
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated class:
 ['H01L29', 'G01R33', 'H01L27', 'H01L29'] 

<< Iteration 4 >>
Estimated prob. for L1 (L2): 0.8727 (0.1273)
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated class:
 ['H01L29', 'G01R33', 'H01L27', 'H01L29'] 

<< Iteration 5 >>
Estimated prob. for L1 (L2): 0.9007 (0.0993)
Original class:
 ['H01L29', 'H01L23', 'H01L27', 'H01L29'] 
Generated 

In [278]:
# Look up the raw USPTO record of the test-set patent at position 63 (the same position the
# latent-search cell above uses as `idx`).
import json
import sys

sys.path.append("/home2/glee/patent_data/")
import uspto

test_classes = tech_dataset.X_class[used_test_index]
pn = test_classes.index[63]
sample_path = uspto.get_filepath("/data/uspto/USPAT", pn=pn)
with open(sample_path, "r") as f:
    p_sample = json.load(f)

In [279]:
# Display the loaded patent record (the whole JSON document is printed).
p_sample

{'guid': 'US-9449926-B2',
 'publicationReferenceDocumentNumber': '9449926',
 'compositeId': '54703846!US-US-09449926',
 'publicationReferenceDocumentNumber1': '09449926',
 'datePublishedKwicHits': None,
 'datePublished': '2016-09-20T00:00:00Z',
 'inventionTitle': 'Semiconductor device',
 'type': 'USPAT',
 'mainClassificationCode': '1/1',
 'applicantName': ['Renesas Electronics Corporation'],
 'assigneeName': ['RENESAS ELECTRONICS CORPORATION'],
 'uspcFullClassificationFlattened': None,
 'ipcCodeFlattened': 'H01L29/74;H01L29/868',
 'cpcInventiveFlattened': 'H01L29/66136;H01L23/552;H01L29/0692;H01L29/861;H01L29/08;H01L29/7397;H01L27/0664;H01L29/868',
 'cpcAdditionalFlattened': 'H01L29/1095;H01L2924/0002;H01L29/456;H01L2924/0002;H01L2924/00',
 'applicationFilingDate': ['2015-11-09T00:00:00Z'],
 'applicationFilingDateKwicHits': None,
 'relatedApplFilingDate': None,
 'primaryExaminer': 'Ho; Tu-Tu',
 'assistantExaminer': None,
 'applicationNumber': '14/935481',
 'frontPageStart': 1,
 'frontP

In [280]:
# Load the PatentsView citation table, keeping columns 0, 2 and 3 and dropping incomplete rows.
# A separate name is used for this folder: rebinding `data_dir` here would redirect the
# dataset cache path if the dataset cell were re-run afterwards.
patentsview_dir = "/share/patentsview/"
citations = pd.read_csv(os.path.join(patentsview_dir, "g_us_patent_citation.tsv"), delimiter="\t", usecols=[0,2,3]).dropna(axis=0)

In [282]:
# Rows of the citation table whose cited patent is the sample patent `pn`.
# NOTE(review): the comparison relies on `pn` and the column sharing a comparable dtype — confirm.
citations[citations["citation_patent_id"]==pn]

Unnamed: 0,patent_id,citation_patent_id,citation_date
106556307,10510904,9449926,2016-09-01
