In [120]:
import copy
import gc
import glob
import math
import os
import re
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
from scipy import io
from torch.utils.data import DataLoader
from tqdm import tqdm

# The tml package lives outside site-packages; extend the path before importing it.
sys.path.append("/share/tml_package")
from tml import utils

from data import TechDataset, CVSampler
from model import Encoder_SEQ, Decoder_SEQ, SEQ2SEQ, Attention, AttnDecoder_SEQ
from train_utils import run_epoch, EarlyStopping, perf_eval

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import cleantext
from cleantext.sklearn import CleanTransformer

## Data preprocessing

### Patent classification

In [None]:
# Special sequence markers (start, end, padding); they are added to the IPC vocabulary further down.
TOKEN_SOS, TOKEN_EOS, TOKEN_PAD = '<SOS>', '<EOS>', '<PAD>'
tokens = [TOKEN_SOS, TOKEN_EOS, TOKEN_PAD]
# Matches maximal runs of ASCII letters and digits inside a raw IPC string.
regex = re.compile("[0-9a-zA-Z]+")

In [None]:
# Folder that holds the patent collection CSV.
data_root = "/home2/glee/Tech_Gen/data/"
# Number of years after a patent's own year whose columns are summed into the TC target below.
n_TC = 3
# Labels of the per-year columns: one bucket for everything before 1976, then 1976..2017.
cols_year = ['<1976'] + list(np.arange(1976, 2018).astype(str))

rawdata = pd.read_csv(os.path.join(data_root, "collection_final.csv"))
# Drop rows lacking a main or sub IPC and keep only the identifier and IPC columns.
rawdata_dropna = (
    rawdata
    .dropna(axis=0, subset=['main ipc', 'sub ipc'])
    [['number','main ipc', 'sub ipc']]
)

In [None]:
# Count how often each leading token of the main IPC string occurs.
leading_token = rawdata_dropna['main ipc'].apply(lambda raw: raw.split(' ')[0])
ipc, num = np.unique(leading_token, return_counts=True)
# Two-column table (code, count) ordered from most to least frequent.
by_count_desc = np.argsort(num)[::-1]
ipc_vocab_size = pd.concat([pd.Series(ipc[by_count_desc]), pd.Series(num[by_count_desc])], axis=1)

In [None]:
# Level 1: the first three characters of the leading token of every main IPC string.
print("Level 1")
display(np.unique(rawdata_dropna['main ipc'].str.split(' ').str[0].str[:3]))

In [None]:
# Level 2: the full leading token (text before the first space) of every main IPC string.
print("Level 2")
display(np.unique(rawdata_dropna['main ipc'].str.split(' ').str[0]))

In [None]:
# Level 3: the complete main IPC string with all spaces removed.
print("Level 3")
display(np.unique(rawdata_dropna['main ipc'].str.replace(' ', '', regex=False)))

In [None]:
ipc

In [None]:
ipc_vocab_size.iloc[:20]

In [None]:
# target_ipc = "A61K"
target_ipc = "G01N"

In [None]:
# Build the working table for one IPC class: parsed main/sub IPC codes, a summed
# multi-year target column, and the word<->index vocabularies.

# Raw main-IPC strings that contain `target_ipc` as a substring.
main_ipcs = [x for x in pd.unique(rawdata_dropna['main ipc']) if target_ipc in x]
rawdata_ipc = rawdata_dropna.loc[rawdata_dropna['main ipc'].isin(main_ipcs)]
data = rawdata_ipc[['number']].copy(deep=True)
# Main IPC: first `regex` match in the raw string.
data['main_ipc'] = rawdata_ipc['main ipc'].apply(lambda x: regex.findall(x)[0])
# Sub IPCs: split on ';' and keep the first `regex` match of every part.
data['sub_ipc'] = rawdata_ipc['sub ipc'].apply(lambda x: [regex.findall(xx)[0] for xx in x.split(';')])

# Per patent, sum the year columns labelled year+1 .. year+n_TC. The start is capped at 2017
# and the exclusive stop at 2018, so no label beyond '2017' is ever requested.
# (presumably these columns hold yearly citation counts — TODO confirm against the CSV schema)
rawdata_tc = rawdata.loc[rawdata_ipc.index][['year']+cols_year]
data['TC'+str(n_TC)] = rawdata_tc.apply(lambda x: x[np.arange(x['year']+1 if x['year']<2017 else 2017, x['year']+n_TC+1 if x['year']+n_TC<2018 else 2018).astype(str)].sum(), axis=1)

data = data.set_index('number')
# main_ipcs = [regex.findall(x)[0] for x in main_ipcs]
# `main_ipcs` is rebound here: from now on it is just the target class, not the raw strings.
# NOTE(review): the vocabulary therefore only contains `target_ipc` plus the sub IPC codes; a parsed
# main code that differs from `target_ipc` and never appears as a sub IPC would be missing — verify.
main_ipcs = [target_ipc]
sub_ipcs = list(np.unique(np.concatenate(list(data['sub_ipc'].values))))
all_ipcs = list(np.union1d(main_ipcs, sub_ipcs))
# Longest sub-IPC list plus three extra positions (the sequence built later adds <SOS>, the main IPC and <EOS>).
seq_len = data['sub_ipc'].apply(lambda x: len(x)).max() + 3

# IPC codes take indices 0..len(all_ipcs)-1; the special tokens are numbered right after them.
vocab_w2i = {all_ipcs[i]: i for i in range(len(all_ipcs))}
vocab_w2i.update({tokens[i]: len(all_ipcs)+i for i in range(len(tokens))})
vocab_i2w = {i: all_ipcs[i] for i in range(len(all_ipcs))}
vocab_i2w.update({len(all_ipcs)+i: tokens[i] for i in range(len(tokens))})
vocab_size = len(vocab_w2i)

In [None]:
aa = {'r': 1, 'y': 3}

In [None]:
aa['r'] = aa['r'] / sum(aa.values())
aa['y'] = 1 - aa['r']
display(aa)

In [None]:
# Variant of the token pattern that also accepts '/', so a group such as "33/50" stays in
# one piece. A raw string is used because "\/" is not a valid escape in a normal Python
# string literal and triggers a warning; the compiled pattern text is unchanged.
# NOTE: this rebinds the notebook-level `regex`, so cells using `regex` give different
# results depending on whether they run before or after this one.
regex = re.compile(r"[0-9a-zA-Z\/]+")

In [None]:
data['main_ipc']

In [None]:
np.unique(rawdata_ipc['main ipc'].apply(lambda x: "".join(regex.findall(x))))

In [None]:
rawdata_ipc['main ipc'].apply(lambda x: regex.findall(x)[0])

In [None]:
aaa = rawdata_ipc['sub ipc'].apply(lambda x: ["".join(regex.findall(xx)) for xx in x.split(';')])

In [None]:
aaa.iloc[:20]

In [None]:
rawdata_ipc['main ipc'].apply(lambda x: "".join(regex.findall(x)))

In [None]:
data['sub_ipc']

In [None]:
rawdata_ipc['main ipc'].apply(lambda x: regex.findall(x)[0])

In [None]:
sub_ipcs

In [None]:
vocab_i2w.values()

In [None]:
# Integer-encode every patent: main IPC id, list of sub IPC ids, and one fixed-length sequence.
X_df = pd.DataFrame(index=data.index)
X_df['main'] = data['main_ipc'].apply(lambda code: vocab_w2i[code])
X_df['sub'] = data['sub_ipc'].apply(lambda codes: [vocab_w2i[code] for code in codes])
main_sub_combined = X_df.apply(lambda row: [row['main']] + row['sub'], axis=1)


def _wrap_and_pad(ipc_ids):
    """Return <SOS> + ids + <EOS>, right-padded with the <PAD> id up to seq_len, as an int array."""
    framed = [vocab_w2i['<SOS>']] + ipc_ids + [vocab_w2i['<EOS>']]
    padding = np.zeros(seq_len - len(framed)) + vocab_w2i['<PAD>']
    return np.concatenate([framed, padding]).astype(int)


X_df['seq'] = main_sub_combined.apply(_wrap_and_pad)

# xaxis = np.concatenate([np.tile([i], X_df['sub'].apply(lambda x: len(x)).values[i]) for i in range(len(X_df))])
# X = np.zeros((len(self.data), len(self.all_ipcs))) # (#samples, #ipcs)
# X[tuple(np.arange(len(X_df))), tuple(X_df['main'].values)] += 10
# X[tuple(xaxis), tuple(np.concatenate(X_df['sub'].values))] += 1

# X = np.zeros((len(data), seq_len, len(all_ipcs)+len(tokens)))
# X[tuple(np.sort(np.tile(np.arange(len(X_df)), seq_len))), tuple(np.tile(np.arange(seq_len), len(X_df))), tuple(np.concatenate(X_df['seq'].values))] += 1

In [None]:
batch_size = 32
learning_rate = 0.001

In [None]:
target_ipc = "A23"

In [None]:
# Time dataset construction for both settings of `do_transform`. The dataset and loader
# built in the last iteration (do_transform=False) are the ones left in scope afterwards.
for label, do_transform in (("TRANSFORM one-by-one", True), ("TRANSFORM as a whole", False)):
    print(label)
    tstart = time.time()
    tech_dataset = TechDataset(device=device, data_dir=data_root, do_transform=do_transform, params={'target_ipc': target_ipc})
    data_loader = DataLoader(tech_dataset, batch_size=batch_size)
    tend = time.time()
    print(f"{tend-tstart} sec Elapsed")

In [None]:
tech_dataset.data

In [None]:
# Pick up edits to data.py without restarting the kernel.
# The module is bound to an alias because a bare `import data` would replace the
# notebook variable `data` (a DataFrame in other cells) with the module object.
import importlib
import data as data_module
importlib.reload(data_module)
from data import CVSampler, TechDataset

In [None]:
tstart = time.time()
xx, yy = next(iter(data_loader))
tend = time.time()
print(f"{tend-tstart} sec Elapsed")

In [None]:
# The vocabulary size is taken from the dataset that produces the batches (and that already
# supplies padding_idx), not from the notebook-level `vocab_size`, which was built for a
# different target IPC and need not match.
enc = Encoder_SEQ(embedding_dim=128, vocab_size=tech_dataset.vocab_size, hidden_dim=32, n_layers=1, device=device, padding_idx=tech_dataset.vocab_w2i['<PAD>'])
enc = enc.to(dtype=torch.float)

In [None]:
enc(xx)

In [None]:
# Decoder with the same embedding/hidden sizes as the encoder cell above. Its vocabulary size
# comes from the dataset whose sequences it has to reproduce, not from the notebook-level
# `vocab_size`, which was built for a different target IPC.
dec = Decoder_SEQ(embedding_dim=128, vocab_size=tech_dataset.vocab_size, hidden_dim=32, n_layers=1, device=device)

In [None]:
# First decoder input: one <SOS> id per sample, shape (batch, 1), dtype int64.
# The batch size follows the actual batch `xx` instead of a hard-coded 32, and the id is
# looked up in the dataset's own vocabulary so that it matches the ids found in `xx`.
next_input = torch.full((len(xx), 1), tech_dataset.vocab_w2i[TOKEN_SOS], dtype=torch.long)

In [None]:
dec.initHidden(len(next_input)).shape

In [None]:
o,h = dec(next_input, hidden=None)

In [None]:
o.shape

In [None]:
seq2seq = SEQ2SEQ(device=device, dataset=tech_dataset, enc=enc, dec=dec, max_len=tech_dataset.seq_len)

In [None]:
outputs, zz = seq2seq(xx)

In [None]:
outputs.shape

In [None]:
np.unique([tech_dataset.vocab_i2w[i] for i in outputs[:,0,:].argmax(1).detach().numpy()])

In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
loss_fn

In [None]:
loss_fn(outputs.transpose(0,1).transpose(1,2), xx)

In [None]:
xx

In [None]:
trues = xx.clone()

In [None]:
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=learning_rate)

In [None]:
# One manual optimisation step on the single batch `xx`; the input itself is the target.
batch_losses = []

outputs, z = seq2seq(xx)  # outputs shape: (seq_len, batch_size, vocab_size)
# Move the class axis to position 1 -> (batch_size, vocab_size, seq_len); seq_len is then
# treated by the loss as an additional dimension.
preds = outputs.permute(1, 2, 0)
trues = xx.clone()
loss = loss_fn(preds, trues)
batch_losses.append(loss.item())

optimizer.zero_grad()
loss.backward()
optimizer.step()

# if batch % 10 == 0 or batch == len(dataloader)-1:
#     loss, current = loss.item(), batch*len(X)
#     if batch == len(dataloader)-1:
#         current = size
#     print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]", end='\r', flush=True)

In [None]:
torch.from_numpy(np.tile([0], 32)).unsqueeze(1).shape

In [None]:
torch.from_numpy(np.tile(np.arange(30), (32,1))).squeeze(0).shape


In [None]:
max_epochs = 5

In [None]:
# Train for `max_epochs` epochs, printing a header line per epoch.
# NOTE(review): `early_stopping` is constructed but never consulted inside the loop, and
# `train_loss` is overwritten every epoch without being stored or printed — confirm intent.
early_stopping = EarlyStopping(patience=10, verbose=True, path="../models/ES_checkpoint.ckpt")
for ep in range(max_epochs):
    print(f"Epoch {ep+1}\n"+str("-"*30))
    train_loss = run_epoch(data_loader, seq2seq, loss_fn, mode='train', optimizer=optimizer)

In [None]:
sampler = CVSampler(tech_dataset, n_folds=1, test_ratio=0.3)

In [None]:
cv_idx = sampler.get_idx_dict()

In [None]:
print(f"#Samples\nTrain: {len(cv_idx[0]['train'])}, Validation: {len(cv_idx[0]['val'])}, Test: {len(cv_idx[0]['test'])}")

In [None]:
# Re-import the project modules so that edits to their source files take effect without
# restarting the kernel.
# `data` and `model` are imported under aliases: the notebook also uses these names for a
# DataFrame and for the SEQ2SEQ instance, and a bare `import data` / `import model` would
# silently replace those objects with the module.
import importlib

import data as data_module
importlib.reload(data_module)
from data import TechDataset, CVSampler

import model as model_module
importlib.reload(model_module)
from model import Encoder_SEQ, Decoder_SEQ, SEQ2SEQ, Attention, AttnDecoder_SEQ

import train_utils
importlib.reload(train_utils)
from train_utils import run_epoch, EarlyStopping, perf_eval

### Patent claims

In [2]:
data = pd.read_csv("../data/USPTO-2m/train.csv", nrows=100)
claims = data['claims']

In [4]:
claims = data['claims']

In [155]:
def text_cleaning(text_list=None):
    """Turn raw patent claim texts into per-claim lists of unique stemmed tokens.

    Every input text is split on the "<CLAIM SEP>" marker. Each claim is passed through
    cleantext's ``CleanTransformer`` (configured below: lower-casing, removal of
    punctuation, numbers, digits and currency symbols, whitespace normalisation), English
    stop words are dropped, the remaining words are Porter-stemmed, and repeated stems are
    removed while keeping the order of first occurrence.

    Parameters
    ----------
    text_list : pandas.Series or list-like of str, optional
        One string per document. Anything that is not a Series is wrapped in one.

    Returns
    -------
    pandas.Series
        Indexed like the input; each element is a list with one entry per claim, and
        each entry is a list of stem strings.
    """
    if not isinstance(text_list, pd.Series):
        text_list = pd.Series(text_list)

    basic_cleaner = CleanTransformer(
                    lower=True, no_line_breaks=True, normalize_whitespace=True,
                    no_punct=True, strip_lines=True,
                    no_currency_symbols=True, replace_with_currency_symbol="",
                    no_numbers=True, replace_with_number="",
                    no_digits=True, replace_with_digit="")
    # A set gives constant-time membership tests; the plain list was scanned once per word.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def _tokenize(claim):
        """Drop stop words, stem, and de-duplicate the words of one cleaned claim."""
        stems = (stemmer.stem(word) for word in claim.split() if word not in stop_words)
        # dict keys keep insertion order, so only the first occurrence of each stem survives.
        return list(dict.fromkeys(stems))

    # Splitting already removes the separator, so no further replacement is needed.
    claims = text_list.apply(lambda text: text.split("<CLAIM SEP>"))
    # Basic text cleaning: the transformer receives the list of claims of one document.
    claims = claims.apply(basic_cleaner.transform)
    # Stop-word removal, stemming and order-preserving de-duplication, claim by claim.
    return claims.apply(lambda doc: [_tokenize(claim) for claim in doc])

In [153]:
a = text_cleaning(text_list=claims)

In [156]:
a

0     [[system, compris, volum, materi, first, forma...
1     [[garden, glove, protect, fingertip, fingernai...
2     [[hockey, helmet, receiv, head, wearer, crown,...
3     [[toilet, recreat, vehicl, compris, bowl, flus...
4     [[combin, mount, surfac, water, valv, bodi, he...
                            ...                        
95    [[method, compris, thermal, coupl, second, tec...
96    [[circuit, compris, compar, oper, receiv, inpu...
97    [[heat, transfer, system, compris, hollow, sup...
98    [[method, determin, condit, damper, remot, loc...
99    [[method, monitor, amount, refriger, system, e...
Name: claims, Length: 100, dtype: object

'system'

## Generate new sample from latent vector

In [None]:
data_type = 'sequence'
n_folds = 1
learning_rate = 5e-3
batch_size = 1
max_epochs = 2
n_gpus = 1
embedding_dim = 128
hidden_dim = 32
n_layers = 3
bidirec = None

data_dir = "/home2/glee/Tech_Gen/data/"

In [None]:
target_ipc = 'G01N'
train_params = {'target_ipc': target_ipc}

In [None]:
print("Load dataset...")
tstart = time.time()
tech_dataset = TechDataset(device=device, data_dir=data_dir, do_transform=False, params=train_params)
data_loader = DataLoader(tech_dataset, batch_size=batch_size)
tend = time.time()
print(f"{np.round(tend-tstart,4)} sec elapsed for loading patents for class [{train_params['target_ipc']}]")

In [None]:
xs, ys = next(iter(data_loader))
x = xs[0].unsqueeze(0).to(device)

One-directional

In [None]:
# Uni-directional encoder, matching the heading of this section. The flag was previously
# True, which made this cell identical to the bi-directional one further down and
# invalidated the "last output equals last hidden state" comparison made in this section.
bidirec = False
hidden_dim_dec = hidden_dim
n_directions = 2 if bidirec else 1
# Encoder feature width seen by the attention/decoder: hidden_dim per direction.
hidden_dim_enc = hidden_dim * n_directions

enc = Encoder_SEQ(embedding_dim=embedding_dim, hidden_dim=hidden_dim, vocab_size=tech_dataset.vocab_size, n_layers=n_layers, bidirec=bidirec, device=device).to(device)
att = Attention(hidden_dim_enc, hidden_dim).to(device)
dec = AttnDecoder_SEQ(embedding_dim=embedding_dim, vocab_size=tech_dataset.vocab_size, hidden_dim=hidden_dim, hidden_dim_enc=hidden_dim_enc, attention=att, n_layers=n_layers, device=device, max_len=tech_dataset.seq_len).to(device)

In [None]:
model = SEQ2SEQ(device=device, dataset=tech_dataset, enc=enc, dec=dec, max_len=tech_dataset.seq_len)

In [None]:
model(x)

In [None]:
o, h = enc(x)
display(o.shape)
display(h.shape)

In [None]:
next_input = torch.from_numpy(np.tile([tech_dataset.vocab_w2i['<SOS>']], batch_size)).to(device)
inputs = next_input.unsqueeze(1)

In [None]:
embedded = dec.dropout(dec.embedding(inputs))
embedded.shape

In [None]:
a = dec.attention(h, o)
a = a.unsqueeze(1)
a.shape

In [None]:
weighted = torch.bmm(a, o)
weighted.shape

In [None]:
gru_input = torch.cat((embedded, weighted), dim=2)
gru_input.shape

In [None]:
o, h = dec.gru(gru_input, h)
display(o.shape)
display(h.shape)

In [None]:
dec.fc_out

In [None]:
p = dec.fc_out(torch.cat((embedded.squeeze(1), weighted.squeeze(1), o.squeeze(1)), dim=1))
p.shape

In [None]:
h.shape

In [None]:
dec(next_input, h, o)

In [None]:
embedded = enc.dropout(enc.embedding(x))
h_init = enc.initHidden(len(x))
o, h = enc.gru(embedded, h_init)
# o, h = enc(x)

h = h.view(n_layers, n_directions, batch_size, hidden_dim)

print(f"Output: {o.shape}\nHidden: {h.shape}")
display(o[0, -1, :])
display(h[-1].view(1, batch_size, -1))

print(f"Decoder input: {h[-1].view(batch_size, -1).shape}")

Bi-directional

In [None]:
# Bi-directional setup: the encoder feature width is hidden_dim per direction.
bidirec = True
n_directions = 2 if bidirec else 1
hidden_dim_enc = hidden_dim * n_directions
hidden_dim_dec = hidden_dim

enc = Encoder_SEQ(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab_size=tech_dataset.vocab_size,
    n_layers=n_layers,
    bidirec=bidirec,
    device=device,
).to(device)
att = Attention(hidden_dim_enc, hidden_dim).to(device)
dec = AttnDecoder_SEQ(
    embedding_dim=embedding_dim,
    vocab_size=tech_dataset.vocab_size,
    hidden_dim=hidden_dim,
    hidden_dim_enc=hidden_dim_enc,
    attention=att,
    n_layers=n_layers,
    device=device,
    max_len=tech_dataset.seq_len,
).to(device)

In [None]:
embedded = enc.dropout(enc.embedding(x))
h_init = enc.initHidden(len(x))
o, h = enc.gru(embedded, h_init)
# o, h = enc(x)
print(f"Output: {o.shape}\nHidden: {h.shape}\n\n")

h = h.view(n_layers, n_directions, batch_size, hidden_dim)

print("Forward path")
display(o[0, -1, :hidden_dim])
display(h[-1, 0, 0, :])
print("Backward path")
display(o[0, 0, hidden_dim:])
display(h[-1, -1, 0, :])

print(f"Decoder input: {h[-1].view(batch_size, -1).shape}")

In [None]:
hh = torch.cat([h,h], axis=1)

In [None]:
h

In [None]:
h[:,0,:]

In [None]:
hh_ = torch.permute(hh, (1,0,2))

In [None]:
h.view(-1)

In [None]:
hh_.reshape(2,-1)[0]