In [15]:
import copy
import glob
import gc
import os
import sys
import warnings
import os
import sys
import math
import time
sys.path.append("/share/tml_package")
from tml import utils
from scipy import io
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

from data import TechDataset, CVSampler
from model import Encoder_SEQ, Decoder_SEQ, SEQ2SEQ, Attention, AttnDecoder_SEQ
from train_utils import run_epoch, EarlyStopping, perf_eval

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Data preprocessing

In [110]:
import re  # the notebook's import cell does not import `re`, so bring it into scope here

# Special symbols added to the IPC vocabulary.
TOKEN_SOS = '<SOS>'
TOKEN_EOS = '<EOS>'
TOKEN_PAD = '<PAD>'
tokens = [TOKEN_SOS, TOKEN_EOS, TOKEN_PAD]

# Matches maximal runs of ASCII letters and digits; any other character acts as a separator.
regex = re.compile("[0-9a-zA-Z]+")

In [4]:
# Read the raw patent collection from disk.
data_root = "/home2/glee/Tech_Gen/data/"
collection_path = os.path.join(data_root, "collection_final.csv")
rawdata = pd.read_csv(collection_path)

# Keep only rows that carry both IPC fields, and only the columns needed downstream.
ipc_columns = ['main ipc', 'sub ipc']
rawdata_dropna = rawdata.dropna(axis=0, subset=ipc_columns)[['number'] + ipc_columns]

# Labels of the per-year columns: a '<1976' bucket followed by '1976' ... '2017'.
yearly_labels = np.arange(1976, 2018).astype(str)
cols_year = ['<1976'] + list(yearly_labels)
# Width (in years) of the window that is summed into the 'TC<n>' column later in the notebook.
n_TC = 3

In [5]:
# Count rows per IPC prefix, i.e. the text before the first space of 'main ipc'.
ipc_prefix = rawdata_dropna['main ipc'].apply(lambda x: x.split(' ')[0])
ipc, num = np.unique(ipc_prefix, return_counts=True)

# Two-column table (prefix, count) ordered by descending count.
desc_order = np.argsort(num)[::-1]
ipc_vocab_size = pd.concat([pd.Series(ipc[desc_order]), pd.Series(num[desc_order])], axis=1)

In [7]:
ipc_vocab_size.iloc[:20]

Unnamed: 0,0,1
0,A61K,44658
1,C07K,7686
2,A01N,6265
3,C12N,2174
4,A61L,2008
5,A61F,1841
6,A23L,1075
7,G01N,820
8,A23K,767
9,C07D,692


In [111]:
target_ipc = "A61K"

In [113]:
def _first_token(text):
    """Return the first alphanumeric run that `regex` finds in `text`."""
    return regex.findall(text)[0]


def _tc_within_window(row):
    """Sum the yearly columns from year+1 up to year+n_TC, clipped so no label beyond '2017' is requested.

    NOTE(review): for year >= 2017 the window collapses to the single column '2017' — confirm this is intended.
    """
    year = row['year']
    first = year+1 if year<2017 else 2017
    stop = year+n_TC+1 if year+n_TC<2018 else 2018
    return row[np.arange(first, stop).astype(str)].sum()


# Keep the rows whose 'main ipc' string contains the target class.
main_ipcs = [x for x in pd.unique(rawdata_dropna['main ipc']) if target_ipc in x]
rawdata_ipc = rawdata_dropna.loc[rawdata_dropna['main ipc'].isin(main_ipcs)]

# One row per patent: its number, the main IPC token and the list of sub IPC tokens.
data = rawdata_ipc[['number']].copy(deep=True)
data['main_ipc'] = rawdata_ipc['main ipc'].apply(_first_token)
data['sub_ipc'] = rawdata_ipc['sub ipc'].apply(lambda x: [_first_token(xx) for xx in x.split(';')])

# Attach the windowed sum over the yearly columns as 'TC<n_TC>'.
rawdata_tc = rawdata.loc[rawdata_ipc.index, ['year']+cols_year]
data['TC'+str(n_TC)] = rawdata_tc.apply(_tc_within_window, axis=1)

data = data.set_index('number')
# main_ipcs = [regex.findall(x)[0] for x in main_ipcs]
main_ipcs = [target_ipc]
sub_ipcs = list(np.unique(np.concatenate(list(data['sub_ipc'].values))))
all_ipcs = list(np.union1d(main_ipcs, sub_ipcs))
# +3 leaves room for <SOS>, the main IPC and <EOS> around the longest sub-IPC list.
seq_len = data['sub_ipc'].apply(len).max() + 3

# Vocabulary: IPC symbols take ids 0..n_ipcs-1, the special tokens take the ids right after them.
n_ipcs = len(all_ipcs)
vocab_w2i = {word: idx for idx, word in enumerate(all_ipcs)}
vocab_w2i.update({tok: n_ipcs + k for k, tok in enumerate(tokens)})
vocab_i2w = {idx: word for idx, word in enumerate(all_ipcs)}
vocab_i2w.update({n_ipcs + k: tok for k, tok in enumerate(tokens)})
vocab_size = len(vocab_w2i)

In [36]:
def _to_padded_seq(ids):
    """Wrap `ids` as <SOS> ids <EOS> and right-pad with <PAD> up to `seq_len`; returns an int array."""
    framed = [vocab_w2i['<SOS>']] + ids + [vocab_w2i['<EOS>']]
    n_pad = seq_len - (len(ids) + 2)
    padding = np.zeros(n_pad) + vocab_w2i['<PAD>']
    return np.concatenate([framed, padding]).astype(int)


# Encode every patent's IPC symbols as vocabulary ids.
X_df = pd.DataFrame(index=data.index)
X_df['main'] = data['main_ipc'].apply(vocab_w2i.__getitem__)
X_df['sub'] = data['sub_ipc'].apply(lambda x: [vocab_w2i[xx] for xx in x])

# Main IPC first, then the sub IPCs, then framing tokens and padding.
main_sub_combined = X_df.apply(lambda x: [x['main']]+x['sub'], axis=1)
X_df['seq'] = main_sub_combined.apply(_to_padded_seq)

# xaxis = np.concatenate([np.tile([i], X_df['sub'].apply(lambda x: len(x)).values[i]) for i in range(len(X_df))])
# X = np.zeros((len(self.data), len(self.all_ipcs))) # (#samples, #ipcs)
# X[tuple(np.arange(len(X_df))), tuple(X_df['main'].values)] += 10
# X[tuple(xaxis), tuple(np.concatenate(X_df['sub'].values))] += 1

# X = np.zeros((len(data), seq_len, len(all_ipcs)+len(tokens)))
# X[tuple(np.sort(np.tile(np.arange(len(X_df)), seq_len))), tuple(np.tile(np.arange(seq_len), len(X_df))), tuple(np.concatenate(X_df['seq'].values))] += 1

In [116]:
batch_size = 32
learning_rate = 0.001

In [118]:
# Time the TechDataset + DataLoader construction once per `do_transform` setting.
# The objects built in the last pass (do_transform=False) stay bound for the following cells.
for banner, per_item in (("TRANSFORM one-by-one", True), ("TRANSFORM as a whole", False)):
    print(banner)
    tstart = time.time()
    tech_dataset = TechDataset(device=device, data_dir=data_root, do_transform=per_item, params={'target_ipc': target_ipc})
    data_loader = DataLoader(tech_dataset, batch_size=batch_size)
    tend = time.time()
    print(f"{tend-tstart} sec Elapsed")

TRANSFORM one-by-one
27.436416625976562 sec Elapsed
TRANSFORM as a whole
26.884346961975098 sec Elapsed


In [323]:
# Pull one batch from the loader and report how long the fetch took.
tstart = time.time()
batch_iter = iter(data_loader)
xx, yy = next(batch_iter)
tend = time.time()
print(f"{tend-tstart} sec Elapsed")

0.0036962032318115234 sec Elapsed


In [339]:
# Single-layer encoder for the smoke test; <PAD> is taken from the dataset's own vocabulary.
encoder_kwargs = dict(
    embedding_dim=128,
    vocab_size=vocab_size,
    hidden_dim=32,
    n_layers=1,
    device=device,
    padding_idx=tech_dataset.vocab_w2i['<PAD>'],
)
enc = Encoder_SEQ(**encoder_kwargs).to(dtype=torch.float)

In [340]:
enc(xx)

tensor([[[-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351],
         [-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351],
         [-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351],
         ...,
         [-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351],
         [-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351],
         [-0.0181, -0.0901,  0.1597,  ...,  0.1031, -0.0934, -0.1351]]],
       grad_fn=<StackBackward>)

In [386]:
dec = Decoder_SEQ(embedding_dim=128, vocab_size=vocab_size, hidden_dim=32, n_layers=1, device=device)

In [387]:
# First decoder input: one <SOS> id per sample, shaped (batch_size, 1).
# Sized from `batch_size` rather than a literal 32 so it follows the configured batch size.
next_input = torch.full((batch_size, 1), vocab_w2i[TOKEN_SOS], dtype=torch.long)

In [371]:
dec.initHidden(len(next_input)).shape

torch.Size([1, 32, 32])

In [388]:
o,h = dec(next_input, hidden=None)

In [389]:
o.shape

torch.Size([32, 6032])

In [466]:
seq2seq = SEQ2SEQ(device=device, dataset=tech_dataset, enc=enc, dec=dec, max_len=tech_dataset.seq_len)

In [467]:
outputs, zz = seq2seq(xx)

In [481]:
outputs.shape

torch.Size([168, 32, 6032])

In [480]:
# Greedy (argmax over the vocabulary axis) prediction at every step for the first sample of the batch,
# shown as the distinct IPC symbols it produced.
# .cpu() is required before .numpy(): Tensor.numpy() raises for tensors that live on a CUDA device.
step_ids = outputs[:, 0, :].argmax(1).detach().cpu().numpy()
np.unique([tech_dataset.vocab_i2w[i] for i in step_ids])

array(['A01C1/06', 'A23K1/14', 'A61B5/055', 'A61K31/06', 'A61K31/133',
       'A61K31/53', 'A61K31/713', 'A61K35/64', 'A61K36/63', 'A61K39/085',
       'A61K47/40', 'A61K8/44', 'A61M5/20', 'A61P15/00', 'B01J19/10',
       'B01J39/14', 'B05D7/00', 'B32B5/24', 'C01B15/037', 'C01B21/24',
       'C03C3/15', 'C07B39/00', 'C07B59/00', 'C07C229/24', 'C07C239/00',
       'C07C25/00', 'C07C275/10', 'C07C275/40', 'C07C43/315', 'C07C59/72',
       'C07C69/28', 'C07D207/27', 'C07D241/24', 'C07D253/00',
       'C07D261/20', 'C07D277/34', 'C07D295/32', 'C07D305/08',
       'C07D487/18', 'C07F9/44', 'C07K1/113', 'C07K14/555', 'C08F2/50',
       'C08F224/00', 'C08G65/332', 'C08G77/20', 'C08G77/38', 'C08G77/458',
       'C08J3/03', 'C09C1/62', 'C09J133/02', 'C09K15/24', 'C12N1/20',
       'C12N5/0775', 'C12P17/12', 'C13K5/00', 'C40B30/04', 'G01N21/80',
       'G01N33/573'], dtype='<U10')

In [485]:
loss_fn = torch.nn.CrossEntropyLoss()

In [486]:
loss_fn

CrossEntropyLoss()

In [510]:
# Reorder (seq_len, batch, vocab) -> (batch, vocab, seq_len) so the class axis comes second for the loss.
loss_fn(outputs.permute(1, 2, 0), xx)

tensor(8.7048, grad_fn=<NllLoss2DBackward>)

In [487]:
# Show only the batch's shape instead of dumping the whole tensor.
xx.shape

torch.Size([32, 168])

In [516]:
trues = xx.clone()

In [521]:
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=learning_rate)

In [522]:
# One manual optimisation step on the batch `xx`, using the input sequence itself as the target.
outputs, z = seq2seq(xx)          # outputs: (seq_len, batch_size, vocab_size)
preds = outputs.permute(1, 2, 0)  # -> (batch_size, vocab_size, seq_len); seq_len is treated as an extra dimension
trues = xx.clone()
loss = loss_fn(preds, trues)
batch_losses = [loss.item()]

# Clear stale gradients, back-propagate, then update the parameters.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# if batch % 10 == 0 or batch == len(dataloader)-1:
#     loss, current = loss.item(), batch*len(X)
#     if batch == len(dataloader)-1:
#         current = size
#     print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]", end='\r', flush=True)

In [126]:
torch.from_numpy(np.tile([0], 32)).unsqueeze(1).shape

torch.Size([32, 1])

In [145]:
torch.from_numpy(np.tile(np.arange(30), (32,1))).squeeze(0).shape


torch.Size([32, 30])

In [526]:
max_epochs = 5

In [537]:
# NOTE(review): `early_stopping` is constructed here but never consulted inside the loop below.
early_stopping = EarlyStopping(patience=10, verbose=True, path="../models/ES_checkpoint.ckpt")
for ep in range(max_epochs):
    header = f"Epoch {ep+1}"
    print(header + "\n" + "-"*30)
    train_loss = run_epoch(data_loader, seq2seq, loss_fn, mode='train', optimizer=optimizer)

Epoch 1
------------------------------
loss: 7.744429 [ 4800/44658]

KeyboardInterrupt: 

In [546]:
sampler = CVSampler(tech_dataset, n_folds=1, test_ratio=0.3)

In [552]:
cv_idx = sampler.get_idx_dict()

In [553]:
# Report the split sizes of the first (index 0) fold.
fold0 = cv_idx[0]
n_train, n_val, n_test = len(fold0['train']), len(fold0['val']), len(fold0['test'])
print(f"#Samples\nTrain: {n_train}, Validation: {n_val}, Test: {n_test}")

#Samples
Train: 25008, Validation: 6252, Test: 13398


In [345]:
import importlib

# Re-import the project modules after editing them on disk.
# Each module is bound to a private alias: a plain `import data` / `import model` would rebind the
# notebook variables `data` (the preprocessed DataFrame) and `model` (the SEQ2SEQ instance) to module
# objects, silently breaking cells that use them whenever this cell is re-run.
import data as _data_module
importlib.reload(_data_module)
from data import TechDataset, CVSampler

import model as _model_module
importlib.reload(_model_module)
from model import Encoder_SEQ, Decoder_SEQ, SEQ2SEQ, Attention, AttnDecoder_SEQ

import train_utils as _train_utils_module
importlib.reload(_train_utils_module)
from train_utils import run_epoch, EarlyStopping, perf_eval

## Generate new sample from latent vector

In [346]:
# Settings for the sample-generation experiments in this section.
data_type = 'sequence'
n_folds = 1
learning_rate = 5e-3  # replaces the 0.001 used in the earlier smoke-test section
batch_size = 1  # replaces the earlier batch size of 32
max_epochs = 2  # replaces the earlier value of 5
n_gpus = 1
embedding_dim = 128
hidden_dim = 32
n_layers = 3
bidirec = None  # placeholder; the model-building cells below set it to True

data_dir = "/home2/glee/Tech_Gen/data/"  # same directory as `data_root` defined earlier

In [189]:
target_ipc = 'G01N'
train_params = {'target_ipc': target_ipc}

In [190]:
# Build the dataset and loader for the configured IPC class and report the wall-clock time.
print("Load dataset...")
tstart = time.time()
tech_dataset = TechDataset(device=device, data_dir=data_dir, do_transform=False, params=train_params)
data_loader = DataLoader(tech_dataset, batch_size=batch_size)
tend = time.time()

elapsed = np.round(tend-tstart,4)
ipc_class = train_params['target_ipc']
print(f"{elapsed} sec elapsed for loading patents for class [{ipc_class}]")

Load dataset...
0.8273 sec elapsed for loading patents for class [G01N]


In [332]:
# Take the first batch, keep only its first sample, re-add a leading axis and move it to `device`.
xs, ys = next(iter(data_loader))
x = xs[0].unsqueeze(0).to(device)

One-directional

In [347]:
# NOTE(review): this cell sits under the "One-directional" heading yet enables the bidirectional encoder — confirm which is intended.
bidirec = True
# hidden_dim_enc = 2 if bidirec else 1
hidden_dim_dec = hidden_dim
n_directions = 2 if bidirec else 1
# n_directions is already 1 when bidirec is off, so the product covers both cases.
hidden_dim_enc = hidden_dim * n_directions

# Keyword arguments shared by the encoder and the attention decoder.
shared_kwargs = dict(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab_size=tech_dataset.vocab_size,
    n_layers=n_layers,
    device=device,
)

enc = Encoder_SEQ(bidirec=bidirec, **shared_kwargs).to(device)
att = Attention(hidden_dim_enc, hidden_dim).to(device)
dec = AttnDecoder_SEQ(hidden_dim_enc=hidden_dim_enc, attention=att, max_len=tech_dataset.seq_len, **shared_kwargs).to(device)

In [348]:
model = SEQ2SEQ(device=device, dataset=tech_dataset, enc=enc, dec=dec, max_len=tech_dataset.seq_len)

In [349]:
model(x)

(tensor([[[ 0.0000e+00,  3.1341e-01, -4.2600e-01,  ...,  4.2356e-02,
            3.6929e-01,  6.3694e-01],
          [ 0.0000e+00,  5.2404e-01,  1.2323e+00,  ...,  1.5604e-01,
           -2.9833e-01, -2.6807e-01],
          [ 0.0000e+00, -4.7191e-01, -2.9242e-01,  ..., -1.4210e-01,
           -2.7083e-01,  3.7827e-02],
          ...,
          [ 1.0000e+00,  1.0887e+00, -8.0131e-01,  ..., -2.4319e-01,
           -5.7250e-02, -7.0056e-02],
          [ 0.0000e+00, -9.6376e-02, -4.5436e-01,  ...,  2.0529e-01,
            3.7771e-01,  4.6745e-01],
          [ 0.0000e+00, -5.0294e-01,  3.5158e-01,  ...,  5.5043e-01,
            1.0792e-01, -2.8503e-04]]], device='cuda:0', grad_fn=<CopySlices>),
 tensor([[[ 0.9369,  0.1477, -0.9490,  0.7942,  0.6514, -0.9209,  0.0945,
           -0.7410, -0.1404,  0.1339, -0.3090, -0.8749, -0.9897,  0.7289,
            0.0482,  0.1574,  0.8850,  0.2707,  0.8615,  0.1815,  0.9266,
           -0.3052,  0.7899,  0.8589,  0.9722, -0.2754, -0.3244,  0.0674,
     

In [350]:
# Run the encoder on the single sample and show the shapes of the two tensors it returns.
o, h = enc(x)
display(o.shape)
display(h.shape)

torch.Size([1, 14, 64])

torch.Size([3, 1, 64])

In [351]:
# Decoder start input: the <SOS> id repeated once per sample, then given a trailing length-1 axis.
sos_idx = tech_dataset.vocab_w2i['<SOS>']
next_input = torch.from_numpy(np.full(batch_size, sos_idx)).to(device)
inputs = next_input.unsqueeze(1)

In [352]:
embedded = dec.dropout(dec.embedding(inputs))
embedded.shape

torch.Size([1, 1, 128])

In [353]:
# Attention weights from the encoder hidden state and outputs, with an axis inserted at dim 1
# so they can be batch-multiplied with `o` in the next cell.
a = dec.attention(h, o).unsqueeze(1)
a.shape

torch.Size([1, 1, 14])

In [354]:
weighted = torch.bmm(a, o)
weighted.shape

torch.Size([1, 1, 64])

In [355]:
gru_input = torch.cat((embedded, weighted), dim=2)
gru_input.shape

torch.Size([1, 1, 192])

In [356]:
# One step of the decoder GRU, seeded with the current hidden state `h`.
# NOTE(review): this rebinds `o` and `h` (until now the encoder's results) to the decoder GRU's
# results, so the cells of this walkthrough only give the shown shapes when run once, in order.
o, h = dec.gru(gru_input, h)
display(o.shape)
display(h.shape)

torch.Size([1, 1, 64])

torch.Size([3, 1, 64])

In [357]:
dec.fc_out

Linear(in_features=256, out_features=124, bias=True)

In [358]:
p = dec.fc_out(torch.cat((embedded.squeeze(1), weighted.squeeze(1), o.squeeze(1)), dim=1))
p.shape

torch.Size([1, 124])

In [359]:
h.shape

torch.Size([3, 1, 64])

In [307]:
dec(next_input, h, o)

(tensor([[-0.2593, -0.1279, -0.0694, -0.7554,  0.0180,  0.6668,  0.0036, -0.5078,
           1.1024, -0.2065,  0.5753,  0.9603, -0.1081, -1.2144, -0.8858,  0.0778,
          -0.5076, -0.2697,  0.4934, -0.4087, -0.3356,  0.3361,  0.6147, -0.0450,
           0.0830,  1.1113, -0.3555, -0.1468,  0.0836,  1.3751, -1.2525,  0.0981,
          -0.9191,  0.2434,  0.1560, -0.0868,  0.3900, -0.8439,  0.2458,  0.6249,
           0.0519, -0.4166, -0.2436, -1.2634, -0.1646, -0.3555,  0.0968,  0.2803,
          -0.2863,  1.3421,  0.4019,  0.4331,  0.4422, -0.2289,  0.6627, -0.3243,
          -0.1927,  0.7590, -0.5286,  0.3681,  0.4035, -0.3187, -0.0964,  0.0607,
           0.4015, -0.0627, -1.0599,  1.2583, -0.5918, -0.8080,  0.2754, -0.0161,
          -0.0553,  0.5829, -0.5438, -0.5946, -0.4520,  0.5416,  0.5283, -0.3772,
           0.5215,  1.0873,  0.0737, -0.0935,  0.2990, -0.4810, -0.6650, -0.0434,
           0.5959, -0.5721, -0.0806,  0.2492,  0.9251,  0.1382, -0.3697, -0.7398,
          -0.306

In [289]:
# Step through the encoder by hand: embed + dropout, initial hidden state, then the GRU.
embedded = enc.dropout(enc.embedding(x))
h_init = enc.initHidden(len(x))
o, h = enc.gru(embedded, h_init)
# o, h = enc(x)

# Split the hidden state into separate layer and direction axes.
# Presumably enc.gru stacks its hidden state as (n_layers * n_directions, batch, hidden) — confirm.
h = h.view(n_layers, n_directions, batch_size, hidden_dim)

print(f"Output: {o.shape}\nHidden: {h.shape}")
# Compare the output at the last position of the first sample with the last layer's hidden state.
display(o[0, -1, :])
display(h[-1].view(1, batch_size, -1))

# Last layer's state flattened to one row per sample.
print(f"Decoder input: {h[-1].view(batch_size, -1).shape}")

Output: torch.Size([1, 14, 32])
Hidden: torch.Size([3, 1, 1, 32])


tensor([ 0.0492,  0.0935,  0.1818,  0.1779, -0.1556, -0.1263,  0.1913,  0.0110,
         0.3083,  0.0604, -0.0022,  0.0829, -0.0466, -0.0852, -0.0333,  0.0906,
        -0.1912, -0.2740, -0.3841, -0.0205,  0.4372,  0.2543, -0.4319,  0.0114,
         0.3794, -0.1230, -0.0715,  0.0517, -0.2004,  0.0962,  0.3339, -0.0496],
       device='cuda:0', grad_fn=<SliceBackward>)

tensor([[[ 0.0492,  0.0935,  0.1818,  0.1779, -0.1556, -0.1263,  0.1913,
           0.0110,  0.3083,  0.0604, -0.0022,  0.0829, -0.0466, -0.0852,
          -0.0333,  0.0906, -0.1912, -0.2740, -0.3841, -0.0205,  0.4372,
           0.2543, -0.4319,  0.0114,  0.3794, -0.1230, -0.0715,  0.0517,
          -0.2004,  0.0962,  0.3339, -0.0496]]], device='cuda:0',
       grad_fn=<ViewBackward>)

Decoder input: torch.Size([1, 32])


Bi-directional

In [290]:
# Bidirectional configuration: the encoder-side feature width doubles, the decoder width does not.
bidirec = True
hidden_dim_dec = hidden_dim
n_directions = 2 if bidirec else 1
# With n_directions equal to 1 for the non-bidirectional case, the product needs no extra branch.
hidden_dim_enc = hidden_dim * n_directions

vocab_n = tech_dataset.vocab_size
enc = Encoder_SEQ(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab_size=vocab_n,
    n_layers=n_layers,
    bidirec=bidirec,
    device=device,
).to(device)
att = Attention(hidden_dim_enc, hidden_dim).to(device)
dec = AttnDecoder_SEQ(
    embedding_dim=embedding_dim,
    vocab_size=vocab_n,
    hidden_dim=hidden_dim,
    hidden_dim_enc=hidden_dim_enc,
    attention=att,
    n_layers=n_layers,
    device=device,
    max_len=tech_dataset.seq_len,
).to(device)

In [291]:
# Step through the bidirectional encoder by hand: embed + dropout, initial hidden state, then the GRU.
embedded = enc.dropout(enc.embedding(x))
h_init = enc.initHidden(len(x))
o, h = enc.gru(embedded, h_init)
# o, h = enc(x)
print(f"Output: {o.shape}\nHidden: {h.shape}\n\n")

# Split the hidden state into separate layer and direction axes.
# Presumably enc.gru stacks its hidden state as (n_layers * n_directions, batch, hidden) — confirm.
h = h.view(n_layers, n_directions, batch_size, hidden_dim)

# Forward direction: first hidden_dim features of the output at the last position
# versus the last layer's direction-0 state.
print("Forward path")
display(o[0, -1, :hidden_dim])
display(h[-1, 0, 0, :])
# Backward direction: remaining features of the output at the first position
# versus the last layer's final-direction state.
print("Backward path")
display(o[0, 0, hidden_dim:])
display(h[-1, -1, 0, :])

# Last layer's states of both directions flattened to one row per sample.
print(f"Decoder input: {h[-1].view(batch_size, -1).shape}")

Output: torch.Size([1, 14, 64])
Hidden: torch.Size([6, 1, 32])


Forward path


tensor([-0.3894,  0.1263,  0.3037,  0.1269, -0.2423, -0.0099, -0.2339, -0.1918,
         0.2486,  0.1912,  0.1601,  0.1636,  0.0296,  0.0112, -0.1474, -0.3327,
        -0.3583,  0.0165,  0.1912,  0.1198, -0.0237, -0.4782, -0.4266,  0.1147,
        -0.1303,  0.0557,  0.2897, -0.2572, -0.0884, -0.1820,  0.0453, -0.1080],
       device='cuda:0', grad_fn=<SliceBackward>)

tensor([-0.3894,  0.1263,  0.3037,  0.1269, -0.2423, -0.0099, -0.2339, -0.1918,
         0.2486,  0.1912,  0.1601,  0.1636,  0.0296,  0.0112, -0.1474, -0.3327,
        -0.3583,  0.0165,  0.1912,  0.1198, -0.0237, -0.4782, -0.4266,  0.1147,
        -0.1303,  0.0557,  0.2897, -0.2572, -0.0884, -0.1820,  0.0453, -0.1080],
       device='cuda:0', grad_fn=<SliceBackward>)

Backward path


tensor([ 0.0863,  0.0435,  0.0377,  0.1924,  0.4576, -0.2691, -0.1307, -0.3650,
         0.0121, -0.0415, -0.0392, -0.0167, -0.1014, -0.0982,  0.0960,  0.0190,
         0.0186, -0.2130, -0.2207, -0.1993, -0.1893,  0.2042, -0.1854,  0.2084,
        -0.1118,  0.0477, -0.1088,  0.1508, -0.1061, -0.3245, -0.0222, -0.1679],
       device='cuda:0', grad_fn=<SliceBackward>)

tensor([ 0.0863,  0.0435,  0.0377,  0.1924,  0.4576, -0.2691, -0.1307, -0.3650,
         0.0121, -0.0415, -0.0392, -0.0167, -0.1014, -0.0982,  0.0960,  0.0190,
         0.0186, -0.2130, -0.2207, -0.1993, -0.1893,  0.2042, -0.1854,  0.2084,
        -0.1118,  0.0477, -0.1088,  0.1508, -0.1061, -0.3245, -0.0222, -0.1679],
       device='cuda:0', grad_fn=<SliceBackward>)

Decoder input: torch.Size([1, 64])
