In [None]:
import sys
# Put language_model/ at the front of the module search path so that modules
# living inside that directory can be imported by their bare name.
sys.path.insert(0, 'language_model/')

import warnings
# Silence every UserWarning for the rest of the session.
warnings.simplefilter('ignore', UserWarning)

In [None]:
import codeop
import os
import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as O
from namespace import Namespace
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from dataset import StandardDataset
from language_model.lm_train import train_language_model

# Shared pretty-printer configured for wide (180 column) output.
pp = pprint.PrettyPrinter(width=180, indent=2, compact=False)
# Report whether CUDA is usable and which CUDA version this torch build reports.
print(f'GPU: {T.cuda.is_available()} | CUDA: {T.version.cuda}')

def from_home(x):
    """Return `x` joined onto the current user's home directory.

    The home directory is resolved with `os.path.expanduser('~')` rather than
    `os.environ['HOME']`, so the call no longer raises `KeyError` when the
    `HOME` variable is not set (e.g. on Windows or in stripped-down
    environments). When `HOME` is set on POSIX the result is unchanged.

    :param x: path relative to the home directory
    :return: the combined path as a string
    """
    return os.path.join(os.path.expanduser('~'), x)

# 1. Setup

In [None]:
# Root of all experiment data, resolved under the user's home directory.
ROOT_DIR = from_home('workspace/ml-data/msc-research')

# Raw corpora. For the small Django sample, point DJANGO_DIR at
# os.path.join(ROOT_DIR, 'raw-datasets/testing') instead.
CONALA_DIR = os.path.join(ROOT_DIR, 'raw-datasets/conala-corpus')
DJANGO_DIR = os.path.join(ROOT_DIR, 'raw-datasets/django')

# Pretrained embeddings and the corpus used by the rest of the notebook.
EMB_DIR = os.path.join(ROOT_DIR, 'embeddings')
DATASET_DIR = DJANGO_DIR

print(f'Dataset: {os.path.basename(DATASET_DIR)}')

## 1.1. Read dataset

In [None]:
def _token_counts(path):
    """Return the number of whitespace-separated tokens on each line of `path`."""
    # The context manager closes the handle; the original left both files open.
    with open(path) as fp:
        return [len(line.strip().split()) for line in fp]


anno_lens = _token_counts(DATASET_DIR + '/all.anno')
code_lens = _token_counts(DATASET_DIR + '/all.code')
# the two files are expected to be line-aligned (one annotation per code line)
assert len(anno_lens) == len(code_lens)

d = pd.DataFrame({'a': anno_lens, 'c': code_lens})
# A bare expression in the middle of a cell is not displayed, so print the summary explicitly.
print(d.describe())

# Fraction of samples no longer than 24 (annotation) / 20 (code) tokens;
# these thresholds equal anno_seq_maxlen / code_seq_maxlen in the dataset config below.
a = round(sum(n <= 24 for n in anno_lens) / len(anno_lens), 3)
c = round(sum(n <= 20 for n in code_lens) / len(code_lens), 3)
a, c

## 1.2. Construct config

In [None]:
# Build the experiment configuration: one top-level Namespace with nested
# sub-configs for the dataset, the annotation side and the code side.
CFG = Namespace() # main config

# sub-config for dataset
CFG.dataset_cfg = Namespace()
CFG.dataset_cfg.__dict__ = {
    'root_dir': DATASET_DIR,
    'anno_min_freq': 10,
    'code_min_freq': 10,
    'anno_seq_maxlen': 24,
    'code_seq_maxlen': 20,
    'emb_file': os.path.join(EMB_DIR, 'glove.6B.200d-ft-9-1.txt.pickle'),
}

# The dataset is built first because the language sub-configs below reference
# its vocabularies (anno_lang / code_lang).
dataset = StandardDataset(config=CFG.dataset_cfg, shuffle_at_init=True, seed=42)

# sub-config for NL intents
CFG.anno = Namespace()
CFG.anno.__dict__ = {
    'lstm_hidden_size': 64,
    'lstm_dropout_p': 0.2,
    'att_dropout_p': 0.1,
    'lang': dataset.anno_lang,
    'load_pretrained_emb': True,
    'emb_size': 200,
}

# sub-config for source code
CFG.code = Namespace()
CFG.code.__dict__ = {
    'lstm_hidden_size': 64,
    'lstm_dropout_p': 0.2,
    'att_dropout_p': 0.1,
    'lang': dataset.code_lang,
    'load_pretrained_emb': False,
    'emb_size': 32,
}

# Top-level training settings are merged into CFG next to the sub-configs.
# NOTE(review): inside the f-string `{0}` and `{1}` are literal integer expressions,
# so exp_name always ends in "-p0-a1" whatever loss weights the training cell uses
# (it currently sets p, a = 0, 0) — confirm the name reflects the intended run.
CFG.__dict__.update({
    'exp_name': f'{os.path.basename(DATASET_DIR)}-p{0}-a{1}',
    'cuda': True,
    'batch_size': 128,
    'num_epochs': 50,
    'train_split': 0.7,
})

print(f'Dataset: {os.path.basename(CFG.dataset_cfg.root_dir)}')

In [None]:
# Output directory for this experiment; TensorBoard logs go into a sub-directory of it.
exp_dir = f'./experiments/{CFG.exp_name}'
log_dir = os.path.join(exp_dir, 'tb_logs')
# exist_ok=False: raises FileExistsError when the directory already exists, so
# re-running this cell (or reusing an exp_name) fails here.
os.makedirs(exp_dir, exist_ok=False)
tb_writer = SummaryWriter(log_dir=log_dir)

exp_dir

---

In [None]:
# toks = dataset.code_lang.to_numeric('return dict', tokenize_mode='anno', pad_mode='post', max_len=10)
# ws = dataset.code_lang.to_tokens(T.tensor(toks))

In [None]:
# i = np.random.randint(len(dataset))
# a, c = dataset[i]
# assert len(a) == CFG.dataset_cfg.anno_seq_maxlen, f'{i}'
# assert len(c) == CFG.dataset_cfg.code_seq_maxlen, f'{i}'
# pp.pprint(a)
# pp.pprint(dataset.anno_lang.to_tokens(a))
# print('-'*120)
# pp.pprint(c)
# pp.pprint(dataset.code_lang.to_tokens(c))

# 2. Compute LM probabilities

## 2.1. Get train/test/valid splits

In [None]:
# Proportions of the data reserved for the test and validation splits.
_tp = 0.1
_vp = 0.2
splits = dataset.train_test_valid_split(test_p=_tp, valid_p=_vp, seed=42)

# Concatenate every split into one flat tensor and drop all entries equal to 0
# (presumably the padding index — confirm against the vocabulary).
for kind in splits:
    parts = splits[kind]
    for t in parts:
        flat = T.cat(parts[t])
        parts[t] = flat[flat != 0]

print(f'train {(1-_tp-_vp)*len(dataset):.2f} | test {_tp*len(dataset)} | dev {_vp*len(dataset)}')

## 2.2. Train language model

**Note:** A separate language model must be trained for each side: the annotations (`anno`) and the code (`code`).

In [None]:
# Hyper-parameters for the language models; this sub-config is passed to
# train_language_model in the training cell, which also sets kind / bptt / save_path on it.
CFG.language_model = Namespace()
CFG.language_model.__dict__ = {
    'dataset'     : os.path.basename(DATASET_DIR),
    'model'       : 'LSTM', # type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)
    'n_head'      : None,   # number of heads in the enc/dec of the Transformers
    'emb_size'    : 32,     # size of the word embeddings
    'n_hid'       : 64,     # number of hidden units per layer
    'n_layers'    : 1,      # number of layers
    'lr'          : 0.25,    # initial learning rate
    'clip'        : 0.25,   # gradient clipping
    'dropout_p'   : 0.05,    # dropout applied to layers
    'tied'        : False,  # whether to tie the word embeddings and softmax weights
    'log_interval': 100,
    'epochs'      : 500, # upper epoch limit
    'batch_size'  : 128,
    'seed'        : None # for reproducibility
}

# CFG.language_model

In [None]:
lm_cfg = CFG.language_model

# Train one language model per side, reusing (and mutating) the shared LM config.
for kind in ('anno', 'code'):
    print(f'Training LM for {kind}\n')

    # per-side settings: sequence length and where the final model is saved
    lm_cfg.kind = kind
    lm_cfg.bptt = vars(CFG.dataset_cfg)[f'{kind}_seq_maxlen']
    lm_cfg.save_path = f'./data/lm/lm-{lm_cfg.kind}-{lm_cfg.dataset}-epochs{lm_cfg.epochs}.pt'

    # the splits were already flattened into 1-D token streams above
    kind_splits = splits[kind]
    train_language_model(
        lm_cfg,
        num_tokens=len(getattr(dataset, f'{kind}_lang')),
        train_nums=kind_splits['train'],
        test_nums=kind_splits['test'],
        valid_nums=kind_splits['valid'],
    )

    print('*' * 120, '\n')

## 2.3. Compute LM probs

In [None]:
lm_cfg = CFG.language_model

# Saved-model path per side; must match the save_path pattern used when training.
lm_paths = dict(
    (k, f'./data/lm/lm-{k}-{lm_cfg.dataset}-epochs{lm_cfg.epochs}.pt')
    for k in ('anno', 'code')
)

# Fail early with a readable message if a model has not been trained yet.
for f in lm_paths.values():
    assert os.path.exists(f), f'Language Model: file <{f}> does not exist!'

_ = dataset.compute_lm_probs(lm_paths)

---

In [None]:
# Draw a random sample and show its annotation / code as space-joined tokens.
i = np.random.randint(len(dataset))
a, c, pa, pc = dataset[i]
anno_text = ' '.join(dataset.anno_lang.to_tokens(a)[0])
code_text = ' '.join(dataset.code_lang.to_tokens(c)[0])
anno_text, code_text

In [None]:
# class MyLMProb:
#     def __init__(self, model_path):        
#         self.model = T.load(open(model_path, 'rb'), map_location={'cuda:0': 'cpu'})
#         self.model = self.model.cpu()
#         self.model.eval()

#     def get_prob(self, nums, verbose=False):
#         with T.no_grad():
#             inp = T.tensor([int(nums[0])]).long().unsqueeze(0)
#             hidden = self.model.init_hidden(bsz=1)
#             log_probs = []
            
#             for i in range(1, len(nums)):
#                 output, hidden = self.model(inp, hidden)
                
#                 #word_weights = output.squeeze().data.double().exp()
#                 #prob = word_weights[nums[i]] / word_weights.sum()
#                 probs = F.softmax(output.squeeze(), dim=-1)
#                 prob = probs[nums[i]]
                
#                 # append current log prob
#                 log_probs += [T.log(prob)]
#                 inp.data.fill_(int(nums[i]))

#             if verbose:
#                 for i in range(len(log_probs)):
#                     print(f'{nums[i+1]:4d}: P(w|s) = {np.exp(log_probs[i]):8.4f} | logP(w|s) = {log_probs[i]:8.4f}')
#                 print(f'=> sum_prob = {sum(log_probs):.4f}')

#         return sum(log_probs) / len(log_probs)

In [None]:
# lm_probs = {'anno': [], 'code': []}

# pad_idx = {
#     'anno': dataset.anno_lang.token2index['<pad>'],
#     'code': dataset.code_lang.token2index['<pad>']
# } 

# for kind in lm_probs:
#     lm = MyLMProb(lm_paths[kind])
#     p = pad_idx[kind]

#     for vec in tqdm(getattr(dataset, kind), total=len(dataset), desc=f'P({kind})'):
#         lm_probs[kind] += [np.exp(lm.get_prob(vec[vec != pad_idx[kind]], verbose=False))]
    
#     lm_probs[kind] = sum(lm_probs[kind])
#     break

In [None]:
# kind = 'anno'
# lm = MyLMProb(lm_paths[kind])
# s = {}
# for t, i in tqdm(getattr(dataset, f'{kind}_lang').token2index.items()):
#     if i in [0, 2, 3]:
#         continue
#     q = T.tensor([2, i, 3])
#     s[i] = np.exp(lm.get_prob(q))
    
# xs, ys = zip(*sorted(s.items(), key=lambda k: -k[1]))

# plt.figure(figsize=(14,6))
# plt.bar(xs, ys)
# plt.xticks(xs, rotation=90)

# sum(ys)

# 3. Dual CS/CG Model

In [None]:
def get_embeddings(config: Namespace):
    """Create the embedding layer for one language (annotation or code).

    :param config: language sub-config providing `lang` (vocabulary object that
                   supports `len()` and exposes `pad_idx` and `emb_matrix`),
                   `emb_size` and `load_pretrained_emb`
    :return: `nn.Embedding` with `len(config.lang)` rows of size `config.emb_size`;
             when `load_pretrained_emb` is set its weights are taken from
             `lang.emb_matrix` and frozen
    """
    emb = nn.Embedding(len(config.lang), config.emb_size, padding_idx=config.lang.pad_idx)

    if config.load_pretrained_emb:
        assert config.lang.emb_matrix is not None
        # pretrained vectors replace the random initialisation and stay frozen
        emb.weight = nn.Parameter(T.tensor(config.lang.emb_matrix, dtype=T.float32),
                                  requires_grad=False)

    return emb


class Model(nn.Module):
    """Bidirectional-LSTM encoder with an attentional LSTMCell decoder (Luong-style).

    The same class serves both directions of the dual task:
    `cs` maps code -> annotation, `cg` maps annotation -> code.
    """

    def __init__(self, config: Namespace, model_type):
        """
        :param config: main config exposing the `anno` and `code` sub-configs and,
                       optionally, a boolean `cuda` flag (defaults to CPU when absent)
        :param model_type: cs / cg
        cs: code -> anno
        cg: anno -> code
        """
        super(Model, self).__init__()

        assert model_type in ['cs', 'cg']
        self.model_type = model_type

        src_cfg = config.anno if model_type == 'cg' else config.code
        tgt_cfg = config.code if model_type == 'cg' else config.anno

        # 1. ENCODER
        self.src_embedding = get_embeddings(src_cfg)
        # NOTE: with a single layer the `dropout` argument has no effect in nn.LSTM
        # (it only applies between stacked layers); kept for config compatibility.
        self.encoder = nn.LSTM(input_size=src_cfg.emb_size,
                               hidden_size=src_cfg.lstm_hidden_size,
                               dropout=src_cfg.lstm_dropout_p,
                               bidirectional=True,
                               batch_first=True)

        self.decoder_cell_init_linear = nn.Linear(in_features=2*src_cfg.lstm_hidden_size,
                                                  out_features=tgt_cfg.lstm_hidden_size)

        # 2. ATTENTION
        # project source encoding to decoder rnn's h space (W from Luong score general)
        self.att_src_W = nn.Linear(in_features=2*src_cfg.lstm_hidden_size,
                                   out_features=tgt_cfg.lstm_hidden_size,
                                   bias=False)

        # transformation of decoder hidden states and context vectors before reading out target words
        # this produces the attentional vector in (W from Luong eq. 5)
        self.att_vec_W = nn.Linear(in_features=2*src_cfg.lstm_hidden_size + tgt_cfg.lstm_hidden_size,
                                   out_features=tgt_cfg.lstm_hidden_size,
                                   bias=False)

        # 3. DECODER
        self.tgt_embedding = get_embeddings(tgt_cfg)
        self.decoder = nn.LSTMCell(input_size=tgt_cfg.emb_size + tgt_cfg.lstm_hidden_size,
                                   hidden_size=tgt_cfg.lstm_hidden_size)

        # prob layer over target language
        self.readout = nn.Linear(in_features=tgt_cfg.lstm_hidden_size,
                                 out_features=len(tgt_cfg.lang),
                                 bias=False)

        self.dropout = nn.Dropout(tgt_cfg.att_dropout_p)

        # 4. COPY MECHANISM
        self.copy_gate = None  # TODO: not implemented yet

        # save configs
        self.src_cfg = src_cfg
        self.tgt_cfg = tgt_cfg

        # The device comes from the config that was passed in, not from a notebook global.
        device = T.device('cuda' if getattr(config, 'cuda', False) else 'cpu')
        self.to(device)
        print(f'[{model_type}] using [{device}]')

    def forward(self, src, tgt):
        """
        src: bs, max_src_len
        tgt: bs, max_tgt_len

        returns (scores, att_mats):
        scores  : bs, max_tgt_len, tgt_vocab_size (softmax probabilities)
        att_mats: bs, max_tgt_len, max_src_len (unnormalised attention scores)
        """
        enc_out, (h0_dec, c0_dec) = self.encode(src)
        scores, att_mats = self.decode(enc_out, h0_dec, c0_dec, tgt)

        return scores, att_mats

    def encode(self, src):
        """
        src : bs x max_src_len (emb look-up indices)
        out : bs x max_src_len x 2*hid_size
        h/c0: bs x tgt_hid_size
        """
        emb = self.src_embedding(src)
        out, (hn, cn) = self.encoder(emb) # hidden is zero by default

        # construct initial state for the decoder from the final cell state of both directions
        c0_dec = self.decoder_cell_init_linear(T.cat([cn[0], cn[1]], dim=1))
        h0_dec = c0_dec.tanh()

        return out, (h0_dec, c0_dec)

    def decode(self, src_enc, h0_dec, c0_dec, tgt):
        """
        Run the decoder over `tgt`, feeding the given target token at every step.

        src_enc: bs, max_src_len, 2*hid_size (== encoder output)
        h/c0   : bs, tgt_hid_size
        tgt    : bs, max_tgt_len (emb look-up indices)
        """
        batch_size, tgt_len = tgt.shape
        scores, att_mats = [], []

        hidden = (h0_dec, c0_dec)

        emb = self.tgt_embedding(tgt) # bs, max_tgt_len, tgt_emb_size

        # initial attentional vector: zeros on the same device/dtype as the embeddings
        att_vec = emb.new_zeros(batch_size, self.tgt_cfg.lstm_hidden_size)

        # Luong W*hs: same for each timestep of the decoder
        src_enc_att = self.att_src_W(src_enc) # bs, max_src_len, tgt_hid_size

        for t in range(tgt_len):
            emb_t = emb[:, t, :]
            x = T.cat([emb_t, att_vec], dim=-1)
            h_t, c_t = self.decoder(x, hidden)

            ctx_t, att_mat = self.luong_attention(h_t, src_enc, src_enc_att)

            # Luong eq. (5)
            att_t = self.att_vec_W(T.cat([h_t, ctx_t], dim=1))
            att_t = att_t.tanh()
            att_t = self.dropout(att_t)

            # Luong eq. (6)
            score_t = self.readout(att_t)
            score_t = F.softmax(score_t, dim=-1)

            scores   += [score_t]
            att_mats += [att_mat]

            # for next state t+1
            att_vec = att_t
            hidden  = (h_t, c_t)

        # bs, max_tgt_len, tgt_vocab_size
        scores = T.stack(scores, dim=1)

        # bs, max_tgt_len, max_src_len (one row of source scores per decoding step)
        att_mats = T.cat(att_mats, dim=1)

        return scores, att_mats

    def luong_attention(self, h_t, src_enc, src_enc_att, mask=None):
        """
        h_t               : bs, hid_size
        src_enc (hs)      : bs, max_src_len, 2*src_hid_size
        src_enc_att (W*hs): bs, max_src_len, tgt_hid_size
        mask              : bs, max_src_len; truthy entries mark source positions
                            that must receive no attention (optional)

        returns (ctx_vec, att_mat):
        ctx_vec : bs, 2*src_hid_size
        att_mat : bs, 1, max_src_len (scores BEFORE the softmax)
        """
        # bs x src_max_len
        score = T.bmm(src_enc_att, h_t.unsqueeze(2)).squeeze(2)

        # `if mask:` is ambiguous for multi-element tensors and raises; test for None instead.
        if mask is not None:
            # out-of-place so the operation stays visible to autograd
            score = score.masked_fill(mask.bool(), float('-inf'))

        att_mat = score.unsqueeze(1)
        att_weights = F.softmax(score, dim=-1)

        # weighted sum of the source encodings
        ctx_vec = T.sum(att_weights.unsqueeze(2) * src_enc, dim=1)

        return ctx_vec, att_mat

    def beam_search(self, src, width=3, max_len=None, sos_idx=2, eos_idx=3):
        """
        Choose most probable sequence, considering top `width` candidates.

        src     : bs, max_src_len (emb look-up indices)
        width   : number of partial hypotheses kept alive per step (>= 1)
        max_len : maximum number of generated tokens; defaults to 2 * max_src_len
        sos_idx : id fed to the decoder at the first step
        eos_idx : id that terminates a hypothesis
                  (the defaults 2 / 3 follow the special ids used elsewhere in this
                  notebook — confirm against the vocabulary)

        returns a list with one entry per batch element: the best hypothesis as a
        list of token ids, starting with `sos_idx` and ending with `eos_idx` when
        the end token was generated within `max_len` steps.
        """
        if width < 1:
            raise ValueError(f'width must be >= 1, got {width}')

        batch_size, src_len = src.shape
        if max_len is None:
            max_len = 2 * src_len

        hyp = []
        with T.no_grad():
            enc_out, (h0_dec, c0_dec) = self.encode(src)

            # Luong W*hs: same for each timestep of the decoder
            src_enc_att = self.att_src_W(enc_out) # bs, max_src_len, tgt_hid_size

            for b in range(batch_size):
                hyp.append(self._beam_search_one(enc_out[b:b + 1], src_enc_att[b:b + 1],
                                                 h0_dec[b:b + 1], c0_dec[b:b + 1],
                                                 width, max_len, sos_idx, eos_idx))

        return hyp

    def _beam_search_one(self, src_enc, src_enc_att, h0, c0, width, max_len, sos_idx, eos_idx):
        """Beam search for a single source sequence; the live beams form the batch dimension."""
        vocab_size = self.readout.out_features

        seqs, seq_scores = [[sos_idx]], [0.0]
        h, c = h0, c0
        att_vec = h0.new_zeros(1, self.tgt_cfg.lstm_hidden_size)
        finished = []  # (cumulative log-prob, tokens) of hypotheses that produced eos

        for _ in range(max_len):
            k = len(seqs)
            last = T.tensor([s[-1] for s in seqs], dtype=T.long, device=h.device)
            x = T.cat([self.tgt_embedding(last), att_vec], dim=-1)
            h, c = self.decoder(x, (h, c))

            ctx, _ = self.luong_attention(h, src_enc.expand(k, -1, -1), src_enc_att.expand(k, -1, -1))
            # no dropout at inference time
            att_vec = T.tanh(self.att_vec_W(T.cat([h, ctx], dim=1)))
            log_p = F.log_softmax(self.readout(att_vec), dim=-1)  # k, vocab

            # score every (beam, token) continuation and keep the best `width`
            prev = T.tensor(seq_scores, dtype=log_p.dtype, device=log_p.device).unsqueeze(1)
            flat = (prev + log_p).view(-1)
            top_scores, top_idx = flat.topk(min(width, flat.numel()))

            next_seqs, next_scores, keep = [], [], []
            for score, idx in zip(top_scores.tolist(), top_idx.tolist()):
                beam, tok = divmod(idx, vocab_size)
                cand = seqs[beam] + [tok]
                if tok == eos_idx:
                    finished.append((score, cand))
                else:
                    next_seqs.append(cand)
                    next_scores.append(score)
                    keep.append(beam)

            if not next_seqs:
                seqs, seq_scores = [], []
                break

            # carry over the recurrent state of the surviving beams only
            keep_idx = T.tensor(keep, dtype=T.long, device=h.device)
            h, c, att_vec = h[keep_idx], c[keep_idx], att_vec[keep_idx]
            seqs, seq_scores = next_seqs, next_scores

            # log-probs only decrease, so live beams can no longer beat the best finished one
            if finished and max(f[0] for f in finished) >= max(seq_scores):
                break

        candidates = finished + list(zip(seq_scores, seqs))
        return max(candidates, key=lambda cand: cand[0])[1]

# 4. Train

## 4.1. Setup

In [None]:
def JSD(a, b, mask=None):
    """Jensen-Shannon divergence between two stacks of score matrices.

    Both inputs are 3-D tensors of identical shape. Each is turned into
    distributions with a softmax over the last axis; the divergence of every
    row pair is averaged over axis 1, giving one value per batch element.

    :param a: scores, shape (bs, n, m)
    :param b: scores, same shape as `a`
    :param mask: accepted but currently unused
    :return: tensor of shape (bs,)
    """
    assert a.shape == b.shape
    _, n_rows, _ = a.shape

    eps = 1e-8  # keeps the logarithms finite
    p = F.softmax(a, dim=2) + eps
    q = F.softmax(b, dim=2) + eps

    # mixture distribution shared by both KL terms
    mixture = 0.5 * (p + q)

    def _mean_kl(dist):
        # KL(dist || mixture) per row, then averaged over the rows
        per_row = T.sum(dist * T.log(dist / mixture), dim=2)
        return T.sum(per_row, dim=1) / n_rows

    return 0.5 * (_mean_kl(p) + _mean_kl(q))

In [None]:
# One model per direction; each optimizer is attached to its model as `.opt` and
# only receives parameters that require gradients (frozen embeddings are skipped).
cg_model     = Model(CFG, model_type='cg')
cg_model.opt = O.Adam(lr=0.001, params=filter(lambda p: p.requires_grad, cg_model.parameters()))

cs_model     = Model(CFG, model_type='cs')
cs_model.opt = O.Adam(lr=0.001, params=filter(lambda p: p.requires_grad, cs_model.parameters()))

# TODO: very hacky
# The training set is a deep copy of the full dataset whose fields are truncated
# to the first `train_split` fraction of the samples.
n = int(CFG.train_split * len(dataset))
train_dataset = deepcopy(dataset)
train_dataset.anno = dataset.anno[:n]
train_dataset.code = dataset.code[:n]
train_dataset.df   = dataset.df.iloc[:n]
train_dataset.lm_probs['anno'] = dataset.lm_probs['anno'][:n]
train_dataset.lm_probs['code'] = dataset.lm_probs['code'][:n]
# ---

# Worker processes and pinned memory are only requested when training on GPU.
kwargs = {'num_workers': 4, 'pin_memory': True} if CFG.cuda else {}
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, **kwargs)
print(f'DataLoader: {len(train_loader)} batches of size {CFG.batch_size} (total: {len(train_dataset)})')

# Running averages reported every `__rep_every` batches; TensorBoard is written
# every `__tb_every` batches.
__cg_l = 0
__cs_l = 0
__att_l = 0
__dual_l = 0
__rep_every = 50
__tb_every = __rep_every // 4

# Store the configuration next to the experiment outputs.
CFG.to_file(os.path.join(exp_dir, 'config.json'))

## 4.2. Loop

In [None]:
# Joint training of the CG (anno -> code) and CS (code -> anno) models.
# `ts` is the TensorBoard step; it only advances when something is logged.
ts = 0

for epoch_idx in range(1, CFG.num_epochs+1):

    for batch_idx, (anno, code, anno_lm_p, code_lm_p) in enumerate(train_loader, start=1):
        anno_len, code_len = anno.shape[1], code.shape[1]

        if CFG.cuda:
            anno, code, anno_lm_p, code_lm_p = map(lambda t: t.cuda(), [anno, code, anno_lm_p, code_lm_p])

        # binary mask indicating the presence of padding token
        # (1 where the token is NOT padding, 0 where it is)
        anno_mask = T.tensor(anno != dataset.anno_lang.token2index['<pad>']).byte()
        code_mask = T.tensor(code != dataset.code_lang.token2index['<pad>']).byte()

        # forward pass
        # Each model receives the full gold target sequence as decoder input.
        code_pred, code_att_mat = cg_model(src=anno, tgt=code)
        anno_pred, anno_att_mat = cs_model(src=code, tgt=anno)

        # loss computation
        l_cg_ce, l_cs_ce = 0, 0

        # CG cross-entropy loss
        # NOTE(review): the prediction at step t is scored against code[:, t], the same
        # token that was fed to the decoder at step t (no shift by one) — confirm this
        # is intended; the same applies to the CS loss below.
        for t in range(code_len):
            probs = code_pred[:, t, :].gather(1, code[:, t].view(-1, 1)).squeeze(1)
            l_cg_ce += -T.log(probs) * code_mask[:, t] / code_len

        # CS cross-entropy loss
        for t in range(anno_len):
            probs = anno_pred[:, t, :].gather(1, anno[:, t].view(-1, 1)).squeeze(1)
            l_cs_ce += -T.log(probs) * anno_mask[:, t] / anno_len

        # dual loss: P(x,y) = P(x).P(y|x) = P(y).P(x|y)
        # (anno_lm_p / code_lm_p come from the dataset — presumably the language-model
        # scores computed in section 2.3; verify their scale and sign convention)
        l_dual = (code_lm_p - l_cs_ce - anno_lm_p + l_cg_ce) ** 2

        # attention loss: JSD
        # the two attention stacks are compared in both orientations
        l_att = JSD(anno_att_mat, code_att_mat.transpose(2,1)) + \
                JSD(anno_att_mat.transpose(2,1), code_att_mat)

        # final loss
        # p and a weight the dual and attention terms; with both set to 0 only the
        # cross-entropy terms contribute to the optimised losses.
        p, a = 0, 0
        l_cg = T.mean(l_cg_ce + p * 0.5 * l_dual + a * 0.9 * l_att)
        l_cs = T.mean(l_cs_ce + p * 0.5 * l_dual + a * 0.9 * l_att)

        # optimize CG
        # retain_graph=True keeps the graph alive for the second backward pass below
        cg_model.opt.zero_grad()
        l_cg.backward(retain_graph=True)
        cg_model.opt.step()

        # optimize CS
        # NOTE(review): with non-zero p or a, l_cs also depends on CG parameters that
        # were just updated in place by the step above — check that this backward
        # still works in that setting.
        cs_model.opt.zero_grad()
        l_cs.backward()
        cs_model.opt.step()

        # tensorboard
        if batch_idx % __tb_every == 0:
            for name, param in cg_model.named_parameters():
                tb_writer.add_histogram(f'CG-{name}', param, ts)
            for name, param in cs_model.named_parameters():
                tb_writer.add_histogram(f'CS-{name}', param, ts)
            tb_writer.add_scalar('train/CG_loss', l_cg.item(), ts)
            tb_writer.add_scalar('train/CS_loss', l_cs.item(), ts)
            tb_writer.add_scalar('train/ATT_loss', l_att.mean().item(), ts)
            tb_writer.add_scalar('train/DUAL_loss', l_dual.mean().item(), ts)
            ts += 1

        # reporting
        # accumulate averages over the last `__rep_every` batches
        __cg_l   += l_cg.item() / __rep_every
        __cs_l   += l_cs.item() / __rep_every
        __att_l  += l_att.mean().item()  / __rep_every
        __dual_l += l_dual.mean().item() / __rep_every

        if batch_idx % __rep_every == 0:
            status = [f'Epoch {epoch_idx:>5d}/{CFG.num_epochs:>3d}', f'Batch {batch_idx:>5d}/{len(train_loader):5d}',
                      f'CG {__cg_l:7.5f}', f'CS {__cs_l:7.5f}', f'ATT {__att_l:7.5f}', f'DUAL {__dual_l:7.5f}']
            print(' | '.join(status))
            __cg_l, __cs_l, __att_l, __dual_l = 0, 0, 0, 0
    # --- epoch end

In [None]:
# Persist the trained weights in the experiment directory.
# torch is imported only under the alias `T` in this notebook, so the previous
# `torch.save(...)` calls raised NameError on a fresh kernel.
T.save(cg_model.state_dict(), os.path.join(exp_dir, 'cg_model.pt'))
T.save(cs_model.state_dict(), os.path.join(exp_dir, 'cs_model.pt'))

tb_writer.close()

# 5. Evaluate

In [None]:
# Fresh model instances that receive the stored weights.
cg_model = Model(CFG, model_type='cg')
cs_model = Model(CFG, model_type='cs')

# NOTE(review): this points at a different experiment directory ("-minfreq2" suffix)
# than the one created by the training cells above — confirm which run is evaluated.
# exp_dir = f'./experiments/{CFG.exp_name}'
exp_dir = f'./experiments/{os.path.basename(DATASET_DIR)}-p{0}-a{1}-minfreq2'

# torch is imported only as `T`, so the previous `torch.load(...)` raised NameError.
# map_location lets checkpoints saved on GPU be restored when CFG.cuda is False.
_device = T.device('cuda' if CFG.cuda else 'cpu')
cg_model.load_state_dict(T.load(os.path.join(exp_dir, 'cg_model.pt'), map_location=_device))
cs_model.load_state_dict(T.load(os.path.join(exp_dir, 'cs_model.pt'), map_location=_device))

exp_dir

## 5.1. Metrics

In [None]:
def is_valid_code(line):
    """Return True when `line` is valid Python or a valid prefix of it.

    valid <=> (complete ^ valid) v (incomplete ^ valid_prefix)

    `codeop.compile_command` returns a code object for complete input and None
    for incomplete-but-plausible input; both count as valid here. Besides
    `SyntaxError` it can raise `ValueError` or `OverflowError` for invalid
    literals, which are treated as invalid code as well instead of crashing
    the evaluation loop.

    :param line: source text to check
    :return: bool
    """
    try:
        codeop.compile_command(line)
    except (SyntaxError, ValueError, OverflowError):
        return False

    return True

def to_tok(xs, mode):
    """Convert the first row of a batch of token ids into tokens.

    Ids 0, 1, 2 and 3 are removed before the lookup.

    :param xs: batch of id sequences; only `xs[0]` is used
    :param mode: 'code' or 'anno', selecting the vocabulary of the global `dataset`
    :return: first element of the vocabulary's `to_tokens` result,
             or None for any other `mode`
    """
    ids = xs[0].cpu()
    keep = (ids != 0) & (ids != 1) & (ids != 2) & (ids != 3)
    ids = ids[keep]

    if mode == 'code':
        lang = dataset.code_lang
    elif mode == 'anno':
        lang = dataset.anno_lang
    else:
        return None

    return lang.to_tokens(ids)[0]

In [None]:
# TODO: very hacky
# The test set is the complement of the training slice: a deep copy of the full
# dataset whose fields keep only the samples from index `n` onwards.
n = int(CFG.train_split * len(dataset))
test_dataset = deepcopy(dataset)
test_dataset.anno = dataset.anno[n:]
test_dataset.code = dataset.code[n:]
test_dataset.df   = dataset.df.iloc[n:]
test_dataset.lm_probs['anno'] = dataset.lm_probs['anno'][n:]
test_dataset.lm_probs['code'] = dataset.lm_probs['code'][n:]
# ---

# batch_size=1: the evaluation below indexes element [0] of every batch
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
assert len(test_loader) == len(dataset) - n

In [None]:
# Evaluation over the test set, one sample per batch.
# Metrics per side: 1) ind_match   - share of masked positions predicted correctly,
#                   2) exact_match - samples whose ind_match is 1,
#                   3) coverage    - samples whose predicted ids are all in the gold ids,
#                   4) pov (code)  - samples whose prediction passes is_valid_code.
# Every metric is accumulated as a mean over len(test_loader).
# NOTE(review): `tqdm` is not imported in the visible cells of this notebook.
ms = ['ind_match', 'exact_match', 'coverage']
metrics = {
    'anno': {k: 0 for k in ms},
    'code': {k: 0 for k in ms}
}
metrics['code']['pov'] = 0

anno_toks, code_toks = [], []

with T.no_grad():
    cg_model.eval()
    cs_model.eval()

    for batch_idx, (anno, code, _, _) in tqdm(enumerate(test_loader, start=1), total=len(test_loader)):
        if CFG.cuda:
            anno, code = anno.cuda(), code.cuda()

        # binary mask indicating the presence of padding token
#         anno_mask = T.tensor(anno != dataset.anno_lang.token2index['<pad>']).byte()
#         code_mask = T.tensor(code != dataset.code_lang.token2index['<pad>']).byte()

        # positions holding id 0 or id 1 are excluded from scoring
        anno_mask = T.tensor((anno != 0) * (anno != 1)).byte()
        code_mask = T.tensor((code != 0) * (code != 1)).byte()

        # forward pass
        # NOTE(review): the gold target is passed as decoder input, so the
        # predictions below are teacher-forced rather than free-running.
        code_pred, code_att_mat = cg_model(src=anno, tgt=code)
        anno_pred, anno_att_mat = cs_model(src=code, tgt=anno)

        # TODO: ideally, this should be beam-search
        code_pred = code_pred.argmax(dim=2)
        anno_pred = anno_pred.argmax(dim=2)

        # share of masked positions where prediction and gold agree
        code_score = (((code_pred == code) * code_mask).float().sum() / code_mask.sum()).cpu()
        anno_score = (((anno_pred == anno) * anno_mask).float().sum() / anno_mask.sum()).cpu()

        # 1)
        metrics['code']['ind_match'] += code_score / len(test_loader)
        metrics['anno']['ind_match'] += anno_score / len(test_loader)

        # 2)
        if np.isclose(code_score, 1):
            metrics['code']['exact_match'] += 1 / len(test_loader)
        if np.isclose(anno_score, 1):
            metrics['anno']['exact_match'] += 1 / len(test_loader)

        # 3)
        # sy: gold id set, sy_: predicted id set (both masked, zeros dropped)
        sy  = set([x.item() for x in (code * code_mask)[0].cpu().data if x.item() != 0])
        sy_ = set([x.item() for x in (code_pred * code_mask)[0].cpu().data if x.item() != 0])
        if len(set.difference(sy_, sy)) == 0:
            metrics['code']['coverage'] += 1 / len(test_loader)
        else:
            # sanity check: an exact match should never predict ids outside the gold set
            if np.isclose(code_score, 1):
                print(set.difference(sy_, sy))

        sy  = set([x.item() for x in (anno * anno_mask)[0].cpu().data if x.item() != 0])
        sy_ = set([x.item() for x in (anno_pred * anno_mask)[0].cpu().data if x.item() != 0])
        if len(set.difference(sy_, sy)) == 0:
            metrics['anno']['coverage'] += 1 / len(test_loader)

        # 4)
        if is_valid_code(' '.join(to_tok(code_pred * code_mask, 'code'))):
            metrics['code']['pov'] += 1 / len(test_loader)

        # save tokens
        # tuple layout: (score, predicted tokens, gold tokens, predicted ids, gold ids)
        code_toks += [(round(code_score.item(), 5),
                       to_tok(code_pred * code_mask, 'code'),
                       to_tok(code * code_mask, 'code'),
                       code_pred[0].cpu(),
                       code[0].cpu())]

        anno_toks += [(round(anno_score.item(), 5),
                       to_tok(anno_pred * anno_mask, 'anno'),
                       to_tok(anno * anno_mask, 'anno'),
                       anno_pred[0].cpu(),
                       anno[0].cpu())]
            
def _write_eval_report(path, rows):
    """Write one text block per evaluated sample to `path` (the file is overwritten).

    :param path: output file
    :param rows: iterable of (score, pred_tokens, true_tokens, pred_ids, true_ids)
    """
    with open(path, 'wt') as fp:
        for i, (s, pt, tt, p, t) in enumerate(rows, start=1):
            fp.write(f'{i}\n')
            fp.write(f'{s}\n')
            fp.write(f'pred: {" ".join(pt)}\n')
            fp.write(f'true: {" ".join(tt)}\n')
            fp.write(f'pred_raw: {p}\n')
            fp.write(f'true_raw: {t}\n')
            fp.write(f'{"-"*80}\n')


# worst-scoring samples first
code_toks = sorted(code_toks, key=lambda x: x[0])
anno_toks = sorted(anno_toks, key=lambda x: x[0])

# one helper replaces the two previously duplicated writing loops
_write_eval_report(os.path.join(exp_dir, 'eval_code.txt'), code_toks)
_write_eval_report(os.path.join(exp_dir, 'eval_anno.txt'), anno_toks)

# results
# one line: "<anno>/<code>" per metric in `ms`, followed by the code validity rate
print(exp_dir.split('/')[-1])
print(len(test_loader))
for k in ms:
    print(f"{metrics['anno'][k]:7.5f}/{metrics['code'][k]:7.5f}", end=' ')
print(round(metrics['code']['pov'], 5))

## 5.2. Attention matrices

In [None]:
# Visualise the CS model's attention for one test sample.
# NOTE(review): these two hand-written sequences are overwritten inside the
# `with` block before being used — confirm whether they are still needed.
a = T.tensor([  2, 576,  16,  84, 474, 695,   0,   0,   0,   3])
c = T.tensor([  2, 155, 489,  10, 159,   5,   8,   0,   0,   3])

with T.no_grad():
    # NOTE(review): `i` is drawn but not used; the last test sample is taken instead.
    i = np.random.randint(len(test_dataset))
#     i = 5557
    a, c, _, _ = test_dataset[-1]
    # NOTE(review): unconditional .cuda() — this cell requires a GPU regardless of CFG.cuda.
    a, c = a.cuda(), c.cuda()
    anno_mask = T.tensor((a != 0) * (a != 1)).byte().cuda()
    code_mask = T.tensor((c != 0) * (c != 1)).byte().cuda()
    # x / y: predicted ids; x_mat / y_mat: attention scores of the CG / CS model
    x, x_mat = cg_model(src=a.unsqueeze(0), tgt=c.unsqueeze(0))
    y, y_mat = cs_model(src=c.unsqueeze(0), tgt=a.unsqueeze(0))
    x = x[0].argmax(dim=-1)
    x_mat = x_mat[0].cpu()
    y = y[0].argmax(dim=-1)
    y_mat = y_mat[0].cpu()

    # token strings for the gold (ct, at) and predicted (xt, yt) sequences
    ct = to_tok((c * code_mask).unsqueeze(0), 'code')
    xt = to_tok((x * code_mask).unsqueeze(0), 'code')
    at = to_tok((a * anno_mask).unsqueeze(0), 'anno')
    yt = to_tok((y * anno_mask).unsqueeze(0), 'anno')


# NOTE(review): `plt` is not imported in the visible cells of this notebook.
plt.figure(figsize=(12, 8))

# plt.subplot(1, 2, 1)
# the model returns unnormalised scores, so the softmax is applied here for display
plt.imshow(F.softmax(y_mat, -1), cmap='jet')
plt.grid(False)
plt.xticks(np.arange(len(ct)), labels=ct, rotation=90)
plt.yticks(np.arange(len(at)), labels=at)

# plt.subplot(1, 2, 2)
# plt.imshow(F.softmax(y_mat, -1), cmap='jet')
# plt.grid(False)
# plt.yticks(np.arange(len(ct)), labels=ct)
# plt.xticks(np.arange(len(at)), labels=at, rotation=90)

pass