# Code Generation as a Dual Task of Code Summarization

```
@article{wei2019code,
  title={Code Generation as a Dual Task of Code Summarization},
  author={Wei, Bolin and Li, Ge and Xia, Xin and Fu, Zhiyi and Jin, Zhi},
  journal={arXiv preprint arXiv:1910.05923},
  year={2019}
}
```

<img src='https://i.imgur.com/RqN1agC.png' width='600' align='left'>

## References
- https://www.tensorflow.org/tutorials/text/nmt_with_attention
- https://blog.floydhub.com/attention-mechanism/

## Definitions

$x \; \text{: code snippets}, \; y \; \text{: comments}$

$P(x,y) = \color{#00a010}{P(x) \cdot P(y|x)} = \color{#a010a0}{P(y) \cdot P(x|y)}$

### Loss terms

$l_{xy} = -\frac{1}{m} \sum_{t=1}^{m} P(y_t | y_{\lt t}, x)$

$l_{yx} = -\frac{1}{n} \sum_{t=1}^{n} P(x_t | x_{\lt t}, y)$

$l_{dual}=\left[ \left(\color{#00a010}{\log\hat{P}(x) + \log P(y \vert x; \theta_{xy})} \right) - \left(\color{#a010a0}{\log\hat{P}(y) + \log P(x \vert y; \theta_{yx})} \right) \right]^{2} \text{ : regularization term}$

$l_{att} = l_1 + l_2, \text{ where } l_k = \mathcal{D}_{JS} \left( b_i, b_i' \right ) = \frac{1}{2n}\sum_{i=1}^{n} \mathcal{D}_{KL} \left(b_i \, || \, \frac{b_i + b_i'}{2} \right) + \mathcal{D}_{KL} \left(b_i' \, || \, \frac{b_i + b_i'}{2} \right)$

$b_i = softmax \left( A_{xy}[i, :] \right), \; b_i' = softmax \left( A_{yx}[i, :] \right)$

$A_{xy} \in \mathbb{R}^{n \times m}, \; A_{yx} \in \mathbb{R}^{m \times n} \text{ : attention weights}$

### Updates

$\text{Minibatch of } k \text{ pairs: } \langle \left(x_i, y_i\right) \rangle_{i=1}^{k}$

$
\begin{cases}
G_{xy} = \nabla_{\theta_{xy}} \frac{1}{k} \sum_{i=1}^{k} \left( l_{xy} + \lambda_{dual}^{(1)} \cdot l_{dual} + \lambda_{att}^{(1)} \cdot l_{att} \right)\\
G_{yx} = \nabla_{\theta_{yx}} \frac{1}{k} \sum_{i=1}^{k} \left( l_{yx} + \lambda_{dual}^{(2)} \cdot l_{dual} + \lambda_{att}^{(2)} \cdot l_{att} \right)
\end{cases}
$

$\text{Update } \theta_{xy} \text{ and } \theta_{yx} \text{ independently}$

### Notes
- The last encoder's hidden state is used to init the decoder's hidden state.

### Hyperparameters

In [1]:
import os
from argparse import Namespace

import numpy as np
import pandas as pd

from timeit import default_timer as timer
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import Django

### Globals

In [2]:
EMB_DIR    = '/home/alex/workspace/msc-research/embeddings'
DJANGO_DIR = '/home/alex/workspace/msc-research/raw-datasets/django/'

### Hyperparameters

In [3]:
HP = Namespace()
HP.batch_size = 5
HP.epochs     = 1

### Dataset

In [4]:
HP.dataset_config = Namespace()
HP.dataset_config.__dict__ = {
    'root_dir': DJANGO_DIR,
    'anno_min_freq': 1,
    'code_min_freq': 1,
    'anno_seq_maxlen': 40,
    'code_seq_maxlen': 40,
    'emb_file': os.path.join(EMB_DIR, 'glove.6B.50d.txt.pickle')
}

django = Django(config=HP.dataset_config)

> clean text
> construct vocab


HBox(children=(IntProgress(value=0, max=18805), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18805), HTML(value='')))


> tokenize
> pad
> build emb matrix
> load glove from pickle


HBox(children=(IntProgress(value=0, max=11705), HTML(value='')))


> DONE


### Encoder

<img src='https://i.stack.imgur.com/SjnTl.png' width='500' align='left'>

In [5]:
class Encoder(nn.Module):
    def __init__(self, hidden_size, input_maxlen, emb_matrix, num_layers=1, bidir=True):
        super(Encoder, self).__init__()
        
        self.hidden_size  = hidden_size
        self.input_maxlen = input_maxlen
        
        self.num_layers = num_layers
        self.bidir = bidir
        
        self.vocab_size, self.emb_dim = emb_matrix.shape
        
        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(emb_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.bidir_lstm = nn.LSTM(input_size=self.emb_dim,
                                  hidden_size=self.hidden_size,
                                  num_layers=self.num_layers,
                                  bidirectional=self.bidir,
                                  batch_first=True)
        
    def forward(self, x, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(batch_size=x.shape[0])
            
        emb = self.embedding(x)
        out, hidden = self.bidir_lstm(emb)
            
        return out, hidden
    
    def init_hidden(self, batch_size):
        d = 2 if self.bidir else 1
        z = torch.zeros(d * self.num_layers, batch_size, self.hidden_size)
        return (z,z)

In [6]:
enc = Encoder(hidden_size=512, input_maxlen=40, emb_matrix=django.emb_matrix, bidir=False, num_layers=1)

x = torch.randint(40, size=(17,40))
out, (h,c) = enc(x)
print('out:', out.shape, 'h:', h.shape, 'c:', c.shape)

out: torch.Size([17, 40, 512]) h: torch.Size([1, 17, 512]) c: torch.Size([1, 17, 512])


### Luong Attention

<img src='https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg' width='400' align='left'>

In [55]:
class LuongAttention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(LuongAttention, self).__init__()
        
        self.W = nn.Linear(enc_hidden_size, dec_hidden_size)
        
    def forward(self, ht, hs):
        # ht: decoder output: (batch_size, 1, hidden_size)
        # hs: encoder output: (batch_size, seq_len, hidden_size)
        # scode: (batch_size, 1, seq_len)
        
        print('ht', ht.shape)
        print('hs', hs.shape)
        print('whs', self.W(hs).shape)
        
        score   = torch.bmm(ht, self.W(hs).transpose(1, 2))
        align   = F.softmax(score, dim=-1)
        context = torch.bmm(align, hs)
        
        return context, align

#### Test

In [51]:
att = LuongAttention(512)
bs = 17
e = torch.rand(bs, 40, 512)
d = torch.rand(bs, 1, 512)
c, a = att(d, e)
print(c.shape, a.shape, a[0, 0].sum())

ht torch.Size([17, 1, 512])
hs torch.Size([17, 40, 512])
whs torch.Size([17, 40, 512])
torch.Size([17, 1, 512]) torch.Size([17, 1, 40]) tensor(1.0000, grad_fn=<SumBackward0>)


### Decoder

In [56]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hidden_size, dec_hidden_size, num_layers=1, bidir=False):
        super(Decoder, self).__init__()
        
        self.emb_dim = emb_dim
        self.hidden_size = dec_hidden_size
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        
        self.num_layers = num_layers
        self.bidir = bidir
        
        self.lstm = nn.LSTM(input_size=self.emb_dim,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            bidirectional=self.bidir,
                            batch_first=True)
        
#         self.linear = nn.Linear()
        
        self.attention = LuongAttention(enc_hidden_size, self.hidden_size)
        
    def forward(self, x, hidden, enc_output):
        context, align = self.attention(hidden, enc_output)
        
        emb = self.embedding(x)
        
        lstm_in = torch.cat((emb, context), dim=-1)
        
        output, state = self.lstm(lstm_in, hidden)
        
        return output

## Training

In [57]:
x = torch.randint(1000, size=(1,40))

enc = Encoder(hidden_size=512, 
              input_maxlen=40, 
              emb_matrix=django.emb_matrix, 
              bidir=True, 
              num_layers=1)

dec = Decoder(1000, 50, 1024, 512, 1, False)
out, (h, c) = enc(x)

inp = torch.tensor([12])
output, hidden = dec(inp, h, out)

ht torch.Size([2, 1, 512])
hs torch.Size([1, 40, 1024])
whs torch.Size([1, 40, 512])


RuntimeError: Expected tensor to have size 2 at dimension 0, but got size 1 for argument #2 'batch2' (while checking arguments for bmm)

### \#\#\# TESTING \#\#\#

In [12]:
j = 2808
# n = []
x = django.raw_example(j)['code']
xx = django.Y[j]
xx = xx[xx > 0]
x, ' '.join(list(map(lambda i: django.code_tok.index_word[i], xx)))

("parser . add_argument ( 'args' , metavar = 'app_label' , nargs = '+' ,  help = 'One or more application label.' )",
 "parser . add_argument ( 'args' , metavar = 'app_label' , nargs = '+' , help = 'One or more application label.' )")

In [32]:
from utils import load_pt_glove
e = load_pt_glove(f'{EMB_DIR}/glove.6B.50d.txt.pickle')
w = 'object'
assert np.isclose(e[w], django.emb_matrix[django.anno_tok.word_index[w]]).all()

> load glove from pickle
