# Transformer based model to translate English text to Python code

## Experiment 1 - Baseline Model

The goal is to  write a transformer-based model that can translats English text to python code(with proper whitespace indentations)

The training dataset contains around 4600+ examples of English text to python code. 
- must use transformers with self-attention, multi-head, and scaled-dot product attention in the model
- There is no limit on the number of training epochs or total number of parameters in the model
- should have trained a separate embedding layer for python keywords and paid special attention to whitespaces, colon and other things (like comma etc)
- model should to do proper indentation
- model should to use newline properly
- model should understand how to use colon (:)
- model should generate proper python code that can run on a Python interpreter and produce proper results


Some preprocessing checks on the dataset should be carried out like:
 - the dataset provided is divided into English and "python-code" pairs properly
the dataset does not have anomalies w.r.t. indentations (like a mixed-use of tabs and spaces, or use of either 4 or 3 spaces, it should be 4 spaces only). Either use tabs only or 4 spaces only, not both
- the length of the "python-code" generated is not out of your model's capacity


In [1]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pandas as pd
import json

from torchtext.legacy.data import Field, BucketIterator, LabelField, TabularDataset
import io
from io import BytesIO
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP, tok_name

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
datasets = [[]]
file_name = '/content/drive/MyDrive/seq2py/data/english_python_data_cleaned.txt'

with open(file_name) as f:
  #my_dict = {"description":[],"code":[]}
  for line in f:
    if line.startswith('#'):
      comment = line.split('\n#')
      if datasets[-1] != []:
        # we are in a new block
        datasets.append(comment)
    else:
      stripped_line = line#.strip()
      if stripped_line:
        datasets[-1].append(stripped_line)
# datasets[0].insert(0,'# write a python program to add two numbers ')        

In [4]:
raw_data = {'Description' : [re.sub(r"^#(\d)*(.)(\s)*",'',x[0]).strip() for x in datasets], 'Code': [''.join(x[1:]) for x in datasets]}
df = pd.DataFrame(raw_data, columns=["Description", "Code"])

In [5]:
df['Description'][1617]

'Python Program to Find Sum of Natural Numbers Using Recursion'

In [6]:
df.head()

Unnamed: 0,Description,Code
0,,
1,write a python program to add two numbers,\nnum1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\np...
2,write a python program to subtract two numbers,\nnum1 = 6\nnum2 = 3\ndiff = num1 - num2\nprin...
3,write a python function to add two user provid...,"def add_two_numbers(num1, num2):\n sum = nu..."
4,write a program to find and print the largest ...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 >= n...


In [7]:
df['Code'].replace("", float("NaN"), inplace=True)

In [8]:
df[df.isna().any(axis=1)]

Unnamed: 0,Description,Code
0,,


In [9]:
df.dropna(subset = ["Code"], inplace=True)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 1 to 4410
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  4410 non-null   object
 1   Code         4410 non-null   object
dtypes: object(2)
memory usage: 103.4+ KB


In [11]:

# Dividing the data into train and validation dataset

train_df = df.sample(frac = 0.80) 
  
# Creating dataframe with rest of the 20% values 
valid_df = df.drop(train_df.index)

In [12]:
print(f'train df {train_df}')
print(f'Valid df {valid_df}')

train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)

train df                                             Description                                               Code
1497  write a Python function to check if a string i...  \ndef check2(string) : \n    t = '01'\n    cou...
641                                     string to tuple  def sen_to_tuple(sen):\n    return tuple(sen)\n\n
1206  Write a python function to determine optimal b...  \ndef get_max_profit(stock_prices):\n    max_p...
792   write a Python function to Find the Intersecti...  def intersection(a, b):\n    return list(set(a...
3599  write a python program that converts a binary ...  binary_num = '1010101'\ndecimal_num = int(bina...
...                                                 ...                                                ...
2039  Write a function to return the surface area of...  def cal_area_hemisphere(radius):\n    pi = 3.1...
1824  1. python function to return the nth fibonacci...  def fib(n):\n    if n <= 1:\n        return n\...
451           write a progra

In [13]:
# import io
# from io import BytesIO
# from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP, tok_name

# def tokenize_code(text):
#     result = []
#     for tok in tokenize(io.BytesIO(text.encode('utf-8')).readline):
#         if tok_name[tok.exact_type] == 'NAME':
#             result.append(tok.string)
#         else:
#             result.append(tok_name[tok.exact_type])
#     return result

In [14]:
# tokenize_code(df['Code'][1])

In [15]:
'''
ENDMARKER = 0
NAME = 1
NUMBER = 2
STRING = 3
NEWLINE = 4
INDENT = 5
DEDENT = 6
LPAR = 7
RPAR = 8
LSQB = 9
RSQB = 10
COLON = 11
COMMA = 12
SEMI = 13
PLUS = 14
MINUS = 15
STAR = 16
SLASH = 17
VBAR = 18
AMPER = 19
LESS = 20
GREATER = 21
EQUAL = 22
DOT = 23
PERCENT = 24
LBRACE = 25
RBRACE = 26
EQEQUAL = 27
NOTEQUAL = 28
LESSEQUAL = 29
GREATEREQUAL = 30
TILDE = 31
CIRCUMFLEX = 32
LEFTSHIFT = 33
RIGHTSHIFT = 34
DOUBLESTAR = 35
PLUSEQUAL = 36
MINEQUAL = 37
STAREQUAL = 38
SLASHEQUAL = 39
PERCENTEQUAL = 40
AMPEREQUAL = 41
VBAREQUAL = 42
CIRCUMFLEXEQUAL = 43
LEFTSHIFTEQUAL = 44
RIGHTSHIFTEQUAL = 45
DOUBLESTAREQUAL = 46
DOUBLESLASH = 47
DOUBLESLASHEQUAL = 48
AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
COLONEQUAL = 53
OP = 54
AWAIT = 55
ASYNC = 56
TYPE_IGNORE = 57
TYPE_COMMENT = 58
# These aren't used by the C tokenizer but are needed for tokenize.py
ERRORTOKEN = 59
COMMENT = 60
NL = 61
ENCODING = 62
N_TOKENS = 63
# Special definitions for cooperation with parser
NT_OFFSET = 256
'''

"\nENDMARKER = 0\nNAME = 1\nNUMBER = 2\nSTRING = 3\nNEWLINE = 4\nINDENT = 5\nDEDENT = 6\nLPAR = 7\nRPAR = 8\nLSQB = 9\nRSQB = 10\nCOLON = 11\nCOMMA = 12\nSEMI = 13\nPLUS = 14\nMINUS = 15\nSTAR = 16\nSLASH = 17\nVBAR = 18\nAMPER = 19\nLESS = 20\nGREATER = 21\nEQUAL = 22\nDOT = 23\nPERCENT = 24\nLBRACE = 25\nRBRACE = 26\nEQEQUAL = 27\nNOTEQUAL = 28\nLESSEQUAL = 29\nGREATEREQUAL = 30\nTILDE = 31\nCIRCUMFLEX = 32\nLEFTSHIFT = 33\nRIGHTSHIFT = 34\nDOUBLESTAR = 35\nPLUSEQUAL = 36\nMINEQUAL = 37\nSTAREQUAL = 38\nSLASHEQUAL = 39\nPERCENTEQUAL = 40\nAMPEREQUAL = 41\nVBAREQUAL = 42\nCIRCUMFLEXEQUAL = 43\nLEFTSHIFTEQUAL = 44\nRIGHTSHIFTEQUAL = 45\nDOUBLESTAREQUAL = 46\nDOUBLESLASH = 47\nDOUBLESLASHEQUAL = 48\nAT = 49\nATEQUAL = 50\nRARROW = 51\nELLIPSIS = 52\nCOLONEQUAL = 53\nOP = 54\nAWAIT = 55\nASYNC = 56\nTYPE_IGNORE = 57\nTYPE_COMMENT = 58\n# These aren't used by the C tokenizer but are needed for tokenize.py\nERRORTOKEN = 59\nCOMMENT = 60\nNL = 61\nENCODING = 62\nN_TOKENS = 63\n# Special def

In [16]:
#https://docs.python.org/3/library/tokenize.html
# def tokenize_python(code_snippet):
#     tokens = tokenize(io.BytesIO(code_snippet.encode('utf-8')).readline)
#     parsed = []
#     for token in tokens:
#         if token.type not in [0,59,60,61,62,63,256]:
#             parsed.append(token.string)
#     return parsed

In [17]:
from token import tok_name as py_token_name
# from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP, tok_name, INDENT
import tokenize

TOKEN_NAME = py_token_name[NAME]
TOKEN_SUBNAME = 'SUBNAME'
TOKEN_NAMECON = 'NAMECON'

token_id = {token_name: token_id for token_id, token_name in enumerate(sorted(py_token_name.values()))}
token_name = {token_id[token_name]: token_name for token_name in token_id}

def add_token(name):
    tid = len(token_id)
    token_id[name] = tid
    token_name[tid] = name
    print(tid)
    return tid

SUBNAME = add_token(TOKEN_SUBNAME)
NAMECON = add_token(TOKEN_NAMECON)


def tokenize_camel_case(string):
    words = []
    from_char_position = 0
    for current_char_position, (current_char, prev_char) in enumerate(zip(string,'a' + string)):
        if prev_char.isupper() and current_char.islower() and from_char_position < current_char_position - 1:
            words.append(string[from_char_position:current_char_position - 1])
            from_char_position = current_char_position - 1
    words.append(string[from_char_position:])
    return words


def tokenize_underscore(string):
    words = []
    for word in string.split('_'):
        if words:
            words.append('_')
        if word:
            words.append(word)
    return words


def tokenize_name(string, concat_symbol=None):
    words = []
    con = [] if concat_symbol is None else [(NAMECON, concat_symbol)]
    for sub_name in tokenize_camel_case(string):
        for word in tokenize_underscore(sub_name):
            words += [(word)] + con
    
    if words and words[-1] == concat_symbol:
        return words[:-1]
    return words

def tokenize_python(string, concat_symbol=None):
    tokens_ = []
    OPENERS=('class', 'def', 'for', 'if', 'try', 'while')
    #for toknum, tokval, start, end, line  in tokenize(io.BytesIO(string.encode('utf-8')).readline):
    #tokens = tokenize.generate_tokens(io.StringIO(string).readline)
    for toknum, tokval, start, end, line  in tokenize.generate_tokens(io.StringIO(string).readline):
        tokname = py_token_name[toknum]
        if tokname == TOKEN_NAME:
            tokens_.extend(tokenize_name(tokval, concat_symbol=concat_symbol))
        if toknum == tokenize.INDENT:
            val = int(len(tokval)/4)
            tokens_.append(val*'\t')
        else:
            tokens_.append(tokval)
    return tokens_

59
60


In [18]:
print(df['Code'][8])

def print_factors(x):
    print(f"The factors of {x} are:")
    for i in range(1, x + 1):
        if x % i == 0:
            print(i)




In [19]:
print(tokenize_python(df['Code'][8]))

['def', 'def', 'print', '_', 'factors', 'print_factors', '(', 'x', 'x', ')', ':', '\n', '\t', 'print', 'print', '(', 'f"The factors of {x} are:"', ')', '\n', 'for', 'for', 'i', 'i', 'in', 'in', 'range', 'range', '(', '1', ',', 'x', 'x', '+', '1', ')', ':', '\n', '\t\t', 'if', 'if', 'x', 'x', '%', 'i', 'i', '==', '0', ':', '\n', '\t\t\t', 'print', 'print', '(', 'i', 'i', ')', '\n', '\n', '', '', '', '']


In [20]:
import spacy
spacy_en = spacy.load('en')

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize= tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True,
            batch_first=True)

TRG = Field(tokenize = tokenize_python, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=False,
            batch_first=True)

In [21]:
fields = [('Description', SRC),('Code',TRG)]


In [22]:
# Using tabular dataset to process the text

train_data, test_data = TabularDataset.splits(
                                path = '',   
                                train = './train.csv',
                                test = './valid.csv',
                                format = 'csv',
                                fields = fields)

In [23]:
BATCH_SIZE = 16
device = "cuda" if torch.cuda.is_available() else "cpu"

In [24]:
SRC.build_vocab(train_data)
TRG.build_vocab(train_data)

In [25]:
len(SRC.vocab)

2099

In [26]:
len(TRG.vocab)

6709

In [27]:
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.Code),
    device = device)

In [28]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]


        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]        
        return output, attention

In [29]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 2000):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]

        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
 
            
        return src

In [30]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

In [31]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

In [32]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

In [33]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 2000):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [34]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        # query, key, value
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [75]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 2
DEC_LAYERS = 2
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)


dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT,
              device)

In [76]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [77]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,638,837 trainable parameters


In [78]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [79]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)


In [80]:
model.apply(initialize_weights);


In [81]:
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [82]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.Description
        trg = batch.Code
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])

                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]

            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        #loss = maskNLLLoss(output, trg,model.trg_pad_idx)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [83]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.Description
            trg = batch.Code

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
           
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            
            loss = criterion(output, trg)
            #loss = maskNLLLoss(output, trg,model.trg_pad_idx)

            #loss,_ = maskNLLLoss(output, trg, mask)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [84]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [85]:
import time
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 12s
	Train Loss: 4.362 | Train PPL:  78.437
	 Val. Loss: 3.137 |  Val. PPL:  23.029
Epoch: 02 | Time: 0m 12s
	Train Loss: 2.906 | Train PPL:  18.282
	 Val. Loss: 2.598 |  Val. PPL:  13.431
Epoch: 03 | Time: 0m 12s
	Train Loss: 2.459 | Train PPL:  11.689
	 Val. Loss: 2.350 |  Val. PPL:  10.483
Epoch: 04 | Time: 0m 12s
	Train Loss: 2.174 | Train PPL:   8.792
	 Val. Loss: 2.139 |  Val. PPL:   8.492
Epoch: 05 | Time: 0m 12s
	Train Loss: 1.955 | Train PPL:   7.064
	 Val. Loss: 2.002 |  Val. PPL:   7.401
Epoch: 06 | Time: 0m 12s
	Train Loss: 1.781 | Train PPL:   5.934
	 Val. Loss: 1.929 |  Val. PPL:   6.885
Epoch: 07 | Time: 0m 12s
	Train Loss: 1.643 | Train PPL:   5.168
	 Val. Loss: 1.859 |  Val. PPL:   6.419
Epoch: 08 | Time: 0m 12s
	Train Loss: 1.525 | Train PPL:   4.596
	 Val. Loss: 1.802 |  Val. PPL:   6.060
Epoch: 09 | Time: 0m 12s
	Train Loss: 1.428 | Train PPL:   4.168
	 Val. Loss: 1.744 |  Val. PPL:   5.722
Epoch: 10 | Time: 0m 12s
	Train Loss: 1.347 | Train PPL

In [86]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 500):

    model.eval()
        
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor,src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:]#, attention

In [87]:
# Load model
trained_model = 'tut6-model.pt'
model.load_state_dict(torch.load(trained_model));
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(2099, 256)
    (pos_embedding): Embedding(2000, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.3, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.3, inplace=False)
        )
    

In [88]:
sentence = "write a program to find and print the largest among three numbers"
code = translate_sentence(sentence, SRC, TRG, model, device)
#output_words[:] = [x for x in code if not (x == '<eos>' or x == '<pad>' or x == '<unk>' or x == 'utf-8')]
print(f'predicted trg = {code}')
#print(f'predicted trg = {" ".join(code)}')


predicted trg = ['\n', 'num1', 'num1', '=', '12', '\n', 'num2', 'num2', 'num2', '=', '6.3', '\n', 'num3', 'num3', 'num3', 'num3', '=', 'num1', 'num1', 'num1', 'num1', '\n', 'if', 'if', '(', 'num2', 'num2', 'num2', 'num2', ')', 'and', '(', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', 'num1', ')', 'and', '(', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', ')', 'and', 'and', 'and', 'and', '(', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', ')', 'and', 'and', 'and', '(', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', 'num2', ')', 'and', 'and', 'and', 'and', '(', 'num1', '>=', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', 'num3', ')', '', '', '<eos>']


In [89]:
print("".join(code))


num1num1=12
num2num2num2=6.3
num3num3num3num3=num1num1num1num1
ifif(num2num2num2num2)and(num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1num1)and(num2num2num2num2num2num2num2num2num2num2num2num2)andandandand(num2num2num2num2num2num2num2num2num2)andandand(num2num2num2num2num2num2num2)andandandand(num1>=num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3num3)<eos>


In [90]:
sentence = "write a program to add two numbers"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}')
print("".join(code))

predicted trg = ['\n', 'a', 'a', '=', '[', '1', ',', '2', ',', '3', ',', '4', ',', '6', ',', '8', ',', '8', ',', '9', ',', '10', ']', '\n', 'b', '=', '4', '\n', 'sum', 'sum', 'sum', '=', '0', '\n', 'for', 'for', 'i', 'i', 'i', 'in', 'in', 'a', ':', '\n', '\t', 'if', 'if', 'i', 'i', '%', '2', '==', '0', ':', '\n', '\t\t', 'sum', 'sum', 'sum', 'sum', 'sum', 'sum', '=', 'i', 'i', 'i', '\n', '', '', 'print', '(', 'a', 'a', 'a', ')', '', '<eos>']

aa=[1,2,3,4,6,8,8,9,10]
b=4
sumsumsum=0
forforiiiinina:
	ififii%2==0:
		sumsumsumsumsumsum=iii
print(aaa)<eos>


In [91]:
sentence = "write a program to subtract two numbers"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}\n')
print("".join(code))

predicted trg = ['\n', 'def', 'def', 'compute', '_', 'hcf', 'compute_hcf', '(', 'x', ',', 'y', 'y', 'y', ')', ':', '\n', '\t', 'if', 'if', 'x', 'x', '>', 'y', ':', '\n', '\t\t', 'smaller', '=', 'y', 'y', '\n', '', 'else', 'else', ':', '\n', '\t\t', 'smaller', '=', 'y', '\n', '', 'for', 'for', 'i', 'i', 'i', 'in', 'in', 'range', 'range', '(', '1', ',', 'smaller', '+', '1', ')', ':', '\n', '\t\t', 'if', '(', 'y', 'y', 'y', 'y', 'y', 'y', ')', ':', '\n', '\t\t\t', 'hcf', '=', 'y', 'y', 'y', 'y', 'y', 'y', '\n', '', '', '', 'print', 'print', '(', '"The H.C.F. is"', ',', 'num2', 'num2', 'num2', ')', '', '<eos>']


defdefcompute_hcfcompute_hcf(x,yyy):
	ififxx>y:
		smaller=yy
elseelse:
		smaller=y
forforiiiininrangerange(1,smaller+1):
		if(yyyyyy):
			hcf=yyyyyy
printprint("The H.C.F. is",num2num2num2)<eos>


In [92]:
sentence = "write a program to print factorial of a number"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}\n')
print(" ".join(code))

predicted trg = ['\n', 'num', 'num', '=', 'int', 'int', '(', 'input', 'input', '(', '"Enter a number: "', ')', ')', '\n', 'factorial', 'factorial', 'factorial', 'factorial', 'factorial', 'factorial', 'factorial', '=', '1', '\n', 'if', 'if', 'num', 'num', 'num', 'num', 'num', 'num', 'num', 'num', 'num', '==', '0', ':', '\n', '\t', 'print', 'print', '(', '"Sorry, factorial does not exist for negative numbers"', ')', '', '', '', '<eos>']


 num num = int int ( input input ( "Enter a number: " ) ) 
 factorial factorial factorial factorial factorial factorial factorial = 1 
 if if num num num num num num num num num == 0 : 
 	 print print ( "Sorry, factorial does not exist for negative numbers" )    <eos>


In [93]:
sentence = "write a python function that return log of a number"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}\n')
print(" ".join(code))

predicted trg = ['def', 'def', 'square', '_', 'of', '_', 'roots', 'find_integer_square_roots', '(', 'num', 'num', ')', ':', '\n', '\t', 'if', 'num', 'num', 'num', 'num', 'num', '**', '2', ')', ':', '\n', '\t\t', 'return', 'return', 'num', 'num', '', '', '', '<eos>']

def def square _ of _ roots find_integer_square_roots ( num num ) : 
 	 if num num num num num ** 2 ) : 
 		 return return num num    <eos>


In [94]:
sentence = "return powerset of iterable"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}\n')
print(" ".join(code))

predicted trg = ['def', 'def', 'count', '_', 'count', '_', 'count', 'check_word_count', '(', 'iterable', 'iterable', 'iterable', 'iterable', ')', ':', '\n', '\t', 'from', 'itertools', 'itertools', 'itertools', 'itertools', 'itertools', 'import', 'import', 'import', 'import', 'import', 'groupby', '\n', 'return', 'return', 'return', '[', 'i', 'i', 'i', 'i', 'for', 'for', 'i', 'in', 'in', 'range', 'range', '(', '1', ',', 'len', 'len', '(', 'iterable', 'iterable', 'iterable', 'iterable', 'iterable', 'iterable', 'iterable', 'iterable', 'iterable', ')', ']', '', '', '<eos>']

def def count _ count _ count check_word_count ( iterable iterable iterable iterable ) : 
 	 from itertools itertools itertools itertools itertools import import import import import groupby 
 return return return [ i i i i for for i in in range range ( 1 , len len ( iterable iterable iterable iterable iterable iterable iterable iterable iterable ) ]   <eos>


In [95]:
sentence = "write a program to replace a string"
code = translate_sentence(sentence, SRC, TRG, model, device)
print(f'predicted trg = {code}\n')
print("".join(code))

predicted trg = ['str1', 'str1', '=', '"abc4234AFde"', '\n', 'digit', 'Count', 'digitCount', '=', '0', '\n', 'for', 'for', 'char', 'char', 'char', 'char', 'char', 'in', 'in', 'str1', 'str1', ':', '\n', '\t', 'char', 'char', 'char', 'char', 'char', 'char', 'char', 'char', 'char', '=', 'char', '\n', 'if', 'char', 'char', 'char', '.', 'islower', '(', ')', '\n', '', 'print', '(', 'char', ')', '', '<eos>']

str1str1="abc4234AFde"
digitCountdigitCount=0
forforcharcharcharcharcharininstr1str1:
	charcharcharcharcharcharcharcharchar=char
ifcharcharchar.islower()
print(char)<eos>


In [96]:

sentence = "area of a rectangle"
code = translate_sentence(sentence, SRC, TRG, model, device)
#print(f'predicted trg = {code}\n')## 
#print(tokenize.untokenize(code))

print(" ".join(code))

def def cal _ area _ rect cal_area_rect ( length length length length length ) : 
 	 return length length length length length length length length length length length length * breadth breadth breadth breadth breadth breadth breadth breadth breadth breadth breadth breadth breadth    <eos>
