In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

# EN = spacy.load('en_core_web_sm')
import en_core_web_sm
import pandas as pd
from sklearn.model_selection import train_test_split

# from general_utils import apply_parallel, flattenlist
# Load the small English spaCy model (package form of spacy.load('en_core_web_sm')).
# NOTE(review): `EN` is not referenced by any later cell visible here — confirm
# it is still needed before paying the model-load cost.
EN = en_core_web_sm.load()

from ktext.preprocess import processor
import pandas as pd

In [None]:
# Selects the vectorization path used below: True reloads previously saved
# processors and training arrays from disk, False refits the processors on `train`.
use_cache = True

In [None]:
# Load the function/docstring table (presumably written by an earlier
# preprocessing step — verify).
# NOTE: unpickling can execute arbitrary code; only read pickles you trust.
df = pd.read_pickle('./data/dataframe_processed.pkl')

## Separate functions with and without docstrings

In [None]:
def listlen(x):
    """Return ``len(x)`` when ``x`` is a list; every other input counts as 0.

    Non-list values (``None``, NaN, strings, tuples, ...) yield 0 instead of
    raising, so the function can be applied to a column with missing entries.
    """
    return len(x) if isinstance(x, list) else 0

# Separate functions with and without usable docstrings.
# A docstring needs at least MIN_DOCSTRING_WORDS whitespace-separated words to
# be considered valid.
MIN_DOCSTRING_WORDS = 3

# Count the words once and reuse the result for both partitions, so the two
# frames are guaranteed to be complementary. `listlen` maps any non-list result
# of str.split() (e.g. a missing docstring) to 0.
docstring_word_counts = df.docstring_tokens.str.split().apply(listlen)
has_valid_docstring = docstring_word_counts >= MIN_DOCSTRING_WORDS

with_docstrings = df[has_valid_docstring]
without_docstrings = df[~has_valid_docstring]

## Partition code by repository to minimize leakage between train, valid & test sets. 
Rough assumption that each repository has its own style.  We want to avoid having code from the same repository in the training set as well as the validation or holdout set.

In [None]:
# Group the rows that have a valid docstring by their 'nwo' value (presumably
# the repository identifier, given the section text above — verify) so the
# split below can keep each group's rows together.
grouped = with_docstrings.groupby('nwo')

In [None]:
# train, valid, test splits
# The split is done over list(grouped), i.e. over whole (nwo, rows) groups, so
# every group lands entirely in train or entirely in test. train_size=0.87 is
# therefore a fraction of the groups, not of the rows. The seed is fixed.
train, test = train_test_split(list(grouped), train_size=0.87, shuffle=True, random_state=8081)
# train, valid = train_test_split(train, train_size=0.82, random_state=8081)

In [None]:
# Flatten each list of (nwo, rows) groups back into one DataFrame with a fresh
# 0..n-1 index.
# NOTE: this rebinds `train`/`test` from lists of groups to DataFrames, so the
# split cell has to be re-run before this cell can be executed again.
train = pd.concat([repo_rows for _nwo, repo_rows in train], ignore_index=True)
# valid = pd.concat([d for _, d in valid]).reset_index(drop=True)
test = pd.concat([repo_rows for _nwo, repo_rows in test], ignore_index=True)

In [None]:
# Report the row count of each partition (formatted with thousands separators).
print(f'train set num rows {train.shape[0]:,}')
# print(f'valid set num rows {valid.shape[0]:,}')
print(f'test set num rows {test.shape[0]:,}')
print(f'without docstring rows {without_docstrings.shape[0]:,}')

Preview what the training set looks like.  You can start to see how the data looks: the function tokens and docstring tokens are what will be fed downstream into the models.  The other information is important for diagnostics and bookkeeping.

In [None]:
# Show the first rows of the training split.
train.head()

In [None]:
# Shape of the 'api_sequence' column of the training split.
train['api_sequence'].shape

In [None]:
# First rows of the full table (before the docstring filter and the split).
df.head()

In [None]:
# Cached path: reload the four fitted processors and the vectorized training
# arrays from disk instead of refitting them.
# NOTE(review): the processors are read from ./data/seq2seq/processors/, while
# the save cell later in this notebook writes them to ./data/seq2seq/. Nothing
# visible here creates the processors/ copies — confirm the files exist there.
# NOTE: dill.load can execute arbitrary code; only load files you trust.
if use_cache:
    OUTPUT_PATH = Path('./data/seq2seq/processors/')
    import dill as dpickle
    import numpy as np

    with open(OUTPUT_PATH/'function_token_processor.dpkl', 'rb') as f:
        function_token_processor = dpickle.load(f)

    with open(OUTPUT_PATH/'docstring_processor.dpkl', 'rb') as f:
        docstring_processor = dpickle.load(f)

    with open(OUTPUT_PATH/'methname_processor.dpkl', 'rb') as f:
        methname_processor = dpickle.load(f)

    with open(OUTPUT_PATH/'api_seq_processor.dpkl', 'rb') as f:
        api_seq_processor = dpickle.load(f)
    
    # Vectorized training arrays, one per input view plus the docstrings.
    train_token_v = np.load('./data/seq2seq/train.tokens.npy')
    train_api_seq_v = np.load('./data/seq2seq/train.apiseq.npy')
    train_methname_v = np.load('./data/seq2seq/train.methname.npy')
    train_docstring_v = np.load('./data/seq2seq/train.desc.npy')
    

## Vectorize the training set by creating a bag of words model with vocabulary size 20,000

In [None]:
# Uncached path: fit one ktext processor per view of the training data and keep
# both the fitted processor and the vectorized training array.
if not use_cache:
    from ktext.preprocess import processor

    keep_n = 20000  # vocabulary size, per the section heading

    # Settings shared by all four processors.
    # NOTE(review): presumably 'post' pads/truncates at the end of each sequence
    # and hueristic_pct_padding picks the padded length from a length
    # percentile — confirm against the ktext documentation.
    shared_kwargs = dict(hueristic_pct_padding=0.7, keep_n=keep_n,
                         padding='post', truncating='post')

    # todo, probably tokens should also be seq to seq
    # The function-token processor is the only one built without append_indicators.
    function_token_processor = processor(**shared_kwargs)
    train_token_v = function_token_processor.fit_transform(train['function_tokens'])

    docstring_processor = processor(append_indicators=True, **shared_kwargs)
    train_docstring_v = docstring_processor.fit_transform(train['docstring_tokens'])

    methname_processor = processor(append_indicators=True, **shared_kwargs)
    train_methname_v = methname_processor.fit_transform(train['tokenized_function_name'])

    api_seq_processor = processor(append_indicators=True, **shared_kwargs)
    train_api_seq_v = api_seq_processor.fit_transform(train['api_sequence'])


In [None]:
# Write each processor's document-length statistics to its own CSV file in the
# current working directory (no index column).
for _proc, _csv_name in (
    (function_token_processor, 'function_token_processor.hist.csv'),
    (docstring_processor, 'docstring_processor.hist.csv'),
    (methname_processor, 'methname_processor.hist.csv'),
    (api_seq_processor, 'api_seq_processor.hist.csv'),
):
    _proc.document_length_stats.to_csv(_csv_name, index=False)

In [None]:
# Shape of the vectorized function-token array for the training split.
print(train_token_v.shape)

In [None]:
# Inspect the first row of the vectorized function tokens.
train_token_v[0]

In [None]:
# Shape of the vectorized docstring array for the training split.
print(train_docstring_v.shape)

In [None]:
# Shape of the vectorized API-sequence array for the training split.
print(train_api_seq_v.shape)

In [None]:
# Shape of the vectorized method-name array for the training split.
print(train_methname_v.shape)

In [None]:
OUTPUT_PATH = Path('./data/seq2seq/')
import dill as dpickle
import numpy as np

# Create the output directory up front so a fresh checkout does not fail with
# FileNotFoundError on the first open(..., 'wb').
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Save the preprocessors: one dill file per fitted processor, written in the
# same order and under the same file names as before.
for _dpkl_name, _proc in (
    ('function_token_processor.dpkl', function_token_processor),
    ('docstring_processor.dpkl', docstring_processor),
    ('methname_processor.dpkl', methname_processor),
    ('api_seq_processor.dpkl', api_seq_processor),
):
    with open(OUTPUT_PATH/_dpkl_name, 'wb') as f:
        dpickle.dump(_proc, f)

In [None]:
import tables

def save_vecs(vecs, fout):
    """Write ``vecs`` to ``fout`` in NumPy ``.npy`` format.

    Args:
        vecs: array (or array-like) to store.
        fout: destination path or open binary file object, as accepted by
            ``np.save``.

    Returns:
        None.
    """
    np.save(file=fout, arr=vecs)

In [None]:
# Persist the training function-token vectors.
save_vecs(train_token_v, './data/seq2seq/train.tokens.npy')

In [None]:
# Persist the training API-sequence vectors.
save_vecs(train_api_seq_v, './data/seq2seq/train.apiseq.npy')

In [None]:
# Persist the training method-name vectors.
save_vecs(train_methname_v, './data/seq2seq/train.methname.npy')

In [None]:
# Persist the training docstring vectors.
save_vecs(train_docstring_v, './data/seq2seq/train.desc.npy')

In [None]:
# Vectorize the function tokens of the rows without a valid docstring, using
# the already fitted/loaded function-token processor (transform only).
w_tokens = function_token_processor.transform_parallel(without_docstrings['function_tokens'])

In [None]:
# Same for the API sequences of the rows without a valid docstring.
w_apiseq = api_seq_processor.transform_parallel(without_docstrings['api_sequence'])

In [None]:
# Same for the tokenized method names of the rows without a valid docstring.
w_methname = methname_processor.transform_parallel(without_docstrings['tokenized_function_name'])

In [None]:
# Persist the three vectorized views of the rows without a valid docstring, in
# the order tokens, API sequence, method name.
for _vecs, _npy_path in (
    (w_tokens, './data/seq2seq/without_docstring.tokens.npy'),
    (w_apiseq, './data/seq2seq/without_docstring.apiseq.npy'),
    (w_methname, './data/seq2seq/without_docstring.methname.npy'),
):
    save_vecs(_vecs, _npy_path)

# Generating Test vectors

In [None]:
# Vectorize the held-out test split with the processors fitted on (or loaded
# for) the training data; transform only, so the vocabularies are unchanged.
test_token_v = function_token_processor.transform_parallel(test['function_tokens'])
test_api_seq_v = api_seq_processor.transform_parallel(test['api_sequence'])
test_methname_v = methname_processor.transform_parallel(test['tokenized_function_name'])
test_docstring_v = docstring_processor.transform_parallel(test['docstring_tokens'])

In [None]:
# Persist the four vectorized views of the test split, in the order tokens,
# API sequence, method name, docstring.
for _vecs, _npy_path in (
    (test_token_v, './data/seq2seq/test.tokens.npy'),
    (test_api_seq_v, './data/seq2seq/test.apiseq.npy'),
    (test_methname_v, './data/seq2seq/test.methname.npy'),
    (test_docstring_v, './data/seq2seq/test.desc.npy'),
):
    save_vecs(_vecs, _npy_path)

# Generating Vocab

In [None]:
import pickle

# Pickle each processor's `token2id` mapping to its own vocabulary file, in the
# order API sequence, method name, docstring, function tokens.
for _vocab_path, _proc in (
    ('./data/seq2seq/vocab.apiseq.pkl', api_seq_processor),
    ('./data/seq2seq/vocab.methname.pkl', methname_processor),
    ('./data/seq2seq/vocab.desc.pkl', docstring_processor),
    ('./data/seq2seq/vocab.tokens.pkl', function_token_processor),
):
    with open(_vocab_path, 'wb') as f:
        pickle.dump(_proc.token2id, f)

In [None]:
%reload_ext autoreload
%autoreload 2
OUTPUT_PATH = Path('./data/seq2seq/')
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


# Inputs for the function-token -> docstring model, read back from the files
# written by the save cells above.
# NOTE(review): the meaning of each returned pair is inferred from the names it
# is unpacked into (encoder array + sequence length, decoder input + target,
# vocabulary size + processor) — confirm against seq2seq_utils.
encoder_input_data, encoder_seq_len = load_encoder_inputs('./data/seq2seq/train.tokens.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('./data/seq2seq/train.desc.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'function_token_processor.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'docstring_processor.dpkl')

In [None]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


# Inputs for the API-sequence -> docstring model. The encoder side gets its own
# api_* names; the decoder names are rebound from the same files as in the
# function-token cell.
api_encoder_input_data, api_encoder_seq_len = load_encoder_inputs('./data/seq2seq/train.apiseq.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('./data/seq2seq/train.desc.npy')
api_num_encoder_tokens, api_enc_pp = load_text_processor(OUTPUT_PATH/'api_seq_processor.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'docstring_processor.dpkl')

In [None]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


# Inputs for the method-name -> docstring model. The encoder side gets its own
# methname_* names; the decoder names are rebound from the same files as in the
# function-token cell.
methname_encoder_input_data, methname_encoder_len = load_encoder_inputs('./data/seq2seq/train.methname.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('./data/seq2seq/train.desc.npy')
methname_num_encoder_tokens, methname_enc_pp = load_text_processor(OUTPUT_PATH/'methname_processor.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'docstring_processor.dpkl')



# Build Seq2Seq Model For Summarizing Code

We will build a model to predict the docstring given a function or a method.  While this is a very cool task in itself, this is not the end goal of this exercise.  The motivation for training this model is to learn a general purpose feature extractor for code that we can use for the task of code search.

In [None]:
from seq2seq_utils import build_seq2seq_model

The convenience function `build_seq2seq_model` constructs the architecture for a sequence-to-sequence model.  

The architecture built for this tutorial is a minimal example with only one layer for the encoder and decoder, and does not include things like [attention](https://nlp.stanford.edu/pubs/emnlp15_attn.pdf).  We encourage you to try and build different architectures to see what works best for you!

In [None]:
# Build the function-token -> docstring model. The embedding size (800) and
# hidden-state size (1000) are hard-coded here; the encoder sequence length and
# both vocabulary sizes come from the loading cell above.
seq2seq_Model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=1000,
                                    encoder_seq_len=encoder_seq_len,
                                    num_encoder_tokens=num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)

In [None]:
# Print the model's summary.
seq2seq_Model.summary()

In [None]:
from tensorflow.python.client import device_lib
# List the devices TensorFlow can see; useful to confirm a GPU is present
# before the training cells, which place their work on '/gpu:0'.
print(device_lib.list_local_devices())
# !which python

### Train Seq2Seq Model

In [None]:
from keras.models import Model, load_model
import pandas as pd
import logging

from keras.callbacks import CSVLogger, ModelCheckpoint
import numpy as np
from keras import optimizers

import tensorflow as tf
# your code here
# Compile and train the function-token -> docstring model under an explicit
# '/gpu:0' device scope.
with tf.device('/gpu:0'):
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.00005), loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    # Training log is written to '<script_name_base>.log'.
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    # Checkpoint file names embed the epoch number and the validation loss.
    # With save_best_only=True presumably only improving epochs are written —
    # confirm against the installed Keras version.
    model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                       save_best_only=True)

    batch_size = 1100
    epochs = 20
    # The targets get a trailing axis via np.expand_dims(..., -1).
    # NOTE(review): validation_split=0.12 holds out part of these training
    # arrays (per the Keras docs, the last rows before shuffling — TODO confirm);
    # this is separate from the repository-level train/test split above.
    history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

## Training Seq To Seq for API sequence extracted from AST

In [None]:
from seq2seq_utils import build_seq2seq_model
# Build the API-sequence -> docstring model with the same hard-coded embedding
# and hidden-state sizes as the function-token model; only the encoder inputs
# (sequence length, vocabulary size) differ.
api_seq_Model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=1000,
                                    encoder_seq_len=api_encoder_seq_len,
                                    num_encoder_tokens=api_num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)


from keras.models import Model, load_model
import pandas as pd
import logging

from keras.callbacks import CSVLogger, ModelCheckpoint, TensorBoard
import numpy as np
from keras import optimizers

import tensorflow as tf
# your code here
# Compile and train under an explicit '/gpu:0' device scope. Rebinds
# `script_name_base`, `csv_logger`, `model_checkpoint` and `history` from the
# previous training cell.
with tf.device('/gpu:0'):
    api_seq_Model.compile(optimizer=optimizers.Nadam(lr=0.00005), loss='sparse_categorical_crossentropy')

    script_name_base = 'api_seq_model_'
    # Training log is written to '<script_name_base>.log'.
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    # Checkpoint file names embed the epoch number and the validation loss.
    model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                       save_best_only=True)
    
    # TensorBoard callback with default arguments; only this training cell uses one.
    tensorboard = TensorBoard()

    batch_size = 1100
    epochs = 20
    # Same fit settings as the function-token model, plus the TensorBoard callback.
    history = api_seq_Model.fit([api_encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.12, callbacks=[csv_logger, model_checkpoint, tensorboard])

## Method name training

In [None]:
from seq2seq_utils import build_seq2seq_model
# Build the method-name -> docstring model with the same hard-coded embedding
# and hidden-state sizes as the other two models; only the encoder inputs
# (sequence length, vocabulary size) differ.
methname_model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=1000,
                                    encoder_seq_len=methname_encoder_len,
                                    num_encoder_tokens=methname_num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)


from keras.models import Model, load_model
import pandas as pd
import logging

from keras.callbacks import CSVLogger, ModelCheckpoint
import numpy as np
from keras import optimizers

import tensorflow as tf
# your code here
# Compile and train under an explicit '/gpu:0' device scope. Rebinds
# `script_name_base`, `csv_logger`, `model_checkpoint` and `history` from the
# previous training cell.
with tf.device('/gpu:0'):
    methname_model.compile(optimizer=optimizers.Nadam(lr=0.00005), loss='sparse_categorical_crossentropy')

    script_name_base = 'methname_model_'
    # Training log is written to '<script_name_base>.log'.
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    # Checkpoint file names embed the epoch number and the validation loss.
    model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                       save_best_only=True)

    batch_size = 1100
    epochs = 20
    # Same fit settings as the function-token model.
    history = methname_model.fit([methname_encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

### Manually Inspect Results (on holdout set)

In [None]:
from keras.models import load_model
# Replace the in-memory model with one specific checkpoint file.
# NOTE(review): the file name embeds the epoch and validation loss of one
# particular run (epoch 20, val 2.39946). A new training run will write
# different names, so this path must be updated after retraining.
seq2seq_Model = load_model('py_func_sum_v9_.epoch20-val2.39946.hdf5')

In [None]:
from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Bundle the function-token model with its encoder/decoder processors for
# inference.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                 decoder_preprocessor=dec_pp,
                                 seq2seq_model=seq2seq_Model)

# Demo frame built from the held-out test split: the function tokens as 'code',
# the docstring tokens as 'comment', and an empty-string 'ref' column.
demo_testdf = pd.DataFrame({'code':test['function_tokens'], 'comment':test['docstring_tokens'], 'ref':''})
# Run the prediction demo on that frame with n=15.
seq2seq_inf.demo_model_predictions(n=15, df=demo_testdf)

In [None]:
from keras.models import load_model
# Replace the in-memory model with one specific checkpoint file.
# NOTE(review): the file name embeds the epoch and validation loss of one
# particular run (epoch 20, val 2.58093). A new training run will write
# different names, so this path must be updated after retraining.
api_seq_Model = load_model('api_seq_model_.epoch20-val2.58093.hdf5')

In [None]:
from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Bundle the API-sequence model with its encoder processor and the shared
# docstring (decoder) processor for inference.
api_seq_Model_inf = Seq2Seq_Inference(encoder_preprocessor=api_enc_pp,
                                 decoder_preprocessor=dec_pp,
                                 seq2seq_model=api_seq_Model)

# Demo frame built from the held-out test split: the API sequence as 'code',
# the docstring tokens as 'comment', and an empty-string 'ref' column.
# Rebinds `demo_testdf` from the previous demo cell.
demo_testdf = pd.DataFrame({'code':test['api_sequence'], 'comment':test['docstring_tokens'], 'ref':''})
# Run the prediction demo on that frame with n=15.
api_seq_Model_inf.demo_model_predictions(n=15, df=demo_testdf)

In [None]:
from keras.models import load_model
# Replace the in-memory model with one specific checkpoint file.
# NOTE(review): the file name embeds the epoch and validation loss of one
# particular run (epoch 20, val 2.59926). A new training run will write
# different names, so this path must be updated after retraining.
methname_model = load_model('methname_model_.epoch20-val2.59926.hdf5')

In [None]:
from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Bundle the method-name model with its encoder processor and the shared
# docstring (decoder) processor for inference.
methname_inf = Seq2Seq_Inference(encoder_preprocessor=methname_enc_pp,
                                 decoder_preprocessor=dec_pp,
                                 seq2seq_model=methname_model)

# Demo frame built from the held-out test split: the tokenized method name as
# 'code', the docstring tokens as 'comment', and an empty-string 'ref' column.
# Rebinds `demo_testdf` from the previous demo cell.
demo_testdf = pd.DataFrame({'code':test['tokenized_function_name'], 'comment':test['docstring_tokens'], 'ref':''})
# Run the prediction demo on that frame with n=15.
methname_inf.demo_model_predictions(n=15, df=demo_testdf)

# Save model to disk

Save the model to disk so you can use it in Step 4 of this tutorial.

In [None]:
# Persist the function-token -> docstring model under OUTPUT_PATH.
# The pathlib.Path is converted to str because older Keras releases accept only
# string paths (or open HDF5 objects) in Model.save and reject Path objects.
seq2seq_Model.save(str(OUTPUT_PATH/'code_summary_seq2seq_model.h5'))

In [None]:
# Persist the method-name and API-sequence models under OUTPUT_PATH.
# The pathlib.Path is converted to str because older Keras releases accept only
# string paths (or open HDF5 objects) in Model.save and reject Path objects.
methname_model.save(str(OUTPUT_PATH/'methname_seq2seq_model.h5'))
api_seq_Model.save(str(OUTPUT_PATH/'api_seq_seq2seq_model.h5'))