In [7]:
from load_pairs_data import read_data
from pathlib import Path
import pandas as pd
import dill as dpickle
import torch
import numpy as np
from ktext.preprocess import processor
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
%matplotlib inline
from keras.models import Model, load_model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers
import os
# Expose only the CUDA device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"


# Switch consulted by the cells below: the `if USE_CACHE:` branches load previously
# saved artifacts from disk, while the `if not USE_CACHE:` branches rebuild them.
USE_CACHE = True

# Get Data

Data is retrieved through [GH Archive](https://www.gharchive.org/), an open source project that archives public GitHub code on BigQuery.  This demo uses Python top-level functions only, so the first step is to query this data.  Below is the SQL query used:

```{sql}
SELECT 
 max(concat(f.repo_name, ' ', f.path)) as repo_path,
 c.content
FROM `bigquery-public-data.github_repos.files` as f
JOIN `bigquery-public-data.github_repos.contents` as c on f.id = c.id
WHERE 
  f.path like '%.py' and --with python extension
  c.size < 15000 and --get rid of ridiculously long files
  REGEXP_CONTAINS(c.content, r'def ') --contains function
group by c.content
```

You can also view this query directly on BigQuery with the following URL: https://bigquery.cloud.google.com/savedquery/506213277345:c5e99f7fd4a04c67814eb7e992b49f6d

For convenience, I have cached the results of this query into a pickled dataframe hosted on Google Cloud: 



In [2]:
# Non-cached path: read the raw query results into a dataframe.
# The '...' argument appears to be a placeholder rather than a real location.
if not USE_CACHE:
    df = pd.read_csv('...')
    # NOTE(review): this expression is nested inside the `if`, so it is not the
    # cell's final top-level expression and the notebook will not render it.
    df.head()

# Parse Docstrings And Functions

We will take this data and write out three types of text files:

1. {train/valid/test}.function  : each line contains function tokens separated by spaces
2. {train/valid/test}.docstring : each line contains docstring tokens separated by spaces
3. {train/valid/test}.lineage   : each line contains a url link to the original function

All of these files have the same number of rows because row *i* in each file refers to the same function.  The training data only includes top-level functions that have docstrings.  **An important TODO is to use the code without any docstrings as a holdout set and run search on that.**

The parsing of code is done via python's built in `AST` module.

In [3]:
# Non-cached path: the parsing step is not implemented in this notebook, so
# running this cell with USE_CACHE = False stops here with NotImplementedError.
if not USE_CACHE:
    #TODO: Ho-Hsiang add your code here
    raise NotImplementedError

In [4]:
# Cached path: load the already-parsed data with `read_data` (imported from
# load_pairs_data). It is unpacked into six values: code, comment and lineage,
# each for the train and holdout splits.
if USE_CACHE:

    # Directory handed to read_data; also reused by the later cell that loads
    # the pre-computed vectors and text processors.
    PATH = Path('/ds/hohsiangwu/projects/semantic_search')

    train_code, train_comment, holdout_code, holdout_comment, train_lineage, holdout_lineage = \
    read_data(PATH)

train_code rows: 4,978,625
holdout_code rows: 50,290
total code rows: 5,028,915

train_comment rows: 4,978,625
holdout_comment rows: 50,290
total comment rows: 5,028,915

train_lineage rows: 4,978,625
holdout_comment rows: 50,290
total lineage rows: 5,028,915


# Pre-Process Data With `Ktext`

In [5]:
# Non-cached path: fit one ktext `processor` on the code and one on the comments,
# then (once finished) persist the fitted processors and their outputs.
# NOTE(review): in the visible notebook `train_code` / `train_comment` are only
# assigned inside the `if USE_CACHE:` cell, so this branch has no input data yet.
if not USE_CACHE:
    
    
    # Processor for the function bodies; keep_n=20000 is the vocabulary cap
    # passed to ktext (presumably the number of most frequent tokens kept).
    code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
    t_code = code_proc.fit_transform(train_code)

    # Processor for the docstrings/comments, configured with indicator tokens
    # (append_indicators=True) and 'post' padding.
    comment_proc = processor(append_indicators=True, 
                             hueristic_pct_padding=.7, 
                             keep_n=15000, 
                             padding ='post')
    
    t_comment = comment_proc.fit_transform(train_comment)
    
    # Deliberate stop: everything below this line is currently unreachable.
    raise NotImplementedError
    # TODO: finish this
    
    #Save the preprocessor
    with open('py_code_proc.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)

    with open('py_comment_proc.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)

    # Save the processed data
    np.save('py_t_code_vecs.npy', t_code)
    np.save('py_t_comment_vecs.npy', t_comment)

In [6]:
# Cached path: load the pre-computed arrays and fitted text processors from PATH
# using the seq2seq_utils helpers.
if USE_CACHE:
    # Encoder side: the code vectors plus `doc_length`, which is later used as
    # the encoder Input shape.
    encoder_input_data, doc_length = load_encoder_inputs(PATH/'py_t_code_vecs.npy')
    # Decoder side: the input and target arrays that are passed to fit().
    decoder_input_data, decoder_target_data = load_decoder_inputs(PATH/'py_t_comment_vecs.npy')
    # Each call returns a token count (used to size the Embedding/Dense layers)
    # and the processor object itself.
    num_encoder_tokens, body_pp = load_text_processor(PATH/'py_code_proc.dpkl')
    num_decoder_tokens, title_pp = load_text_processor(PATH/'py_comment_proc.dpkl')

Shape of encoder input: (4978625, 45)
Shape of decoder input: (4978625, 14)
Shape of decoder target: (4978625, 14)
Size of vocabulary for /ds/hohsiangwu/projects/semantic_search/py_code_proc.dpkl: 10,002
Size of vocabulary for /ds/hohsiangwu/projects/semantic_search/py_comment_proc.dpkl: 8,002


# Train Function Summarizer So You Can Use This For Transfer Learning

### Define Model

In [7]:
# Builds a GRU encoder-decoder (seq2seq) model in Keras. The encoder maps a
# token sequence of length `doc_length` to its final GRU state; the decoder GRU
# is initialised with that state and, fed the decoder input tokens, predicts a
# softmax over `num_decoder_tokens` at every step.

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 800

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional)
# x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
# x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the `encoder_output` just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU', dropout=.5)(x)

# Encapsulate the encoder as a separate entity so we can just
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

# Output of the encoder sub-model; used below as the decoder's initial state.
seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (ex: Issue Titles)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `seq2seq_encoder_out` as initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU', dropout=.5)
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
# End-to-end model: takes [encoder tokens, decoder tokens] and returns the
# per-step token distribution.
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 800)    6401600     Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 45)           0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 800)    3200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

### Train Model

In [8]:
# Either train the seq2seq function summarizer from scratch (USE_CACHE = False)
# or load a previously trained model and its text processors (USE_CACHE = True).
if not USE_CACHE:
    # Imported here so the training branch is self-contained: the notebook's
    # import cell does not bring these two callbacks into scope, and without
    # them this branch fails with NameError.
    from keras.callbacks import CSVLogger, ModelCheckpoint

    # Train Model
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0015), loss='sparse_categorical_crossentropy')

    # Prefix shared by the CSV training log and the checkpoint file names.
    script_name_base = 'py_func_sum_v2_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))
    # The doubled braces survive .format(), leaving {epoch:02d} / {val_loss:.5f}
    # in the file name template handed to ModelCheckpoint.
    model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                       save_best_only=True)

    batch_size = 900
    epochs = 60
    # Targets get a trailing axis of size 1 via expand_dims before being
    # passed to fit() with the sparse categorical cross-entropy loss.
    history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

if USE_CACHE:
    # NOTE: this rebinds PATH and replaces the processors, token counts and
    # seq2seq_Model created by earlier cells with the ones saved in this directory.
    PATH = Path('/ds/hamel/CodeML/Get_Python_From_BigQuery')

    num_encoder_tokens, body_pp = load_text_processor(PATH/'py_code_proc.dpkl')
    num_decoder_tokens, title_pp = load_text_processor(PATH/'py_comment_proc.dpkl')
    seq2seq_Model = load_model(PATH/'seq2seq_code_search_py_v2.hdf5')

Size of vocabulary for /ds/hamel/CodeML/Get_Python_From_BigQuery/py_code_proc.dpkl: 20,002
Size of vocabulary for /ds/hamel/CodeML/Get_Python_From_BigQuery/py_comment_proc.dpkl: 15,002


### Sanity Check Predictions of Function Summarizer :

In [10]:

# Wrap the processors and the seq2seq model in the Seq2Seq_Inference helper and
# print predictions for a few holdout examples.
from seq2seq_utils import Seq2Seq_Inference
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)
                                 
# The holdout code/comments are placed under 'body' / 'issue_title' columns and
# 'issue_url' is left empty; presumably these are the column names the helper
# expects (it looks written for issue data) -- confirm in seq2seq_utils.
demo_testdf = pd.DataFrame({'body':holdout_code, 'issue_title':holdout_comment, 'issue_url':''})
seq2seq_inf.demo_model_predictions(n=5, issue_df=demo_testdf)




Issue Body:
 def fit self X y None self random_state check_random_state self random_state X np asarray X code_init self V_init T if self V_init is not None else None dict_init self U_init T if self U_init is not None else None Vt _ E dict_learning X T self n_components self alpha tol self tol max_iter self max_iter method self method n_jobs self n_jobs verbose self verbose random_state self random_state code_init code_init dict_init dict_init self components_ Vt T self error_ E return self
 

Original Title:
 fit the model from data in x.


****** Machine Generated Title (Prediction) ******:
 fit the model from data in x



Issue Body:
 def test_nestedClass self self flakes def f foo class C bar foo def f self return foo return C f 123 f
 

Original Title:
 nested classes can access enclosing scope


****** Machine Generated Title (Prediction) ******:
 use a function to use a nested inside of cdata



Issue Body:
 def __init__ self total completed details self total total self compl

## Train Fast.AI Language Model

In [1]:
from language_model_utils import train_model

In [3]:
# Non-cached path: build the language model with the `train_model` helper
# imported from language_model_utils.
if not USE_CACHE:
    lang_model = train_model()

In [9]:
# Cached path: load the saved language model and the pre-computed embeddings.
if USE_CACHE:
    PATH = Path('/ds/hamel/CodeML/Get_Python_From_BigQuery/')
    # NOTE(review): torch.load unpickles the file, so only load model files
    # from a trusted source.
    lang_model = torch.load(PATH/'lm_fastai_codecomment_model.pytorch')
    # Put the model in evaluation mode and call its reset() before inference
    # (presumably clears the recurrent hidden state -- confirm in fastai).
    lang_model.eval()
    lang_model.reset()
    
    # Load Fastai Embeddings
    fastailm_emb = np.load(PATH/'combined_fastailm_emb.npy')

# Prepare Language Model For Inference

In [10]:
def str2arr(inp):
    """Turn a whitespace-delimited string into a column of token ids.

    The text is lower-cased and split on whitespace, each token is looked up in
    the notebook-level ``stoi`` mapping, and the ids are arranged as an array of
    shape (seq_len, 1). That array is then passed through the notebook-level
    ``T`` and ``V`` callables and the result is returned.
    """
    token_ids = [stoi[token] for token in inp.lower().split()]
    # Add a trailing axis so the ids form a single column.
    column = np.array(token_ids)[:, np.newaxis]
    return V(T(column))

def str2emb(inp):
    """Convert string to embedding with lang model.

    The string is converted with ``str2arr`` and run through the notebook-level
    ``lang_model``; the model is reset before and after the forward pass. The
    last element of the last element of the model output is taken as the
    hidden states, which are pooled three ways along axis 0 (mean, max, and
    the final entry) and concatenated along the last axis.

    Returns:
        A numpy array holding the concatenated [mean, max, last] pooling.
        (assumes the hidden states are (seq_len, batch, hidden) -- TODO confirm)
    """
    v_arr = str2arr(inp)
    lang_model.reset()
    hidden_states = lang_model(v_arr)[-1][-1]
    lang_model.reset()
    pooled = torch.cat([hidden_states.mean(0),
                        hidden_states.max(0)[0],
                        hidden_states[-1]],
                       -1)
    # .cpu() leaves CPU tensors untouched but copies CUDA tensors to host
    # memory first; calling .numpy() directly on a CUDA tensor raises.
    return pooled.data.cpu().numpy()

# Appendix

## File Locations

### Important Files in `/ds/hohsiangwu/projects/semantic_search`

1. `{train, valid, test}.function`:   text file, each line is a tokenized function 

         - train + valid function rows: 4,978,625
         - test function rows: 50,290
         - total function rows: 5,028,915


2. `{train, valid, test}.docstring`:  text file, each line is a tokenized docstring

         - train + valid comment rows: 4,978,625
         - test comment rows: 50,290
         - total comment rows: 5,028,915


### Important Files in `/ds/hamel/CodeML/Get_Python_From_BigQuery/`

1.  `use_emb.npy`            :          Google Universal Sentence Encoder.  shape: (4978625, 512)

2. `concat_train_avg_emb.npy`:          language model average pooling.     shape: (4978625, 400)

3. `concat_train_max_emb.npy`:          language model max pooling.         shape: (4978625, 400)

4. `concat_train_last_emb.npy`:         language model last hidden state.   shape: (4978625, 400)

5. `combined_fastailm_emb.npy`:         Horizontal concat of [2, 3, 4].     shape: (4978625, 1200)

6.  `codeSearch_Model_frozen.hdf5`:     My best keras model with val_loss = -0.8061 cosine proximity loss.   

```
_________________________________________________________________
Layer (type)                 Output Shape              Param #   

Encoder-Input (InputLayer)   (None, 55)                0         
_________________________________________________________________
Encoder-Model (Model)        (None, 800)               19847200  
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              820224    
_________________________________________________________________
bn-1 (BatchNormalization)    (None, 1024)              4096      
_________________________________________________________________
dense_4 (Dense)              (None, 1200)              1230000   


Total params: 21,901,520
Trainable params: 2,052,272
Non-trainable params: 19,849,248
```
7. `lm_fastai_codecomment_model.pytorch`:  This is the language model trained with fastai, you will have to have the latest version of the fast.ai library

8.  `lm_fastai_codecomment_model_state_dict`: This is the state dict of the language model, a more lightweight way of re-instantiating the model's parameters.

9.  `fitlam_index.nmslib`:  This is the nmslib index that has all the code after it has been vectorized by the language model.  

### Important Notebooks

1.  `hamel/CodeML/Get_Python_From_BigQuery/Parse_Ho_Hsian_Files.ipynb`: this is the notebook where I do things that are CPU-intensive:
 - preprocess the {training, validation, lineage} files. 
 - run all the comments through the vectorizers (did a version for both Google's and my own language model).
 - load all the vectors into an nmslib index
 - **This is the notebook where the actual demo lives**

2. `hamel/fastai/courses/dl1/lang-model-code-comments.ipynb`:  this notebook is where I trained the fastai language model, and also where I then used the trained model to vectorize all the comments.

3. `/hamel/CodeML/projects/function_summarizer/keras-code-search.ipynb` this is the notebook where I
 - train a function summarizer 
 - fine tune the function summarizer to predict embedding instead of docstring
 - make predictions for all the training data to vectorize all the code (happens on GPU).  This is the code that is loaded into the index where you want to do nearest neighbor search from.

4. `hamel/CodeML/Get_Python_From_BigQuery/Get%20Data%20For%20Python_Code_Search.ipynb` - I was using this notebook to build my own training set, but I abandoned it in favor of the training dataset that Ho-Hsiang gave me.