#### Evaluating the generated documentation

In [2]:
import json
import os

import numpy as np
import pandas as pd
from evaluate import load
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

In [None]:
# Directory holding the notebooks to compare. Set the DOCGEN_DATA_PATH environment
# variable to point at another location; the default is the original local path.
# The value must end with a path separator, because file names are appended to it
# by plain string concatenation.
DATA_PATH = os.environ.get(
    'DOCGEN_DATA_PATH',
    'C:\\Users\\dmasrour\\Documents\\CodeDoc_Generation\\Data_backup\\DocGen_Unprocessed_Notebooks\\kaggle-courses-master\\intro_to_machine_learning\\',
)

In [73]:
# Parse the original notebook's JSON and flatten its cells into a table,
# keeping only each cell's type and its source lines.
with open(DATA_PATH + '03-your-first-machine-learning-model.ipynb', encoding="utf8") as jsonfile:
    original_nb = pd.json_normalize(json.load(jsonfile)['cells']).loc[:, ['cell_type', 'source']]

In [15]:
# Parse the PLBART-documented notebook's JSON and flatten its cells into a table,
# keeping only each cell's type and its source lines.
with open(DATA_PATH + '03-your-first-machine-learning-model.ipynbPLBART_documented.ipynb', encoding="utf8") as jsonfile:
    docgen_nb = pd.json_normalize(json.load(jsonfile)['cells']).loc[:, ['cell_type', 'source']]

In [74]:
# Preview the first rows and the (rows, columns) shape of the original notebook's cell table.
original_nb.head(), original_nb.shape

(  cell_type                                             source
 0  markdown  [**[Introduction to Machine Learning Home Page...
 1  markdown  [## Recap\n, So far, you have loaded your data...
 2      code  [# Code you have previously used to load data\...
 3  markdown  [# Exercises\n, \n, ## Step 1: Specify Predict...
 4      code  [# print the list of columns in the dataset to...,
 (22, 2))

In [18]:
# Preview the first rows and the (rows, columns) shape of the generated notebook's cell table.
docgen_nb.head(), docgen_nb.shape

(  cell_type                                             source
 0  markdown      [Loads the data from the previous used code.]
 1      code  [# Code you have previously used to load data\...
 2  markdown  [Print the list of columns in the dataset to t...
 3      code  [# print the list of columns in the dataset to...
 4  markdown                  [Check the value of thealePrice.],
 (16, 2))

In [68]:
# Inspect the source lines stored for the row labelled 0 (the first cell).
original_nb.loc[0, 'source']

['**[Introduction to Machine Learning Home Page](https://www.kaggle.com/learn/intro-to-machine-learning)**\n',
 '\n',
 '---\n']

#### BERT score (takes too long without a GPU -> see Google Colab)

In [22]:
# Load the "bertscore" metric through `evaluate.load`.
bertscore = load("bertscore")

In [None]:
# Smoke test of the metric: predictions and references are identical toy sentences.
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]
results = bertscore.compute(predictions=predictions, references=references, lang="en")

#### Smoothed BLEU-1 score

In [5]:
# Load the table of generated documentation; the first CSV column is used as the index.
generated_doc = pd.read_csv('../data/generated_doc.csv', index_col=0)

In [11]:
# Preview the first rows of the generated-documentation table.
generated_doc.head()

Unnamed: 0,markdown,generated_doc
0,linear algebra,Import numpy as np
1,"data processing, CSV file I O e.g. pd.read csv",Import pandas data frame.
2,"For example, running this by clicking run or p...",Import datetime module.
3,for some statistics,Imports the scipy. stats module and imports th...
4,Load data,Reads the data from the house prices training ...


In [14]:
# Drop rows with any missing value: bleu() calls .split() on every entry, and a
# NaN (float) has no such method. Rebinding the name instead of using
# inplace=True avoids mutating the frame in place.
generated_doc = generated_doc.dropna()

In [6]:
def bleu(ref, gen):
    '''
    Calculate a smoothed corpus-level BLEU-1 score using the nltk implementation.

    Sentences are tokenized by whitespace. The i-th entry of `ref` is the single
    reference for the i-th entry of `gen`.

    Args:
        ref : an iterable of reference sentences (str)
        gen : an iterable of candidate (generated) sentences (str), aligned with ref
    Returns:
        bleu score (float)
    Raises:
        ValueError: if ref and gen do not contain the same number of sentences
    '''
    gen_bleu = [sentence.split() for sentence in gen]
    # corpus_bleu expects a list of references per hypothesis, hence the extra nesting.
    ref_bleu = [[sentence.split()] for sentence in ref]
    if len(ref_bleu) != len(gen_bleu):
        raise ValueError(
            "ref and gen must have the same number of sentences "
            "(got {} and {})".format(len(ref_bleu), len(gen_bleu))
        )
    cc = SmoothingFunction()
    # weights=(1, 0, 0, 0) puts all the weight on unigram precision (BLEU-1).
    return corpus_bleu(ref_bleu, gen_bleu, weights=(1, 0, 0, 0), smoothing_function=cc.method4)

In [16]:
# Human-written markdown is the reference; the generated text is the candidate.
# Both lists come from the same frame, so they stay aligned row by row.
references = list(generated_doc['markdown'])
predictions = list(generated_doc['generated_doc'])
bleu(references, predictions)

0.03583534217950505

#### BERT vectors cosine similarity (inaccurate)

In [54]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
from scipy.spatial.distance import cosine

In [53]:
# Load the tokenizer and the model for the "microsoft/codebert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [76]:
# Reference side: the original markdown text together with the code it documents.
ref = "Select the target variable, which corresponds to the sales price. Save this to a new variable called `y`. You'll need to print a list of the columns to find the name of the column you need."
code_ref = "home_data.columns"
toks_ref = tokenizer.tokenize(ref)
code_toks_ref = tokenizer.tokenize(code_ref)

# Sequence layout: CLS, text tokens, SEP, code tokens, SEP.
tokens = [tokenizer.cls_token] + toks_ref + [tokenizer.sep_token] + code_toks_ref + [tokenizer.sep_token]

tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
# Inference only, so disable autograd bookkeeping for the forward pass.
# [None, :] adds a leading batch dimension; [0] takes the first model output
# (presumably the per-token hidden states - confirm against the model docs).
with torch.no_grad():
    ref_embeddings = model(torch.tensor(tokens_ids)[None, :])[0]

In [80]:
# Generated side: a deliberately unrelated text paired with the same code.
gen = 'hi'
code_gen = "home_data.columns"
toks_gen = tokenizer.tokenize(gen)
code_toks_gen = tokenizer.tokenize(code_gen)

# Sequence layout: CLS, text tokens, SEP, code tokens, SEP.
tokens = [tokenizer.cls_token] + toks_gen + [tokenizer.sep_token] + code_toks_gen + [tokenizer.sep_token]

tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
# Inference only, so disable autograd bookkeeping for the forward pass.
# [None, :] adds a leading batch dimension; [0] takes the first model output
# (presumably the per-token hidden states - confirm against the model docs).
with torch.no_grad():
    gen_embeddings = model(torch.tensor(tokens_ids)[None, :])[0]

In [81]:
# Move both embedding tensors to the CPU, detach them from autograd and
# convert them to NumPy arrays.
tensor_np_ref, tensor_np_gen = (
    emb.cpu().detach().numpy() for emb in (ref_embeddings, gen_embeddings)
)
# Mean-pool over the tokens of the first (only) sequence to get one vector per input.
avg_ref = tensor_np_ref[0].mean(axis=0)
avg_gen = tensor_np_gen[0].mean(axis=0)

In [82]:
# scipy's `cosine` is a distance (1 - similarity), so subtracting it from 1
# recovers the cosine similarity of the two averaged vectors.
cos = 1 - cosine(avg_ref, avg_gen)
cos

0.9559369683265686

## Get the reference (markdown, code) data pairs to be documented using PLBART

In [83]:
# Load the reference pairs (first CSV column as index) and keep only the
# markdown text and the code it documents, then preview the first rows.
data_ref_pairs = pd.read_csv('../data/data_pairs_2.csv', index_col=0).loc[:, ['markdown', 'code']]
data_ref_pairs.head()

Unnamed: 0,markdown,code
0,linear algebra,import numpy as np
1,"data processing, CSV file I O e.g. pd.read csv",import pandas as pd
2,"For example, running this by clicking run or p...",from datetime import datetime
3,for some statistics,from scipy . stats import skew
4,Load data,train = pd.read_csv('../input/house-prices-adv...
