## Improve data quality


* Working toward improving the quality of training examples, adding additional labels, and expanding the date range (and thereby the number of examples).


In [7]:

JOB_NAME="preprocess-github-dataset-$(date +'%Y%m%d-%H%M%S')"

!/home/jovyan/.conda/envs/py2/bin/python -m code_search.dataflow.cli.preprocess_github_dataset \
        --runner "DataflowRunner" \
        --project "kubeflow-rl" \
        --target_dataset "github_function_embeddings" \
        --data_dir "gs://kubeflow-rl-dataflow/cs/data" \
        --job_name "preprocess-github-dataset-$(date +'%Y%m%d-%H%M%S')" \
        --temp_location "gs://kubeflow-rl-dataflow/cs/tmp" \
        --staging_location "gs://kubeflow-rl-dataflow/cs/staging" \
        --worker_machine_type "n1-standard-64" \
        --num_workers "5"


  from .lbfgsb import _minimize_lbfgsb
  from .qhull import *
INFO|2018-11-06T22:51:28|/mnt/nfs-east1-d/work/examples/code_search/src/code_search/dataflow/cli/preprocess_github_dataset.py|58| Reading data using a query.
INFO|2018-11-06T22:51:28|/mnt/nfs-east1-d/work/examples/code_search/src/code_search/dataflow/transforms/github_dataset.py|64| Writing results to BigQuery kubeflow-rl:github_function_embeddings.token_pairs
INFO|2018-11-06T22:51:28|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.py|469| Starting GCS upload to gs://kubeflow-rl-dataflow/cs/staging/preprocess-github-dataset-20181106-225126.1541544688.788675/pipeline.pb...
INFO|2018-11-06T22:51:28|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/oauth2client/transport.py|157| Attempting refresh to obtain initial access_token
INFO|2018-11-06T22:51:34|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.py|484| C

## Alt

In [None]:

# Let's try a tokenization step that retains newline characters as a special symbol that won't interfere with
# downstream steps that write examples to a CSV (where a single line denotes a single example), e.g. converting
# \n's to @@@ (something we should almost never see otherwise). We can then check after the vocab generation
# phase whether @@@ has its own vocab entry.


In [2]:
import logging
import sys

import ast
import astor
import nltk.tokenize as tokenize
import spacy

en = spacy.load('en')


In [19]:

def tokenize_code(text):
  """Tokenize code strings.

  Each line of `text` is reduced to its runs of regex word characters;
  whitespace and punctuation are dropped. The marker "@@@" is appended after
  the tokens of every line (including the last one, and including empty
  lines) so that line boundaries survive in the flat token list.

  Args:
    text: A code string to be tokenized.

  Returns:
    A list of strings representing the tokens in the code, with "@@@"
    marking the end of each input line.
  """
  # Imported locally so this function does not depend on the notebook-level
  # name `tokenize`, which a later cell rebinds to a function.
  from nltk.tokenize import RegexpTokenizer

  # Build the tokenizer once instead of once per line.
  word_tokenizer = RegexpTokenizer(r'\w+')

  tokenization = []
  for line in text.split("\n"):
    tokenization.extend(word_tokenizer.tokenize(line))
    tokenization.append("@@@")
  return tokenization

tokenize_code("foo \t etc \n something else")


['foo', 'etc', '@@@', 'something', 'else', '@@@']

In [None]:

# Hmm this isn't ideal because we lose a lot of the meaning of the code this way...
# In python spaces, tabs, and newlines all have important meanings.

# Will probably try parsing code into a single line AST string.


In [11]:
# In python2 we need to call decode but in python3 strings
# are always unicode.
def _maybe_decode(s):
  if sys.version_info[0] < 3:
    return s.decode("utf-8")
  return s

def tokenize_docstring(text):
  """Split a docstring into lower-cased tokens.

  The text is passed through `_maybe_decode` and then through the tokenizer
  of the loaded `en` pipeline; tokens flagged as whitespace are skipped.

  Args:
    text: A docstring to be tokenized.

  Returns:
    A list of strings representing the tokens in the docstring.
  """
  lowered = []
  for tok in en.tokenizer(_maybe_decode(text)):
    if tok.is_space:
      # Whitespace-only tokens carry no content for the docstring side.
      continue
    lowered.append(tok.text.lower())
  return lowered

tokenize_docstring("  hello world \n    Args:    my_arg(str): Something awesome. \n    ")

# Not making this change for now.


[u'hello',
 u'world',
 u'args',
 u':',
 u'my_arg(str',
 u')',
 u':',
 u'something',
 u'awesome',
 u'.']

In [None]:

# So it looks like the current preproc pipeline is processing all of the available years of data.
# So perhaps it would be helpful to consider multiple languages to have more data to consider and at the
# same time increase the min star count.


In [18]:

# Run from this notebook, `code_search` is resolved from site-packages (see the file paths in the log
# output below), and that installed copy doesn't include a setup.py file. It might work after an
# os.chdir() into a directory that contains a code_search subdir (not verified).
# In any case, it worked when the command was copied to a terminal and run from the right path.
#
# Compared with the earlier preprocessing run in this notebook, only the target dataset
# ("github_function_embeddings_alt"), the data dir ("cs/data_alt") and the job-name prefix differ.

!/home/jovyan/.conda/envs/py2/bin/python -m code_search.dataflow.cli.preprocess_github_dataset \
        --runner "DataflowRunner" \
        --project "kubeflow-rl" \
        --target_dataset "github_function_embeddings_alt" \
        --data_dir "gs://kubeflow-rl-dataflow/cs/data_alt" \
        --job_name "preprocess-github-alt-dataset-$(date +'%Y%m%d-%H%M%S')" \
        --temp_location "gs://kubeflow-rl-dataflow/cs/tmp" \
        --staging_location "gs://kubeflow-rl-dataflow/cs/staging" \
        --worker_machine_type "n1-standard-64" \
        --num_workers "5"


  from .lbfgsb import _minimize_lbfgsb
  from .qhull import *
INFO|2018-11-14T23:51:14|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/code_search/dataflow/cli/preprocess_github_dataset.py|58| Reading data using a query.
INFO|2018-11-14T23:51:14|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/code_search/dataflow/transforms/github_dataset.py|64| Writing results to BigQuery kubeflow-rl:github_function_embeddings_alt.token_pairs
INFO|2018-11-14T23:51:14|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/apache_beam/runners/dataflow/internal/apiclient.py|469| Starting GCS upload to gs://kubeflow-rl-dataflow/cs/staging/preprocess-github-alt-dataset-20181114-235111.1542239474.150747/pipeline.pb...
INFO|2018-11-14T23:51:14|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/oauth2client/transport.py|157| Attempting refresh to obtain initial access_token
INFO|2018-11-14T23:51:14|/home/jovyan/.conda/envs/py2/lib/python2.7/site-packages/apache_beam/runners/dataflow/

## AST thing?



In [74]:
import ast
import astor

def tokenize(blob):
  """Render Python source as a single-line, space-padded AST dump.

  The source is parsed with `ast.parse`, serialized with `ast.dump`, and then
  passed through a fixed sequence of literal string replacements that pad
  parentheses, equals signs, brackets and single quotes with spaces. The
  replacements act on the whole dump, so they also touch the contents of
  string constants inside it.

  NOTE(review): defining this function rebinds the notebook-level name
  `tokenize`, which an earlier cell imports as an alias for nltk.tokenize.

  Args:
    blob: A string of Python source code.

  Returns:
    The rewritten AST dump as a single string.

  Raises:
    SyntaxError: If `blob` cannot be parsed by `ast.parse`.
  """
  # Order matters: the later rules collapse the extra spaces that the earlier
  # ones introduce (e.g. "(  )" -> "( )").
  rewrites = (
    ("(", " ( "),
    (")", " )"),
    ("=", " = "),
    ("[", "[ "),
    ("]", " ]"),
    ("[  ]", "[ ]"),
    ("',", "' ,"),
    ("(  )", "( )"),
    ("),", ") ,"),
    ("'", " ' "),
    ("=  '", "= '"),
    ("'  ,", "' ,"),
  )
  dumped = ast.dump(ast.parse(blob))
  for old, new in rewrites:
    dumped = dumped.replace(old, new)
  return dumped


In [75]:

blob = '''

def iter_node(node, name='', unknown=None,
              # Runtime optimization
              list=list, getattr=getattr, isinstance=isinstance,
              enumerate=enumerate, missing=NonExistent):
    """Iterates over an object:
       - If the object has a _fields attribute,
         it gets attributes in the order of this
         and returns name, value pairs.
       - Otherwise, if the object is a list instance,
         it returns name, value pairs for each item
         in the list, where the name is passed into
         this function (defaults to blank).
       - Can update an unknown set with information about
         attributes that do not exist in fields.
    """
    fields = getattr(node, '_fields', None)
    if fields is not None:
        for name in fields:
            value = getattr(node, name, missing)
            if value is not missing:
                yield value, name
        if unknown is not None:
            unknown.update(set(vars(node)) - set(fields))
        """something"""
    elif isinstance(node, list):
        for value in node:
            yield value, name

'''

tokenize(blob)


"Module ( body = [ FunctionDef ( name = ' iter_node ' , args = arguments ( args = [ Name ( id = ' node ' , ctx = Param ( ) ) , Name ( id = ' name ' , ctx = Param ( ) ) , Name ( id = ' unknown ' , ctx = Param ( ) ) , Name ( id = ' list ' , ctx = Param ( ) ) , Name ( id = ' getattr ' , ctx = Param ( ) ) , Name ( id = ' isinstance ' , ctx = Param ( ) ) , Name ( id = ' enumerate ' , ctx = Param ( ) ) , Name ( id = ' missing ' , ctx = Param ( ) ) ], vararg = None, kwarg = None, defaults = [ Str ( s = '  '  ) , Name ( id = ' None ' , ctx = Load ( ) ) , Name ( id = ' list ' , ctx = Load ( ) ) , Name ( id = ' getattr ' , ctx = Load ( ) ) , Name ( id = ' isinstance ' , ctx = Load ( ) ) , Name ( id = ' enumerate ' , ctx = Load ( ) ) , Name ( id = ' NonExistent ' , ctx = Load ( ) ) ] ) , body = [ Expr ( value = Str ( s = ' Iterates over an object:\\n       - If the object has a _fields attribute,\\n         it gets attributes in the order of this\\n         and returns name, value pairs.\\n      

In [76]:
blob = '''

def strip_tree(node,
               # Runtime optimization
               iter_node=iter_node, special=ast.AST,
               list=list, isinstance=isinstance, type=type, len=len):
    """Strips an AST by removing all attributes not in _fields.
    Returns a set of the names of all attributes stripped.
    This canonicalizes two trees for comparison purposes.
    """
    stripped = set()

    def strip(node, indent):
        unknown = set()
        leaf = True
        for subnode, _ in iter_node(node, unknown=unknown):
            leaf = False
            strip(subnode, indent + '    ')
        if leaf:
            if isinstance(node, special):
                unknown = set(vars(node))
        stripped.update(unknown)
        for name in unknown:
            delattr(node, name)
        if hasattr(node, 'ctx'):
            delattr(node, 'ctx')
            if 'ctx' in node._fields:
                mylist = list(node._fields)
                mylist.remove('ctx')
                node._fields = mylist
    strip(node, '')
    return stripped

'''

tokenize(blob)


"Module ( body = [ FunctionDef ( name = ' strip_tree ' , args = arguments ( args = [ Name ( id = ' node ' , ctx = Param ( ) ) , Name ( id = ' iter_node ' , ctx = Param ( ) ) , Name ( id = ' special ' , ctx = Param ( ) ) , Name ( id = ' list ' , ctx = Param ( ) ) , Name ( id = ' isinstance ' , ctx = Param ( ) ) , Name ( id = ' type ' , ctx = Param ( ) ) , Name ( id = ' len ' , ctx = Param ( ) ) ], vararg = None, kwarg = None, defaults = [ Name ( id = ' iter_node ' , ctx = Load ( ) ) , Attribute ( value = Name ( id = ' ast ' , ctx = Load ( ) ) , attr = ' AST ' , ctx = Load ( ) ) , Name ( id = ' list ' , ctx = Load ( ) ) , Name ( id = ' isinstance ' , ctx = Load ( ) ) , Name ( id = ' type ' , ctx = Load ( ) ) , Name ( id = ' len ' , ctx = Load ( ) ) ] ) , body = [ Expr ( value = Str ( s = ' Strips an AST by removing all attributes not in _fields.\\n    Returns a set of the names of all attributes stripped.\\n    This canonicalizes two trees for comparison purposes.\\n     '  ) ) , Assign 

In [77]:
blob = '''
def hello_world():
  print "hello world"
'''

tokenize(blob)

"Module ( body = [ FunctionDef ( name = ' hello_world ' , args = arguments ( args = [ ], vararg = None, kwarg = None, defaults = [ ] ) , body = [ Print ( dest = None, values = [ Str ( s = ' hello world '  ) ], nl = True ) ], decorator_list = [ ] ) ] )"