# Get Data

In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import ipdb

def idx2fname(i):
    """ 
    Convert integer to filename for downloading python files from 
    Google Cloud Bucket there are 99 files with suffix 00 - 99.
    """
    suffix = str(100 + i)[-2:]
    filename = f'https://storage.googleapis.com/python_github/0000000000{suffix}.csv'
    return filename


#Download all files and read them into pandas

dataframes = []

for i in tqdm_notebook(range(100)):
    filename = idx2fname(i)
    print(f'reading {filename}')
    dataframes.append(pd.read_csv(filename))

# concatenate all files
df = pd.concat(dataframes)
df.head()

A Jupyter Widget

reading https://storage.googleapis.com/python_github/000000000000.csv
reading https://storage.googleapis.com/python_github/000000000001.csv
reading https://storage.googleapis.com/python_github/000000000002.csv
reading https://storage.googleapis.com/python_github/000000000003.csv
reading https://storage.googleapis.com/python_github/000000000004.csv
reading https://storage.googleapis.com/python_github/000000000005.csv
reading https://storage.googleapis.com/python_github/000000000006.csv
reading https://storage.googleapis.com/python_github/000000000007.csv
reading https://storage.googleapis.com/python_github/000000000008.csv
reading https://storage.googleapis.com/python_github/000000000009.csv
reading https://storage.googleapis.com/python_github/000000000010.csv
reading https://storage.googleapis.com/python_github/000000000011.csv
reading https://storage.googleapis.com/python_github/000000000012.csv
reading https://storage.googleapis.com/python_github/000000000013.csv
reading https://stor

Unnamed: 0,repo_path,content
0,wuyuewen/libcloud libcloud/test/compute/test_v...,# Licensed to the Apache Software Foundation (...
1,rnov/Fingerpay work_unit/payfi/apidropbox.py,"import os\nfrom multiprocessing import Pool, T..."
2,instrat-nigeria/django-instrat-oppia oppia/tem...,# oppia/templatetags/display_functions.py\nimp...
3,kalev/anaconda pyanaconda/installclasses/rhel.py,#\n# rhel.py\n#\n# Copyright (C) 2010 Red Hat...
4,stackforge/watcher watcher/api/controllers/bas...,# -*- encoding: utf-8 -*-\n#\n# Licensed under...


In [2]:
df.reset_index(drop=True, inplace=True)

In [3]:
df.to_pickle('python_files_df.pkl')

In [41]:
df.shape

(3409957, 2)

In [4]:
! ls -lah python_files_df.pkl

-rw-r--r-- 1 root root 15G Apr 26 21:04 python_files_df.pkl


# Parse out (function, comment) pairs w/metadata for lineage

In [5]:
import astor
import ast


def ast_to_code(ast):
    try:
        return astor.to_source(ast)
    except:
        return None
    
    
def get_function_ast(code_blob):
    """
    Given a code blob, return a list of functions.
    
    This is retrieved from both methods and top-level functions.
    """
    methods = []
    try:
        mod = ast.parse(code_blob)
    except:
        return []
    
    #ipdb.set_trace()
    classes = [node for node in mod.body if isinstance(node, ast.ClassDef)]
    functions = [node for node in mod.body if isinstance(node, ast.FunctionDef)]
    
    for c in classes:
        methods += [node for node in c.body if isinstance(node, ast.FunctionDef)]
    
    functions += methods
    
    # filter to make sure (1) there is a docstring (2) ast can be turned back into code.
    return [(ast.get_docstring(f), ast_to_code(f), f.lineno, f.name) for f in functions if ast.get_docstring(f) and ast_to_code(f)]

def get_code_comment_pair(code_blob, ref):
    funcs = get_function_ast(code_blob)
    
    if not funcs:
        return pd.DataFrame({'ref':[], 'code':[]})
    
    else:
        docstr, code, lineno, func_name = zip(*funcs)
        return pd.DataFrame({'ref':ref, 
                             'docstr':docstr,
                             'code': code,
                             'line_no':lineno, 
                             'func_name':func_name,
                            })

In [31]:
def get_code_comment_pairs_df(df, code_field='content'):
    # basic input checks
    assert code_field in df.columns, f'column `{code_field}` not in dataframe'
    assert df.index.nunique() == df.shape[0], 'dataframe Index must have unique values'
    
    #collect code_blob, ref pairs
    code_blobs = zip(df[code_field].values, df.index.values)
    return pd.concat([get_code_comment_pair(code, ref) for code, ref in code_blobs])

In [35]:
import numpy as np
from pathos.multiprocessing import Pool, cpu_count
cpu_cores = cpu_count()

In [36]:
# apply function to dataframe

#split dataframe into chunks
splitdf = np.array_split(df, cpu_cores)

pool = Pool(cpu_cores)
transformed_data = pool.map(get_code_comment_pairs_df, splitdf)

pool.close()
pool.join()

final_df = pd.concat(transformed_data)

In [37]:
final_df.head()

Unnamed: 0,code,docstr,func_name,line_no,ref
0,"def as_dict(self):\n """"""Render this object ...",Render this object as a dict of its fields.,as_dict,33.0,4.0
1,"def unset_fields_except(self, except_list=None...",Unset fields so they don't appear in the messa...,unset_fields_except,40.0,4.0
0,"def run(self, force=False):\n """"""\n ...","Runs the daily searcher, queuing selected epis...",run,39.0,6.0
0,"def read_from_h5(file_name, **kwargs):\n """"...",Read data from an H5 file in SXS format\n\nNot...,read_from_h5,59.0,9.0
1,"def write_to_h5(w, file_name, file_write_mode=...",Output the Waveform in NRAR format.\n\nNote th...,write_to_h5,244.0,9.0


In [39]:
final_df.to_pickle('parsed_python_code_comment_pairs_df.pkl')

In [40]:
!ls -lah parsed_python_code_comment_pairs_df.pkl

-rw-r--r-- 1 root root 3.6G Apr 26 21:25 parsed_python_code_comment_pairs_df.pkl


# Clean Code

In [79]:
from ktext.preprocess import textacy_cleaner
import re
def custom_clean_code(code_string_list):
    """
    Helper function to clean code by performing the following:
    1. Insert a space before each uppercase and underscore
    2. Apply textacy_cleaner from ktext
    
    Parameters:
    
    code_string_list : List[Str]
        List of strings that are code blobs
    """
    cleaned_code = []
    regex_pattern = r"([A-Z]|_|\(|\)|\s)" # find all upper case and underscore characters
    
    for code_string in code_string_list:
        code_fix = re.sub(regex_pattern, r" \1", code_string) #insert a space before
        cleaned_code.append(textacy_cleaner(code_fix)) # apply textacy cleaner from ktext
    return cleaned_code

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [85]:
def clean_code_df(df, code_field='code'):
    assert code_field in df.columns, f'column `{code_field}` not in dataframe'
    clean_code_list = custom_clean_code(df[code_field].values)
    return df.assign(clean_code = clean_code_list)

In [88]:
# apply function to dataframe

#split dataframe into chunks
splitdf = np.array_split(final_df, cpu_cores)

pool = Pool(cpu_cores)
transformed_data = pool.map(clean_code_df, splitdf)

pool.close()
pool.join()

final_df_clean = pd.concat(transformed_data)

In [89]:
final_df_clean.to_pickle('parsed_python_code_comment_pairs_tokenized_df.pkl')

In [90]:
final_df_clean.head()

Unnamed: 0,code,docstr,func_name,line_no,ref,clean_code
0,"def as_dict(self):\n """"""Render this object ...",Render this object as a dict of its fields.,as_dict,33.0,4.0,def as dict self render this object as a dict ...
1,"def unset_fields_except(self, except_list=None...",Unset fields so they don't appear in the messa...,unset_fields_except,40.0,4.0,def unset fields except self except list= none...
0,"def run(self, force=False):\n """"""\n ...","Runs the daily searcher, queuing selected epis...",run,39.0,6.0,def run self force= false runs the daily searc...
0,"def read_from_h5(file_name, **kwargs):\n """"...",Read data from an H5 file in SXS format\n\nNot...,read_from_h5,59.0,9.0,def read from h5 file name kwargs read data fr...
1,"def write_to_h5(w, file_name, file_write_mode=...",Output the Waveform in NRAR format.\n\nNote th...,write_to_h5,244.0,9.0,def write to h5 w file name file write mode=w ...


In [91]:
!ls -lah /ds/CodeML/Get_Python_From_BigQuery

total 25G
drwxr-xr-x 2 root root 6.0K Apr 26 23:49 .
drwxrwxrwx 7 1001 1001 6.0K Apr 26 23:49 ..
-rw-r--r-- 1 root root 3.6G Apr 26 21:25 parsed_python_code_comment_pairs_df.pkl
-rw-r--r-- 1 root root 5.8G Apr 26 23:49 parsed_python_code_comment_pairs_tokenized_df.pkl
-rw-r--r-- 1 root root  15G Apr 26 21:04 python_files_df.pkl


In [2]:
import pandas as pd
final_df_clean = pd.read_pickle('parsed_python_code_comment_pairs_tokenized_df.pkl')

## Tokenize For Keras Model

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
#train_df, holdout_df = train_test_split(final_df_clean, train_size=.80)



In [16]:
train_df.to_pickle('final_train_df.pkl')
holdout_df.to_pickle('final_holdout_df.pkl')

In [6]:
train_df.shape, holdout_df.shape

((4023138, 6), (1005785, 6))

In [10]:
train_code = train_df.code.values.tolist()
train_comment = train_df.docstr.values.tolist()

In [11]:
from ktext.preprocess import processor
code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
t_code = code_proc.fit_transform(train_code)

comment_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=14000, padding ='post')
t_comment = comment_proc.fit_transform(train_comment)

 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [12]:
import dill as dpickle
import numpy as np

# Save the preprocessor
with open('py_code_proc.dpkl', 'wb') as f:
    dpickle.dump(code_proc, f)

with open('py_comment_proc.dpkl', 'wb') as f:
    dpickle.dump(comment_proc, f)

# Save the processed data
np.save('py_t_code_vecs.npy', t_code)
np.save('py_t_comment_vecs.npy', t_comment)

In [2]:
! pwd

/ds/hamel/CodeML/Get_Python_From_BigQuery
