# Load Dataset 

In [1]:
import json
import re

import pandas as pd
from pathlib import Path
# Widen the column display so long code/docstring cells are not cut off too early.
pd.set_option('display.max_colwidth', 300)
from pprint import pprint

from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast

In [2]:
!unzip python.zip

Archive:  python.zip
replace python/final/jsonl/train/python_train_9.jsonl.gz? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
# decompress this gzip file
!gzip -d python/final/jsonl/test/python_test_0.jsonl.gz

Read in the file and display the first row.  The data is stored in [JSON Lines](http://jsonlines.org/) format.

In [4]:
# JSON Lines files are UTF-8 by specification, so decode explicitly instead of
# relying on the platform's default text encoding.
with open('python/final/jsonl/test/python_test_0.jsonl', 'r', encoding='utf-8') as f:
    sample_file = f.readlines()
# Last expression of the cell: show the raw text of the first record.
sample_file[0]

'{"repo": "soimort/you-get", "path": "src/you_get/extractors/youtube.py", "func_name": "YouTube.get_vid_from_url", "original_string": "def get_vid_from_url(url):\\n        \\"\\"\\"Extracts video ID from URL.\\n        \\"\\"\\"\\n        return match1(url, r\'youtu\\\\.be/([^?/]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/embed/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/v/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/watch/([^/?]+)\') or \\\\\\n          parse_query_param(url, \'v\') or \\\\\\n          parse_query_param(parse_query_param(url, \'u\'), \'v\')", "language": "python", "code": "def get_vid_from_url(url):\\n        \\"\\"\\"Extracts video ID from URL.\\n        \\"\\"\\"\\n        return match1(url, r\'youtu\\\\.be/([^?/]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/embed/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/v/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/watch/([^/?]+)\'

Because each line in the file is valid JSON, we can parse the first row and display it in a more human-readable form:

In [5]:
# Parse the first raw line as JSON, then pretty-print the resulting object.
first_record = json.loads(sample_file[0])
pprint(first_record)

{'code': 'def get_vid_from_url(url):\n'
         '        """Extracts video ID from URL.\n'
         '        """\n'
         "        return match1(url, r'youtu\\.be/([^?/]+)') or \\\n"
         "          match1(url, r'youtube\\.com/embed/([^/?]+)') or \\\n"
         "          match1(url, r'youtube\\.com/v/([^/?]+)') or \\\n"
         "          match1(url, r'youtube\\.com/watch/([^/?]+)') or \\\n"
         "          parse_query_param(url, 'v') or \\\n"
         "          parse_query_param(parse_query_param(url, 'u'), 'v')",
 'code_tokens': ['def',
                 'get_vid_from_url',
                 '(',
                 'url',
                 ')',
                 ':',
                 'return',
                 'match1',
                 '(',
                 'url',
                 ',',
                 "r'youtu\\.be/([^?/]+)'",
                 ')',
                 'or',
                 'match1',
                 '(',
                 'url',
                 ',',
        

Definitions of each of the above fields are located in the README.md file in the root of this repository.

In [3]:
# Recursively collect every gzip archive under ./python, in sorted path order.
python_files = sorted(Path('python').rglob('*.gz'))

In [4]:
# Report how many archives the glob found (thousands-separated).
file_count = len(python_files)
print(f'Total number of files: {file_count:,}')

Total number of files: 15


To make analysis of this dataset easier, we can load all of the data into a pandas dataframe: 

In [5]:
# Columns kept when the full per-function metadata is wanted.
columns_long_list = [
    'repo', 'path', 'url', 'code',
    'code_tokens', 'docstring', 'docstring_tokens',
    'language', 'partition',
]

# Smaller selection: token sequences plus language/partition.
columns_short_list = [
    'code_tokens', 'docstring_tokens',
    'language', 'partition',
]


def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Parameters
    ----------
    file_list : iterable of path-like
        Gzip-compressed JSON Lines files; each one is read in turn.
    columns : list of str, optional
        Columns to keep from every file. Defaults to ``columns_long_list``.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in the order given. Each file's own row
        index is kept, so index labels repeat across files. An empty
        ``file_list`` yields an empty frame with the requested columns.

    Raises
    ------
    KeyError
        If a file lacks one of the requested columns.
    """
    frames = [
        pd.read_json(f, orient='records', compression='gzip', lines=True)[columns]
        for f in file_list
    ]
    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, sort=False)

This is what the Python dataset looks like:

In [6]:
# Load every archive into one frame, keeping the long column selection.
df = jsonl_list_to_dataframe(file_list=python_files, columns=columns_long_list)

In [7]:
# Preview the first three rows.
df.iloc[:3]

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108,"def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n """"""\n Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in s...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...","Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in source code to see train_dir example tree structure)\n\n Structure:\n <train_dir>/\n ├── <person...","[Trains, a, k, -, nearest, neighbors, classifier, for, face, recognition, .]",python,train
1,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L111-L150,"def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n """"""\n Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recognized\n :param knn_clf: (optional) a knn classifier object. if not specified, model_s...","[def, predict, (, X_img_path, ,, knn_clf, =, None, ,, model_path, =, None, ,, distance_threshold, =, 0.6, ), :, if, not, os, ., path, ., isfile, (, X_img_path, ), or, os, ., path, ., splitext, (, X_img_path, ), [, 1, ], [, 1, :, ], not, in, ALLOWED_EXTENSIONS, :, raise, Exception, (, ""Invalid im...","Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recognized\n :param knn_clf: (optional) a knn classifier object. if not specified, model_save_path must be specified.\n :param model_path: (optional) path to a pickled knn classifie...","[Recognizes, faces, in, given, image, using, a, trained, KNN, classifier]",python,train
2,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L153-L181,"def show_prediction_labels_on_image(img_path, predictions):\n """"""\n Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param predictions: results of the predict function\n :return:\n """"""\n pil_image = Image.open(img_path).conv...","[def, show_prediction_labels_on_image, (, img_path, ,, predictions, ), :, pil_image, =, Image, ., open, (, img_path, ), ., convert, (, ""RGB"", ), draw, =, ImageDraw, ., Draw, (, pil_image, ), for, name, ,, (, top, ,, right, ,, bottom, ,, left, ), in, predictions, :, # Draw a box around the face u...",Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param predictions: results of the predict function\n :return:,"[Shows, the, face, recognition, results, visually, .]",python,train


In [8]:
# Count rows per language value.
df['language'].value_counts()

language
python    435285
Name: count, dtype: int64

In [9]:
# Keep only the raw function source column.
data = df.loc[:, 'code']

In [10]:
# Number of functions in the selected column.
data.shape[0]

435285

In [11]:
# Persist the code column as a single-column CSV, without the row index.
output_csv = 'full.csv'
data.to_csv(output_csv, index=False)

In [13]:
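# NOTE(review): this cell is commented out, yet a table is still attached below —
# it appears to be output saved from an earlier run. Re-enable the cell or clear the output.
# extracted = pd.read_csv("tokenized_functions.csv")
# extracted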

Unnamed: 0,original_function,tokenized_function
0,"def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n """"""\n Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in s...","[1, 822, 7945, 29898, 14968, 29918, 3972, 29892, 1904, 29918, 7620, 29918, 2084, 29922, 8516, 29892, 302, 29918, 484, 1141, 29890, 943, 29922, 8516, 29892, 889, 29876, 29918, 284, 1484, 2433, 2135, 29918, 8336, 742, 26952, 29922, 8824, 1125, 13, 1678, 9995, 13, 1678, 3201, 1144, 263, 413, 29899,..."
1,"def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n """"""\n Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recognized\n :param knn_clf: (optional) a knn classifier object. if not specified, model_s...","[1, 822, 8500, 29898, 29990, 29918, 2492, 29918, 2084, 29892, 889, 29876, 29918, 695, 29888, 29922, 8516, 29892, 1904, 29918, 2084, 29922, 8516, 29892, 5418, 29918, 386, 12268, 29922, 29900, 29889, 29953, 1125, 13, 1678, 9995, 13, 1678, 3599, 3811, 7093, 17240, 297, 2183, 1967, 773, 263, 16370, ..."
2,"def face_distance(face_encodings, face_to_compare):\n """"""\n Given a list of face encodings, compare them to a known face encoding and get a euclidean distance\n for each comparison face. The distance tells you how similar the faces are.\n\n :param faces: List of face encodings to com...","[1, 822, 3700, 29918, 19244, 29898, 2161, 29918, 3977, 397, 886, 29892, 3700, 29918, 517, 29918, 18307, 1125, 13, 1678, 9995, 13, 1678, 11221, 263, 1051, 310, 3700, 2094, 397, 886, 29892, 7252, 963, 304, 263, 2998, 3700, 8025, 322, 679, 263, 321, 27511, 5418, 13, 1678, 363, 1269, 10230, 3700, 29..."
3,"def load_image_file(file, mode='RGB'):\n """"""\n Loads an image file (.jpg, .png, etc) into a numpy array\n\n :param file: image file name or file object to load\n :param mode: format to convert the image to. Only 'RGB' (8-bit RGB, 3 channels) and 'L' (black and white) are supported.\n...","[1, 822, 2254, 29918, 3027, 29918, 1445, 29898, 1445, 29892, 4464, 2433, 28212, 29374, 13, 1678, 9995, 13, 1678, 4309, 7925, 385, 1967, 934, 14544, 6173, 29892, 869, 2732, 29892, 2992, 29897, 964, 263, 12655, 1409, 13, 13, 1678, 584, 3207, 934, 29901, 1967, 934, 1024, 470, 934, 1203, 304, 2254, ..."
4,"def _raw_face_locations(img, number_of_times_to_upsample=1, model='hog'):\n """"""\n Returns an array of bounding boxes of human faces in a image\n\n :param img: An image (as a numpy array)\n :param number_of_times_to_upsample: How many times to upsample the image looking for faces. Hig...","[1, 822, 903, 1610, 29918, 2161, 29918, 2029, 800, 29898, 2492, 29892, 1353, 29918, 974, 29918, 3706, 29918, 517, 29918, 14340, 981, 29922, 29896, 29892, 1904, 2433, 29882, 468, 29374, 13, 1678, 9995, 13, 1678, 16969, 385, 1409, 310, 3216, 292, 16273, 310, 5199, 17240, 297, 263, 1967, 13, 13, 16..."
...,...,...
282968,"def apply_defaults(self, commands):\n """""" apply default settings to commands\n not static, shadow ""self"" in eval\n """"""\n for command in commands:\n if 'action' in command and '()' in command['action']:\n command['action'] = eval('self.{}'.format(command[...","[1, 822, 3394, 29918, 4381, 29879, 29898, 1311, 29892, 8260, 1125, 13, 1678, 9995, 3394, 2322, 6055, 304, 8260, 13, 9651, 451, 2294, 29892, 15504, 376, 1311, 29908, 297, 19745, 13, 4706, 9995, 13, 1678, 363, 1899, 297, 8260, 29901, 13, 4706, 565, 525, 2467, 29915, 297, 1899, 322, 525, 580, 29915..."
282969,"def create_commands(self, commands, parser):\n """""" add commands to parser """"""\n self.apply_defaults(commands)\n\n def create_single_command(command):\n keys = command['keys']\n del command['keys']\n kwargs = {}\n for item in command:\n kwargs[item]...","[1, 822, 1653, 29918, 26381, 29898, 1311, 29892, 8260, 29892, 13812, 1125, 13, 1678, 9995, 788, 8260, 304, 13812, 9995, 13, 1678, 1583, 29889, 7302, 29918, 4381, 29879, 29898, 26381, 29897, 13, 13, 1678, 822, 1653, 29918, 14369, 29918, 6519, 29898, 6519, 1125, 13, 4706, 6611, 353, 1899, 1839, 81..."
282970,"def check_path_action(self):\n """""" custom command line action to check file exist """"""\n\n class CheckPathAction(argparse.Action):\n\n def __call__(self, parser, args, value, option_string=None):\n if type(value) is list:\n value = value[0]\n user...","[1, 822, 1423, 29918, 2084, 29918, 2467, 29898, 1311, 1125, 13, 1678, 9995, 2888, 1899, 1196, 3158, 304, 1423, 934, 1863, 9995, 13, 13, 1678, 770, 5399, 2605, 4276, 29898, 1191, 5510, 29889, 4276, 1125, 13, 13, 4706, 822, 4770, 4804, 12035, 1311, 29892, 13812, 29892, 6389, 29892, 995, 29892, 298..."
282971,"def __call__(self, parser, args, value, option_string=None):\n if type(value) is list:\n value = value[0]\n user_value = value\n if option_string == 'None':\n if not os.path.isdir(value):\n _current_user = os.path.expanduser('~')\n if not value.starts...","[1, 822, 4770, 4804, 12035, 1311, 29892, 13812, 29892, 6389, 29892, 995, 29892, 2984, 29918, 1807, 29922, 8516, 1125, 13, 1678, 565, 1134, 29898, 1767, 29897, 338, 1051, 29901, 13, 4706, 995, 353, 995, 29961, 29900, 29962, 13, 1678, 1404, 29918, 1767, 353, 995, 13, 1678, 565, 2984, 29918, 1807, ..."
