# Load Dataset 

In [18]:
import json
import re

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast

In [2]:
!unzip python.zip

Archive:  python.zip
   creating: python/
   creating: python/final/
   creating: python/final/jsonl/
   creating: python/final/jsonl/train/
  inflating: python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_6.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_2.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_4.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_8.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_11.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_5.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_13.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_3.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_1.jsonl.gz  
  inflating: python/fin

In [3]:
# decompress this gzip file
!gzip -d python/final/jsonl/test/python_test_0.jsonl.gz

Read in the file and display the first row.  The data is stored in [JSON Lines](http://jsonlines.org/) format.

In [4]:
# JSON Lines files are UTF-8 encoded; state that explicitly instead of
# relying on the platform's default encoding.
with open('python/final/jsonl/test/python_test_0.jsonl', 'r', encoding='utf-8') as f:
    sample_file = f.readlines()
# Show the first raw line (one JSON document).
sample_file[0]

'{"repo": "soimort/you-get", "path": "src/you_get/extractors/youtube.py", "func_name": "YouTube.get_vid_from_url", "original_string": "def get_vid_from_url(url):\\n        \\"\\"\\"Extracts video ID from URL.\\n        \\"\\"\\"\\n        return match1(url, r\'youtu\\\\.be/([^?/]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/embed/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/v/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/watch/([^/?]+)\') or \\\\\\n          parse_query_param(url, \'v\') or \\\\\\n          parse_query_param(parse_query_param(url, \'u\'), \'v\')", "language": "python", "code": "def get_vid_from_url(url):\\n        \\"\\"\\"Extracts video ID from URL.\\n        \\"\\"\\"\\n        return match1(url, r\'youtu\\\\.be/([^?/]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/embed/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/v/([^/?]+)\') or \\\\\\n          match1(url, r\'youtube\\\\.com/watch/([^/?]+)\'

Because each line in the file is valid JSON, we can parse the first row and display it in a more human-readable form:

In [5]:
# Parse the first JSON line into a dict, then pretty-print it.
first_record = json.loads(sample_file[0])
pprint(first_record)

{'code': 'def get_vid_from_url(url):\n'
         '        """Extracts video ID from URL.\n'
         '        """\n'
         "        return match1(url, r'youtu\\.be/([^?/]+)') or \\\n"
         "          match1(url, r'youtube\\.com/embed/([^/?]+)') or \\\n"
         "          match1(url, r'youtube\\.com/v/([^/?]+)') or \\\n"
         "          match1(url, r'youtube\\.com/watch/([^/?]+)') or \\\n"
         "          parse_query_param(url, 'v') or \\\n"
         "          parse_query_param(parse_query_param(url, 'u'), 'v')",
 'code_tokens': ['def',
                 'get_vid_from_url',
                 '(',
                 'url',
                 ')',
                 ':',
                 'return',
                 'match1',
                 '(',
                 'url',
                 ',',
                 "r'youtu\\.be/([^?/]+)'",
                 ')',
                 'or',
                 'match1',
                 '(',
                 'url',
                 ',',
        

Definitions of each of the above fields are located in the README.md file in the root of this repository.

In [7]:
# Recursively collect every gzip shard under python/, in sorted path order.
python_files = sorted(Path('python').rglob('*.gz'))

In [8]:
# Report how many shards the glob found, with a thousands separator.
n_files = len(python_files)
print(f'Total number of files: {n_files:,}')

Total number of files: 15


To make analysis of this dataset easier, we can load all of the data into a pandas dataframe: 

In [9]:
# Full set of columns kept when loading the dataset.
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

# Reduced set: token lists plus language/partition labels only.
columns_short_list = ['code_tokens', 'docstring_tokens', 
                      'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame.

    Args:
        file_list: Iterable of paths to gzip-compressed JSON Lines files.
        columns: Column names to keep from every file, in this order.

    Returns:
        A DataFrame with the rows of all files stacked in input order. Each
        file keeps its own row index, so index values repeat across files.
        An empty ``file_list`` yields an empty DataFrame with ``columns``.
    """
    columns = list(columns)
    frames = [pd.read_json(path,
                           orient='records',
                           compression='gzip',
                           lines=True)[columns]
              for path in file_list]
    # pd.concat raises on an empty list; return an empty frame of the same shape.
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, sort=False)

This is what the python dataset looks like:

In [10]:
# Load every shard found above, keeping the long column list.
df = jsonl_list_to_dataframe(python_files, columns=columns_long_list)

In [11]:
# Preview the first three rows.
df.iloc[:3]

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L46-L108,"def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n """"""\n Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in s...","[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...","Trains a k-nearest neighbors classifier for face recognition.\n\n :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n (View in source code to see train_dir example tree structure)\n\n Structure:\n <train_dir>/\n ├── <person...","[Trains, a, k, -, nearest, neighbors, classifier, for, face, recognition, .]",python,train
1,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L111-L150,"def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n """"""\n Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recognized\n :param knn_clf: (optional) a knn classifier object. if not specified, model_s...","[def, predict, (, X_img_path, ,, knn_clf, =, None, ,, model_path, =, None, ,, distance_threshold, =, 0.6, ), :, if, not, os, ., path, ., isfile, (, X_img_path, ), or, os, ., path, ., splitext, (, X_img_path, ), [, 1, ], [, 1, :, ], not, in, ALLOWED_EXTENSIONS, :, raise, Exception, (, ""Invalid im...","Recognizes faces in given image using a trained KNN classifier\n\n :param X_img_path: path to image to be recognized\n :param knn_clf: (optional) a knn classifier object. if not specified, model_save_path must be specified.\n :param model_path: (optional) path to a pickled knn classifie...","[Recognizes, faces, in, given, image, using, a, trained, KNN, classifier]",python,train
2,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/blob/c96b010c02f15e8eeb0f71308c641179ac1f19bb/examples/face_recognition_knn.py#L153-L181,"def show_prediction_labels_on_image(img_path, predictions):\n """"""\n Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param predictions: results of the predict function\n :return:\n """"""\n pil_image = Image.open(img_path).conv...","[def, show_prediction_labels_on_image, (, img_path, ,, predictions, ), :, pil_image, =, Image, ., open, (, img_path, ), ., convert, (, ""RGB"", ), draw, =, ImageDraw, ., Draw, (, pil_image, ), for, name, ,, (, top, ,, right, ,, bottom, ,, left, ), in, predictions, :, # Draw a box around the face u...",Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param predictions: results of the predict function\n :return:,"[Shows, the, face, recognition, results, visually, .]",python,train


In [12]:
# Row count per language label.
df['language'].value_counts()

language
python    435285
Name: count, dtype: int64

In [13]:
# Keep only the raw source-code column.
data = df.loc[:, 'code']

In [14]:
# Number of code samples.
data.shape[0]

435285

In [15]:
# Take the first 200,000 samples by position.
selected = data.iloc[:200000]

In [17]:
def clean_method(method):
    """Strip docstrings, comments and blank lines from a method's source.

    Args:
        method: Source code of a single function/method as one string.

    Returns:
        The source with triple-quoted blocks and ``#`` comments removed,
        trailing whitespace trimmed, runs of newlines collapsed to one, and
        leading/trailing whitespace stripped.

    Note:
        This is a regex heuristic, not a parser: every triple-quoted string
        is removed (not only docstrings), and a ``#`` inside an ordinary
        string literal is still treated as the start of a comment.
    """
    # Remove triple-quoted blocks first so a '#' inside a docstring is not
    # mistaken for a comment.
    cleaned_method = re.sub(r'"""[\s\S]*?"""', '', method)
    cleaned_method = re.sub(r"'''[\s\S]*?'''", '', cleaned_method)
    # Remove comments
    cleaned_method = re.sub(r'#.*', '', cleaned_method)
    # Trim trailing whitespace so lines emptied above become truly blank and
    # are collapsed by the newline step below.
    cleaned_method = re.sub(r'[ \t]+$', '', cleaned_method, flags=re.MULTILINE)
    # Remove excessive newlines and spaces
    cleaned_method = re.sub(r'\n+', '\n', cleaned_method).strip()
    return cleaned_method

# Mask if-conditions
# An `if`/`elif` statement: the keyword must open a (possibly indented) line.
# The trailing \b stops identifiers such as `if_flag` from matching.
_IF_KEYWORD = re.compile(r'^[ \t]*(?:el)?if\b', re.MULTILINE)


def _find_header_colon(source, start):
    """Return the index of the ':' ending the statement header that begins at
    `start`, or None if the statement ends without one."""
    depth = 0      # nesting level of (), [] and {}
    quote = None   # quote character of the string literal we are inside, if any
    i = start
    while i < len(source):
        ch = source[i]
        if ch == '\\':
            i += 2  # escaped character or backslash line continuation
            continue
        if quote is not None:
            if ch == quote:
                quote = None
            elif ch == '\n':
                return None  # unterminated / multi-line string: give up
        elif ch in '\'"':
            quote = ch
        elif ch in '([{':
            depth += 1
        elif ch in ')]}':
            depth -= 1
        elif ch == '\n':
            # A newline outside brackets ends the statement. depth < 0 means
            # the `if` sat inside a bracket opened earlier (a comprehension).
            if depth <= 0:
                return None
        elif ch == ':' and depth == 0 and source[i + 1:i + 2] != '=':
            # Colons inside brackets (slices, dicts) and ':=' are skipped.
            return i
        i += 1
    return None


def _if_condition_spans(func):
    """Yield (start, end) offsets of the text between each if/elif keyword
    and the colon that closes its header."""
    pos = 0
    while True:
        keyword = _IF_KEYWORD.search(func, pos)
        if keyword is None:
            return
        start = keyword.end()
        colon = _find_header_colon(func, start)
        if colon is not None and func[start:colon].strip():
            yield start, colon
            pos = colon
        else:
            pos = start


def _strip_outer_parens(condition):
    """Drop one pair of parentheses when it wraps the whole condition."""
    if not (condition.startswith('(') and condition.endswith(')')):
        return condition
    depth = 0
    for i, ch in enumerate(condition):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0 and i < len(condition) - 1:
                # The first '(' closes early, e.g. "(a) and (b)".
                return condition
    return condition[1:-1]


def mask_if_condition(func, count=0):
    """Replace the condition of `if`/`elif` statements with ``<mask>``.

    Both parenthesised (``if (x):``) and bare (``if x:``) conditions are
    handled. Conditional expressions and comprehension filters are left alone.

    Args:
        func: Source code of a function as one string.
        count: Maximum number of conditions to mask; 0 (default) masks all.
            Pass 1 to mask only the condition `extract_if_condition` returns.

    Returns:
        The source with each masked header rewritten as ``if <mask>:`` /
        ``elif <mask>:``.
    """
    pieces = []
    cursor = 0
    for n, (start, end) in enumerate(_if_condition_spans(func), 1):
        pieces.append(func[cursor:start])
        pieces.append(' <mask>')
        cursor = end
        if count and n >= count:
            break
    pieces.append(func[cursor:])
    return ''.join(pieces)


def extract_if_condition(func):
    """Return the condition of the first `if`/`elif` statement in `func`.

    Args:
        func: Source code of a function as one string.

    Returns:
        The condition text, without one pair of parentheses that wraps the
        whole condition, or None when no `if`/`elif` statement is found.
    """
    for start, end in _if_condition_spans(func):
        return _strip_outer_parens(func[start:end].strip())
    return None

# Tokenize
def tokenize_method(method, tokenizer):
    """Split `method` into tokens by calling `tokenizer.tokenize` on it."""
    tokens = tokenizer.tokenize(method)
    return tokens

# Flatten method into a single string
def flatten_method(tokens):
    """Join `tokens` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

In [None]:
# NOTE(review): the notebook imports RobertaTokenizerFast but never creates a
# tokenizer. 'roberta-base' is a placeholder checkpoint — confirm which
# vocabulary the downstream model expects.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# `selected` is a Series of raw code on the first run and the processed
# DataFrame afterwards; reading its 'code' column in the latter case keeps
# this cell re-runnable.
raw_methods = selected['code'] if isinstance(selected, pd.DataFrame) else selected

records = []
for function in raw_methods:
    cleaned_function = clean_method(function)
    # Extract from the cleaned text so the target matches what gets masked.
    target_block = extract_if_condition(cleaned_function)
    if not target_block:  # Only include methods with if-statements
        continue
    # NOTE(review): mask_if_condition masks every matching condition, while
    # target_block holds only the first one — confirm this pairing is intended.
    input_function = mask_if_condition(cleaned_function)
    tokens_in_method = tokenize_method(cleaned_function, tokenizer)
    flattened_function = flatten_method(tokens_in_method)
    records.append({
        'code': function,
        'cleaned_method': cleaned_function,
        'input_method': input_function,
        'target_block': target_block,
        'tokens_in_method': tokens_in_method,
        # Column name spelling kept as in the original notebook.
        'flattend_method': flattened_function,
    })

# Build the frame once from the collected rows (one row per kept method) and
# rebind `selected` so the split cell operates on the processed data.
selected = pd.DataFrame(
    records,
    columns=['code', 'cleaned_method', 'input_method', 'target_block',
             'tokens_in_method', 'flattend_method'],
)

In [16]:
# Hold out 20% of `selected` for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(selected, random_state=42, test_size=0.2)

# Save the splits to CSV files
for split_frame, csv_name in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)