Cells in this notebook are responsible to perform all the operations to divide your dataset into a number of parquet files by the number of lines in a whole project

In [1]:
import pandas as pd
import dask.dataframe as dd
import time
import json
from tqdm.auto import tqdm
import os

In [2]:
data_dir = '../../data'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)


### Creating a dictionary to map repository names to `int`

 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names 
 - `repo_to_id_filename` -- name of json file with the mapping, will be saved in `data_dir`

In [3]:
parquet_path = '/mnt/data/glukhov/big_context_python_starcoder.parquet'

df = dd.read_parquet(parquet_path, engine='pyarrow')

print(df.head())


                         max_stars_repo_name   
0                               crempp/mdweb  \
1                       lrq3000/markdown2zim   
2                            MrOlm/glrestore   
3                           bouvierj/chimbot   
4  issa-project/entity-fishing-client-python   

                                      joined_content  
0  tests/test_debug_helper.py𐌼# -*- coding: utf-8...  
1  markdown2zim.py𐌼#!/usr/bin/python2\n# -*- codi...  
2  glrestore/glrestore.py𐌼<reponame>MrOlm/glresto...  
3  plugins/starter.py𐌼import time\nimport re\nimp...  
4  nerd/nerd_client.py𐌼<reponame>issa-project/ent...  


In [4]:
repo_name_column = 'max_stars_repo_name'

df = df[repo_name_column].unique().compute()
names_list = df.to_list()

repo_name_to_id = {name: idx for idx, name in enumerate(names_list)}

print('Number of unique repo names: ', len(repo_name_to_id.keys()))


Number of unique repo names:  1678572


In [5]:
repo_to_id_filename = 'repo_to_id.json'

with open(os.path.join(data_dir, repo_to_id_filename), 'w') as f:
    json.dump(repo_name_to_id, f)
    

### Calculating project lenghts (in lines)

 - `repo_to_id_filename` -- name of json file with the repo_names -> int mapping, should be in `data_dir`
 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names
 - `content_column` -- column with the content (`str`)
 - `lines_count_filename` -- parquet file with two `int` columns: `repo_id`, `content_length`, will be saved to `data_dir`

In [6]:
def calculate_lines(content: str, newline_caracter='\n') -> int:
    return content.count(newline_caracter)

def repo_to_id(reponame: str, repo_to_id_dict: dict[str, int]) -> int:
    return repo_to_id_dict[reponame]


In [77]:
repo_to_id_filename = 'repo_to_id.json'

with open(os.path.join(data_dir, repo_to_id_filename), 'r') as f:
    repo_to_id_dict = json.load(f)
    

In [8]:
parquet_path = '/mnt/data/glukhov/python_starcoder.parquet'

df = dd.read_parquet(parquet_path, engine='pyarrow')

print(df.head())


                                 max_stars_repo_path   
0                         public_data/serializers.py  \
1                              quick_search/admin.py   
2                                      rasa/train.py   
3  coding_intereview/1475. Final Prices With a Sp...   
4               rplugin/python3/denite/ui/default.py   

                max_stars_repo_name id   
0                   MTES-MCT/sparte  0  \
1     naman1901/django-quick-search  1   
2  Amirali-Shirkh/rasa-for-botfront  2   
3        Jahidul007/Python-Bootcamp  3   
4            timgates42/denite.nvim  4   

                                             content  
0  <reponame>MTES-MCT/sparte\nfrom rest_framework...  
1  from django.contrib import admin\nfrom .models...  
2  import asyncio\nimport os\nimport tempfile\nfr...  
3  <gh_stars>1-10\nclass Solution:\n    def final...  


In [9]:
repo_name_column = 'max_stars_repo_name'
content_column = 'content'

df = df[[repo_name_column, content_column]]

df['repo_id'] = df[repo_name_column].apply(lambda x: repo_to_id(x, repo_to_id_dict), meta=('repo_id', int))
df = df[['repo_id', content_column]]

df['content_length'] = df[content_column].apply(lambda x: calculate_lines(x,), meta=('content_length', int))
df = df[['repo_id', 'content_length']]


In [10]:
lines_count_filename = 'lines_per_file.parquet'

df.compute().to_parquet(os.path.join(data_dir, lines_count_filename), index=False)


### Creating a repositories split (outliers and percentiles)

Dataset wil be divided into `num_parts` parts (100%/`num_parts` of all projects without the outliers each) and one part with the outliers. Corresponding `repo_id`'s will be saved in csv-files.

 - `lines_count_filename` -- parquet file with two `int` columns: `repo_id`, `content_length`, should be in `data_dir`
 - `split_dir_name` -- name of directory inside `dir_name` with split files
 - `outlier_percentile` -- percentile for outliers
 - `num_parts` -- number of categories in the split 


In [11]:
lines_count_filename = 'lines_per_file.parquet'

df = pd.read_parquet(os.path.join(data_dir, lines_count_filename))  # change to dd.read_parquet if file is too big


In [12]:
split_dir_name = 'repo_ids_by_length'

if not os.path.exists(os.path.join(data_dir, split_dir_name)):
    os.mkdir(os.path.join(data_dir, split_dir_name))
    


In [13]:
outlier_percentile = 0.995

df_repo_len = df.groupby('repo_id')['content_length'].sum()
outlier_value = df_repo_len.quantile(outlier_percentile)


In [14]:
num_parts = 6

split = pd.qcut(df_repo_len[df_repo_len<=outlier_value], q=num_parts).rename('bin')


In [15]:
for category in split.unique():
    cat_df = split[split==category].reset_index()['repo_id']
    filename = 'repo_ids_lines_' + f'{category}'[1:-1].replace('.0', '').replace(', ', '_') + '.csv'
    cat_df.to_csv(os.path.join(data_dir, split_dir_name, filename), index=False)
    
df_largest = df_repo_len[df_repo_len>outlier_value].reset_index()['repo_id']
filename = 'repo_ids_lines_' + f'{int(outlier_value)}_inf' + '.csv'
df_largest.to_csv(os.path.join(data_dir, split_dir_name, filename), index=False)


In [16]:
print(df_largest.shape, outlier_value)


(8393,) 16769.14500000002


### Dividing the dataset by the projects lengths

 - `repo_to_id_filename` -- name of json file with the repo_names -> int mapping, should be in `data_dir`
 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names
 - `content_column` -- column with the content (`str`)
 - `filename_column`
 - `split_dir_name` -- name of directory inside `dir_name` with csv split files
 - `dataset_dir` -- name of directory inside `dir_name` with splitted dataset
 

In [17]:
repo_to_id_filename = 'repo_to_id.json'

with open(os.path.join(data_dir, repo_to_id_filename), 'r') as f:
    repo_to_id_dict = json.load(f)
    

In [18]:
parquet_path = '/mnt/data/glukhov/python_starcoder.parquet'

df = dd.read_parquet(parquet_path, engine='pyarrow')

print(df.head())


                                 max_stars_repo_path   
0                         public_data/serializers.py  \
1                              quick_search/admin.py   
2                                      rasa/train.py   
3  coding_intereview/1475. Final Prices With a Sp...   
4               rplugin/python3/denite/ui/default.py   

                max_stars_repo_name id   
0                   MTES-MCT/sparte  0  \
1     naman1901/django-quick-search  1   
2  Amirali-Shirkh/rasa-for-botfront  2   
3        Jahidul007/Python-Bootcamp  3   
4            timgates42/denite.nvim  4   

                                             content  
0  <reponame>MTES-MCT/sparte\nfrom rest_framework...  
1  from django.contrib import admin\nfrom .models...  
2  import asyncio\nimport os\nimport tempfile\nfr...  
3  <gh_stars>1-10\nclass Solution:\n    def final...  


In [19]:
repo_name_column = 'max_stars_repo_name'
content_column = 'content'
filename_column = 'max_stars_repo_path'

df = df[[repo_name_column, content_column, filename_column]]

df['repo_id'] = df[repo_name_column].apply(lambda x: repo_to_id(x, repo_to_id_dict), meta=('repo_id', int))
df = df[['repo_id', content_column, filename_column]]


In [20]:
split_dir_name = 'repo_ids_by_length'
dataset_dir = 'length_divided_dataset'

if not os.path.exists(os.path.join(data_dir, dataset_dir)):
    os.mkdir(os.path.join(data_dir, dataset_dir))
    

In [None]:
for filename in tqdm(os.listdir(os.path.join(data_dir, split_dir_name))):
    repo_ids = pd.read_csv(os.path.join(data_dir, split_dir_name, filename))
    curr_df = df.merge(repo_ids, on='repo_id', how='inner')
    file_path = os.path.join(data_dir, dataset_dir, f'{filename.replace("repo_ids_lines", "num_lines").replace(".csv","")}.parquet')
    curr_df.sort_values('repo_id').to_parquet(file_path, engine='pyarrow', write_index=False)
    

  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
split.unique()


To load the data use:
 1. `pd.read_parquet('./data/length_divided_dataset/num_lines_278_538.parquet')` 
 1. `pd.read_parquet('./data/length_divided_dataset/num_lines_278_538.parquet/part.12.parquet')`
 

### Creating jsons with 2 mappings `file` <-> `repo_id`

In [None]:
parquet_content = dict()
for parquet_dir in tqdm(os.listdir(os.path.join(data_dir, dataset_dir))):
    for parquet_file in tqdm(os.listdir(os.path.join(data_dir, dataset_dir, parquet_dir))):
        filepath = os.path.join(data_dir, dataset_dir, parquet_dir, parquet_file)
        df = pd.read_parquet(filepath)
        parquet_content[os.path.join(parquet_dir, parquet_file)] = df['repo_id'].unique().tolist()

with open(os.path.join(data_dir, 'file_to_repo_id.json'), 'w') as f:
    json.dump(parquet_content, f)


In [None]:
with open(os.path.join(data_dir, 'file_to_repo_id.json'), 'r') as f:
    parquet_content = json.load(f)

id_2_filename = dict()
for filename, repo_ids in tqdm(parquet_content.items()):
    for repo_id in repo_ids:
        if repo_id not in id_2_filename:
            id_2_filename[repo_id] = list()

        id_2_filename[repo_id].append(filename)

with open(os.path.join(data_dir, 'repo_id_to_file.json'), 'w') as f:
    json.dump(id_2_filename, f)
    

In [None]:
print('Average number of partition files for a project: ', sum(len(i) for i in id_2_filename.values()) / len(id_2_filename.values())

In [3]:
os.listdir(os.path.join(data_dir))

['repo_id_to_file.json',
 'repo_to_id.json',
 'lines_per_file.parquet',
 'test_ids.json',
 'length_divided_dataset',
 'file_to_repo_id.json',
 'python_stack_split.json',
 'repo_ids_by_length']

In [4]:
repo_to_id_filename = 'repo_to_id.json'

with open(os.path.join(data_dir, repo_to_id_filename), 'r') as f:
    repo_to_id_dict = json.load(f)
    

In [5]:
split_filename = 'python_stack_split.json'

with open(os.path.join(data_dir, split_filename), 'r') as f:
    split_dict = json.load(f)
    

In [6]:
assert set(repo_to_id_dict.keys()) == set(split_dict.keys())


In [7]:
print(set(split_dict.values()))


{'Split.VAL', 'Split.TRAIN', 'Split.TEST'}


In [8]:
test_ids = list()

for repo_name in split_dict.keys():
    if 'test' in split_dict[repo_name].lower():
        test_ids.append(repo_to_id_dict[repo_name])
        
with open(os.path.join(data_dir, 'test_ids.json'), 'w') as f:
    json.dump(test_ids, f)
        

In [9]:
print(f'Number of test repositories: {len(test_ids)}')

Number of test repositories: 84162


In [10]:
test_ids_filename = 'test_ids.json'

with open(os.path.join(data_dir, test_ids_filename), 'r') as f:
    test_ids = json.load(f)
    

In [11]:
split_dir_name = 'repo_ids_by_length'

for filename in os.listdir(os.path.join(data_dir, split_dir_name)):
    df = pd.read_csv(os.path.join(data_dir, split_dir_name, filename))
    print('For', filename[9:-4].replace('_', ' '),  'number of repositories is: ', df.isin(test_ids).sum().iloc[0],
          'out of', df.size
         )
    

For lines 16769 inf number of repositories is:  406 out of 8393
For lines 278 538 number of repositories is:  13846 out of 278211
For lines -001 64 number of repositories is:  14111 out of 281809
For lines 1239 16769 number of repositories is:  13936 out of 278221
For lines 145 278 number of repositories is:  13860 out of 276442
For lines 64 145 number of repositories is:  13858 out of 277500
For lines 538 1239 number of repositories is:  14145 out of 277996


In [12]:
id_2_parquet_filename = 'repo_id_to_file.json'

with open(os.path.join(data_dir, id_2_parquet_filename), 'r') as f:
    id_2_parquet = json.load(f)


parquet_2_id_filename = 'file_to_repo_id.json'

with open(os.path.join(data_dir, parquet_2_id_filename), 'r') as f:
    parquet_2_id = json.load(f)
    
    

In [13]:
dataset_dir = 'length_divided_dataset'

df_list = list()

for parquet_fn in tqdm(os.listdir(os.path.join(data_dir, dataset_dir))):
    parquet_path = os.path.join(data_dir, dataset_dir, parquet_fn)
    print(parquet_fn)
    df_part_list = list()
    for partition_fn in tqdm(os.listdir(parquet_path)):
        
        test_ids_in_file = list(set(parquet_2_id[f'{parquet_fn}/{partition_fn}']).intersection(test_ids))
        partition_path = os.path.join(parquet_path, partition_fn)
        df_part = pd.read_parquet(partition_path)
        df_part = df_part[df_part.repo_id.isin(test_ids_in_file)]
        df_part_list.append(df_part)
    df_list.append(pd.concat(df_part_list))
        
df = pd.concat(df_list)

df = df.reset_index(drop=True)
df = df.sort_values(by='repo_id')

df.to_parquet(os.path.join(data_dir, 'test_dataset.parquet'), index=False)


  0%|          | 0/7 [00:00<?, ?it/s]

num_lines_-001_64.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_538_1239.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_1239_16769.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_145_278.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_16769_inf.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_278_538.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

num_lines_64_145.parquet


  0%|          | 0/470 [00:00<?, ?it/s]

Unnamed: 0,repo_id,content,max_stars_repo_path
8,1394803,import os\nimport testinfra.utils.ansible_runn...,molecule/resources/tests/test_default.py
20,1394891,<reponame>r-mittal/inception-machina<filename>...,handling.py
46,1395035,<filename>HW0/ec2OnAWS/src/aws_ec2.py\nimport ...,HW0/ec2OnAWS/src/aws_ec2.py
67,1395095,"<filename>sphinxnotes/mock/__init__.py\n""""""\n ...",sphinxnotes/mock/__init__.py
70,1395115,import synthtool as s\nimport synthtool.gcp as...,synth.py


In [55]:
df = pd.read_parquet(os.path.join(data_dir, 'test_dataset.parquet'),)


In [56]:
df['is_context'] = True

number_of_files = df.groupby('repo_id')['is_context'].sum()
repo_ids_multiple_files = number_of_files[number_of_files > 1].index.to_list()
df_multiple_files = df[df['repo_id'].isin(repo_ids_multiple_files)]

index_to_completion = df_multiple_files.groupby('repo_id').sample(1).index.to_list()
df_multiple_files['is_context'].loc[index_to_completion] = False


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_multiple_files['is_context'].loc[index_to_completion] = False


In [57]:
print('Number of projects with the following number of files')
(df.groupby('repo_id').agg(list).reset_index()['is_context'].apply(lambda x: len(x))).value_counts().head(10)


Number of projects with the following number of files


is_context
1     26985
2     12361
3      7993
4      5972
5      4445
6      3583
7      2886
8      2386
9      1918
10     1584
Name: count, dtype: int64

In [62]:
df_context_split = df_multiple_files.groupby(['repo_id', 'is_context']).agg(list).reset_index()
df_context_split = df_context_split.rename(columns={'max_stars_repo_path': 'filename'})

df_context_split.head()


Unnamed: 0,repo_id,is_context,content,filename
0,36,False,[<reponame>onlyhavecans/mmPython\nfrom datetim...,[mm/utils.py]
1,36,True,[<gh_stars>0\n#!/usr/bin/env python\nfrom setu...,"[setup.py, mm/session.py, mm/logger.py, mm/fif..."
2,60,False,[<gh_stars>0\n__author__ = 'maartenbreddels'\n...,[packages/vaex-core/vaex/dataset_mmap.py]
3,60,True,[<reponame>yohplala/vaex\n__version_tuple__ = ...,"[packages/vaex-core/vaex/core/_version.py, tes..."
4,77,False,[import argparse\n\nfrom bioplottemplates.libs...,[src/bioplottemplates/cli_labeldots.py]


In [63]:
split_dir_name = 'repo_ids_by_length'

for filename in os.listdir(os.path.join(data_dir, split_dir_name)):
    len_ids_df = pd.read_csv(os.path.join(data_dir, split_dir_name, filename))
    print('For', filename[9:-4].replace('_', ' '),  'number of repositories is: ', len_ids_df.isin(repo_ids_multiple_files).sum().iloc[0],
          'out of', len_ids_df.size
         )
    

For lines 16769 inf number of repositories is:  405 out of 8393
For lines 278 538 number of repositories is:  11450 out of 278211
For lines -001 64 number of repositories is:  3058 out of 281809
For lines 1239 16769 number of repositories is:  13646 out of 278221
For lines 145 278 number of repositories is:  9219 out of 276442
For lines 64 145 number of repositories is:  6385 out of 277500
For lines 538 1239 number of repositories is:  13014 out of 277996


In [85]:
id_to_repo_dict = {v: k for k, v in repo_to_id_dict.items()}

assert df_context_split.shape[0] % 2 == 0
df_context_split = df_context_split.sort_values(by=['repo_id', 'is_context'])

benchmark_data = list()

for i in tqdm(range(df_context_split.shape[0] // 2)):
    datapoint = dict()
    
    row_completion = df_context_split.iloc[2*i]
    row_context = df_context_split.iloc[2*i + 1]
    assert row_completion['repo_id'] == row_context['repo_id']
    
    if row_completion['is_context']:
        row_completion, row_context = row_context, row_completion
    assert row_context['is_context'] and (not row_completion['is_context'])
    assert int(row_context['repo_id']) == row_context['repo_id']
    
    datapoint['repo_id'] = int(row_context['repo_id'])
    datapoint['repo_name'] = id_to_repo_dict[row_context['repo_id']]
    
    assert len(row_context['filename']) == len(row_context['content'])
    context = dict(zip(row_context['filename'], row_context['content']))
    datapoint['context'] = context
    
    assert len(row_completion['filename']) == len(row_completion['content'])
    completion = dict(zip(row_completion['filename'], row_completion['content']))
    datapoint['completion'] = completion
    
    benchmark_data.append(datapoint)
    
    

  0%|          | 0/57177 [00:00<?, ?it/s]

In [86]:
with open(os.path.join(data_dir, 'benchmark_data.json'), 'w') as f:
    json.dump(benchmark_data, f)
