The cells in this notebook perform all the operations needed to divide your dataset into a number of parquet files, grouped by the total number of lines in each project.

In [3]:
import pandas as pd
import dask.dataframe as dd
import time
import json
from tqdm.auto import tqdm
import os

In [4]:
# Root directory for every intermediate artifact written by this notebook.
data_dir = '../../data'
# makedirs(exist_ok=True) creates missing parent directories as well and does not
# fail when the directory already exists, so no separate existence check is needed.
os.makedirs(data_dir, exist_ok=True)


### Creating a dictionary to map repository names to `int`

 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names 
 - `repo_to_id_filename` -- name of json file with the mapping, will be saved in `data_dir`

In [None]:
# Dataset from which the repository names are collected.
# NOTE(review): the later sections read '/mnt/data/glukhov/python_starcoder.parquet' instead;
# confirm both files contain the same repositories, otherwise the name -> id lookup
# performed later can raise KeyError.
parquet_path = '/mnt/data/glukhov/big_context_python_starcoder_tokenized.parquet'

# Open the dataset as a dask dataframe using the pyarrow engine.
df = dd.read_parquet(path=parquet_path, engine='pyarrow')

# Print the first rows as a quick look at the columns.
print(df.head(n=5))


In [None]:
repo_name_column = 'max_stars_repo_name'

# Store the unique names under their own name instead of rebinding `df`:
# `df` stays the dataset frame, so this cell can be re-run safely.
unique_repo_names = df[repo_name_column].unique().compute()
names_list = unique_repo_names.to_list()

# Each repository name gets the position at which it appears in `names_list` as its id.
repo_name_to_id = {name: idx for idx, name in enumerate(names_list)}

print('Number of unique repo names: ', len(repo_name_to_id))


In [None]:
repo_to_id_filename = 'repo_to_id.json'

# Persist the repository name -> id mapping as JSON inside `data_dir`.
mapping_path = os.path.join(data_dir, repo_to_id_filename)
with open(mapping_path, 'w') as mapping_file:
    json.dump(repo_name_to_id, mapping_file)
    

### Calculating project lengths (in lines)

 - `repo_to_id_filename` -- name of json file with the repo_names -> int mapping, should be in `data_dir`
 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names
 - `content_column` -- column with the content (`str`)
 - `lines_count_filename` -- parquet file with two `int` columns: `repo_id`, `content_length`, will be saved to `data_dir`

In [None]:
def calculate_lines(content: str, newline_caracter='\n') -> int:
    """Return the number of occurrences of `newline_caracter` in `content`.

    A last line that does not end with `newline_caracter` is therefore not counted.
    """
    newline_total = content.count(newline_caracter)
    return newline_total

def repo_to_id(reponame: str, repo_to_id_dict: dict[str, int]) -> int:
    """Look up the integer id stored for `reponame` in `repo_to_id_dict`.

    Raises:
        KeyError: if `reponame` is not a key of `repo_to_id_dict`.
    """
    repo_id = repo_to_id_dict[reponame]
    return repo_id


In [None]:
repo_to_id_filename = 'repo_to_id.json'

# Load the repository name -> id mapping from the JSON file in `data_dir`.
mapping_path = os.path.join(data_dir, repo_to_id_filename)
with open(mapping_path, 'r') as mapping_file:
    repo_to_id_dict = json.load(mapping_file)
    

In [None]:
# Initial dataset whose files are measured in this section.
parquet_path = '/mnt/data/glukhov/python_starcoder.parquet'

# Open the dataset as a dask dataframe using the pyarrow engine.
df = dd.read_parquet(path=parquet_path, engine='pyarrow')

# Print the first rows as a quick look at the columns.
print(df.head(n=5))


In [None]:
repo_name_column = 'max_stars_repo_name'
content_column = 'content'

# Work only with the two columns needed for the line statistics.
name_and_content = df[[repo_name_column, content_column]]

# Integer id of the repository each file belongs to (KeyError for names missing from the mapping).
repo_id_series = name_and_content[repo_name_column].apply(
    lambda name: repo_to_id(name, repo_to_id_dict),
    meta=('repo_id', int),
)

# Number of newline characters in each file.
line_count_series = name_and_content[content_column].apply(
    calculate_lines,
    meta=('content_length', int),
)

# Final frame: one row per file with the two integer columns.
df = name_and_content.assign(
    repo_id=repo_id_series,
    content_length=line_count_series,
)[['repo_id', 'content_length']]


In [None]:
lines_count_filename = 'lines_per_file.parquet'

# Compute the dask frame and write the result as a parquet file without the index.
lines_count_path = os.path.join(data_dir, lines_count_filename)
df.compute().to_parquet(lines_count_path, index=False)


### Creating a repositories split (outliers and percentiles)

The dataset will be divided into `num_parts` parts (each containing 100%/`num_parts` of all projects, outliers excluded) plus one extra part with the outliers. The corresponding `repo_id`s will be saved in csv files.

 - `lines_count_filename` -- parquet file with two `int` columns: `repo_id`, `content_length`, should be in `data_dir`
 - `split_dir_name` -- name of the directory inside `data_dir` where the split files will be saved
 - `outlier_percentile` -- percentile for outliers
 - `num_parts` -- number of categories in the split 


In [None]:
lines_count_filename = 'lines_per_file.parquet'

# Per-file line counts (`repo_id`, `content_length`) written by the previous section.
# change to dd.read_parquet if file is too big
lines_count_path = os.path.join(data_dir, lines_count_filename)
df = pd.read_parquet(lines_count_path)


In [None]:
split_dir_name = 'repo_ids_by_length'

# Directory for the csv files with the repo ids of every length category.
# makedirs(exist_ok=True) creates missing parents and does not fail if the directory already exists.
os.makedirs(os.path.join(data_dir, split_dir_name), exist_ok=True)
    


In [None]:
outlier_percentile = 0.995

# Total number of lines per repository: sum of `content_length` over all its files.
df_repo_len = df.groupby(by='repo_id')['content_length'].agg('sum')

# Repositories with more lines than this quantile are treated as outliers.
outlier_value = df_repo_len.quantile(q=outlier_percentile)


In [None]:
num_parts = 6

# Quantile-based binning of the non-outlier repositories into `num_parts` length bins.
is_regular = df_repo_len <= outlier_value
split = pd.qcut(df_repo_len[is_regular], q=num_parts).rename('bin')


In [None]:
def _edge_label(edge) -> str:
    """Render one bin edge for a file name, dropping only a trailing '.0'."""
    text = str(edge)
    return text[:-2] if text.endswith('.0') else text


# One csv with the repo ids of every length bin; the bin edges go into the file name.
for category in split.unique():
    cat_df = split[split == category].reset_index()['repo_id']
    # Build the name from the interval edges rather than calling replace('.0', '') on the
    # whole interval string: that also removed '.0' inside an edge
    # (e.g. '-0.001' -> '-001', '10.05' -> '105').
    filename = f'repo_ids_lines_{_edge_label(category.left)}_{_edge_label(category.right)}.csv'
    cat_df.to_csv(os.path.join(data_dir, split_dir_name, filename), index=False)

# Repositories above the outlier threshold are saved separately as '<threshold>_inf'.
df_largest = df_repo_len[df_repo_len > outlier_value].reset_index()['repo_id']
filename = 'repo_ids_lines_' + f'{int(outlier_value)}_inf' + '.csv'
df_largest.to_csv(os.path.join(data_dir, split_dir_name, filename), index=False)


In [None]:
# Shape of the outlier repo-id series and the line-count threshold that defines an outlier.
outlier_shape = df_largest.shape
print(outlier_shape, outlier_value)


### Dividing the dataset by the projects lengths

 - `repo_to_id_filename` -- name of json file with the repo_names -> int mapping, should be in `data_dir`
 - `parquet_path` -- path to the initial dataset
 - `repo_name_column` -- column with the repository names
 - `content_column` -- column with the content (`str`)
 - `filename_column` -- column with the file paths inside the repository
 - `split_dir_name` -- name of the directory inside `data_dir` with the csv split files
 - `dataset_dir` -- name of the directory inside `data_dir` where the split dataset will be saved
 

In [None]:
repo_to_id_filename = 'repo_to_id.json'

# Load the repository name -> id mapping from the JSON file in `data_dir`.
mapping_path = os.path.join(data_dir, repo_to_id_filename)
with open(mapping_path, 'r') as mapping_file:
    repo_to_id_dict = json.load(mapping_file)
    

In [None]:
# Initial dataset that is divided by project length in this section.
parquet_path = '/mnt/data/glukhov/python_starcoder.parquet'

# Open the dataset as a dask dataframe using the pyarrow engine.
df = dd.read_parquet(path=parquet_path, engine='pyarrow')

# Print the first rows as a quick look at the columns.
print(df.head(n=5))


In [None]:
repo_name_column = 'max_stars_repo_name'
content_column = 'content'
filename_column = 'max_stars_repo_path'

# Keep only the columns that are needed for the divided dataset.
selected_columns = df[[repo_name_column, content_column, filename_column]]

# Integer id of the repository each file belongs to (KeyError for names missing from the mapping).
repo_id_series = selected_columns[repo_name_column].apply(
    lambda name: repo_to_id(name, repo_to_id_dict),
    meta=('repo_id', int),
)

# Replace the repository name with its id; content and file path are kept as they are.
df = selected_columns.assign(repo_id=repo_id_series)[['repo_id', content_column, filename_column]]


In [None]:
split_dir_name = 'repo_ids_by_length'
dataset_dir = 'length_divided_dataset'

# Output directory for the divided dataset.
# makedirs(exist_ok=True) creates missing parents and does not fail if the directory already exists.
os.makedirs(os.path.join(data_dir, dataset_dir), exist_ok=True)
    

In [None]:
split_path = os.path.join(data_dir, split_dir_name)

# Process only the csv split files: os.listdir also returns any other entry of the
# directory (e.g. `.ipynb_checkpoints`), which pd.read_csv cannot read.
split_filenames = sorted(name for name in os.listdir(split_path) if name.endswith('.csv'))

for filename in tqdm(split_filenames):
    # Repo ids that belong to the current length category.
    repo_ids = pd.read_csv(os.path.join(split_path, filename))
    # The inner merge keeps only the files of the repositories in this category.
    curr_df = df.merge(repo_ids, on='repo_id', how='inner')
    # 'repo_ids_lines_<a>_<b>.csv' -> 'num_lines_<a>_<b>.parquet'
    category_name = os.path.splitext(filename)[0].replace('repo_ids_lines', 'num_lines')
    file_path = os.path.join(data_dir, dataset_dir, f'{category_name}.parquet')
    curr_df.sort_values('repo_id').to_parquet(file_path, engine='pyarrow', write_index=False)
    

In [None]:
# Display the distinct length bins; `split` is the `pd.qcut` result created in the repositories-split section.
split.unique()


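To load the data use one of the following (adjust the `./data` prefix to your `data_dir`):
 1. `pd.read_parquet('./data/length_divided_dataset/num_lines_278_538.parquet')` -- loads a whole length category (all partition files in the directory)
 1. `pd.read_parquet('./data/length_divided_dataset/num_lines_278_538.parquet/part.12.parquet')` -- loads a single partition file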
 

### Creating jsons with 2 mappings `file` <-> `repo_id`

In [None]:
# Maps '<category dir>/<partition file>' -> list of the repo ids stored in that partition.
parquet_content = dict()
dataset_path = os.path.join(data_dir, dataset_dir)
for parquet_dir in tqdm(os.listdir(dataset_path)):
    # Skip anything in the category directory that is not a parquet partition file.
    partition_files = [
        name for name in os.listdir(os.path.join(dataset_path, parquet_dir))
        if name.endswith('.parquet')
    ]
    for parquet_file in tqdm(partition_files):
        filepath = os.path.join(dataset_path, parquet_dir, parquet_file)
        # Only `repo_id` is needed, so the other columns (file content, file path) are not loaded.
        # A dedicated name is used so that the dataset frame `df` is not overwritten.
        partition_df = pd.read_parquet(filepath, columns=['repo_id'])
        parquet_content[os.path.join(parquet_dir, parquet_file)] = partition_df['repo_id'].unique().tolist()

with open(os.path.join(data_dir, 'file_to_repo_id.json'), 'w') as f:
    json.dump(parquet_content, f)


In [None]:
# Read back the partition file -> repo ids mapping.
file_to_repo_id_path = os.path.join(data_dir, 'file_to_repo_id.json')
with open(file_to_repo_id_path, 'r') as f:
    parquet_content = json.load(f)

# Invert it: repo id -> list of partition files that contain rows of this repository.
id_2_filename = dict()
for filename, repo_ids in tqdm(parquet_content.items()):
    for repo_id in repo_ids:
        # setdefault creates the list the first time a repo id is seen.
        id_2_filename.setdefault(repo_id, list()).append(filename)

repo_id_to_file_path = os.path.join(data_dir, 'repo_id_to_file.json')
with open(repo_id_to_file_path, 'w') as f:
    json.dump(id_2_filename, f)
    

In [None]:
# Mean number of partition files that contain rows of a single repository.
# (The closing parenthesis of print() was missing, which made the cell a SyntaxError.)
print('Average number of partition files for a project: ', sum(len(i) for i in id_2_filename.values()) / len(id_2_filename))