In [None]:
import requests
import zipfile
import io
import os

# URL of the ZIP file
zip_url = "https://github.com/reddy-lab-code-research/MuST-CoST/raw/refs/heads/main/CoST_data.zip"

# Local directory to extract the files
extract_path = "."

# Download the ZIP file. `requests` has no default timeout, so without one a
# stalled connection would block this cell forever. The tuple is
# (connect timeout, read timeout) in seconds.
response = requests.get(zip_url, timeout=(10, 120))
response.raise_for_status()  # Raise an error for failed requests

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Open the ZIP file straight from the in-memory response body and unpack it
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Files extracted to: {extract_path}")


## snippet data

In [None]:
import os
import glob

# Path to the directory containing processed data
path = './CoST_data_release/processed_data/snippet_data'

# Mapping of programming languages to their file extensions
code_formats = {
    'C': '.c',
    'C++': '.cpp',
    'C#': '.cs',
    'Java': '.java',
    'Javascript': '.js',
    'PHP': '.php',
    'Python': '.py'
}

# One list of snippet ids per (language-pair folder, language) combination.
all_indices = []
# Per language: snippet id -> matching line of the tokenized code file.
# The first occurrence of an id wins across folders and splits.
all_dict = {k: {} for k in code_formats}

# Iterate through each subdirectory in the specified path. Sorting makes the
# traversal order (and therefore the first-wins content of `all_dict`)
# reproducible across machines.
for p in sorted(glob.glob(os.path.join(path, '*'))):
    # Only "<lang1>-<lang2>" directories are data folders; ignore stray files.
    if not os.path.isdir(p):
        continue
    folder_name = os.path.basename(p)
    pair = folder_name.split('-')
    if len(pair) != 2:
        print(f"Skipping unexpected folder name: {folder_name}")
        continue

    for lang in pair:
        if lang not in code_formats:
            print(f"Skipping unknown language: {lang}")
            continue

        indices_temp = []
        # Process each data split (train, test, val)
        for split in ['train', 'test', 'val']:
            map_file_path = os.path.join(p, f'{split}-{lang}-map.jsonl')
            code_file_path = os.path.join(p, f'{split}-{folder_name}-tok{code_formats[lang]}')

            # Both files are needed; report the first one that is missing
            # (map file is checked before the code file).
            missing = next(
                (fp for fp in (map_file_path, code_file_path) if not os.path.exists(fp)),
                None,
            )
            if missing is not None:
                print(f"Skipping missing file: {missing}")
                continue

            # Read with an explicit encoding so the result does not depend on
            # the platform's locale default.
            with open(map_file_path, 'r', encoding='utf-8') as f:
                indices = [line.strip() for line in f]
            with open(code_file_path, 'r', encoding='utf-8') as f:
                codes = f.readlines()

            indices_temp.extend(indices)

            # Pair ids with code lines positionally (zip stops at the shorter
            # file). Within one file the last duplicate id wins; across files
            # an id that is already stored is left untouched.
            for key, value in dict(zip(indices, codes)).items():
                all_dict[lang].setdefault(key, value)

        all_indices.append(indices_temp)

In [None]:
# Reduce every id to "<first part>-<last part>" of its '-'-separated pieces
# (e.g. "1010-C#-2" becomes "1010-2"), keeping one inner list per input list.
all_indices_new = [
    [
        '-'.join((parts[0], parts[-1]))
        for parts in (raw_id.split('-') for raw_id in id_list)
    ]
    for id_list in all_indices
]

In [None]:
def intersection_of_all_lists(list_of_lists):
    """Return the elements that occur in every list of ``list_of_lists``.

    Args:
        list_of_lists: An iterable of iterables whose items are hashable.

    Returns:
        A list of the unique items present in all inner iterables, in
        arbitrary order. An empty ``list_of_lists`` yields ``[]`` instead of
        raising ``IndexError``.
    """
    sets = [set(lst) for lst in list_of_lists]
    if not sets:
        # Nothing to intersect; there are no common elements to report.
        return []
    return list(set.intersection(*sets))


# Ids present in every normalised id list. The name keeps its original
# spelling ("sinppet") because process_file looks it up under that name.
common_sinppet_ids = intersection_of_all_lists(list_of_lists=all_indices_new)

In [None]:
import os
import json

# input_path: directory holding the mapping JSONL files to read.
# output_path: directory that receives the filtered per-snippet files.
input_path, output_path = 'CoST_data_release/processed_data/map_data', 'code_snippets'

# Create the output directory up front; this is a no-op when it already exists.
os.makedirs(name=output_path, exist_ok=True)

In [None]:
# Function to process each file
def process_file(file_path, output_dir, common_ids=None):
    """Write the entries of one JSONL file whose base id is in the common set.

    The language is taken from the part of the file name before the first
    '-'. Every kept entry is written to
    ``<output_dir>/<language>/<base_id>.json`` as a single JSON line, with
    the ``bpe``, ``comment_bpe`` and ``desc_bpe`` fields removed when present.

    Args:
        file_path: Path of the JSONL file to read (one JSON object per line).
        output_dir: Root directory for the output; the per-language
            sub-directory is created if needed.
        common_ids: Optional iterable of base ids to keep. When omitted, the
            notebook-level ``common_sinppet_ids`` is used.

    Raises:
        KeyError: If an entry has no ``idx`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if common_ids is None:
        common_ids = common_sinppet_ids
    # A set gives O(1) membership tests instead of scanning a list per line.
    wanted_ids = set(common_ids)

    # basename handles both path separators, unlike splitting on '/'.
    language = os.path.basename(file_path).split('-')[0]

    output_dir_lang = os.path.join(output_dir, language)
    os.makedirs(output_dir_lang, exist_ok=True)

    with open(file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            entry = json.loads(line)
            # Base id = first and last '-'-separated parts of idx
            # (e.g. "1010-2" from "1010-C#-2"), the same rule used to build
            # the common id list.
            id_parts = entry['idx'].split('-')
            base_id = f"{id_parts[0]}-{id_parts[-1]}"
            if base_id not in wanted_ids:
                continue
            # Drop the BPE fields; pop() avoids a KeyError when one is absent.
            for bpe_key in ('bpe', 'comment_bpe', 'desc_bpe'):
                entry.pop(bpe_key, None)
            # Save each entry to a separate file based on its base_id
            output_file = os.path.join(output_dir_lang, f"{base_id}.json")
            with open(output_file, 'w', encoding='utf-8') as outfile:
                json.dump(entry, outfile)
                outfile.write('\n')


In [None]:
# Pick out the mapping files that sit directly inside input_path, then run
# process_file on each of them, writing below output_path.
mapping_files = [
    name for name in os.listdir(input_path)
    if name.endswith('mapping-tok.jsonl')
]
for mapping_name in mapping_files:
    process_file(os.path.join(input_path, mapping_name), output_path)
print(f"Filtered data saved to {output_path}")


In [None]:
# Programming language name -> source file extension. Insertion order is
# preserved, so iterating the keys yields the languages in the order below.
code_formats = dict([
    ('C', '.c'),
    ('C++', '.cpp'),
    ('C#', '.cs'),
    ('Java', '.java'),
    ('Javascript', '.js'),
    ('PHP', '.php'),
    ('Python', '.py'),
])

In [None]:
# NOTE: this rebinds `input_path`, which an earlier cell set to the map_data
# directory; from here on it points at the filtered snippet directory.
input_path = 'code_snippets'

all_langs = code_formats.keys()

# The 'C' folder serves as the reference list of snippet files. os.listdir
# returns names in arbitrary order, so sort them to make the printed output
# reproducible between runs.
all_filenames = sorted(os.listdir(os.path.join(input_path, 'C')))

# For every snippet file, print the snippet of each language in turn
for file_name in all_filenames:

    for lang in all_langs:

        file_path = os.path.join(input_path, lang, file_name)
        if not os.path.isfile(file_path):
            # A snippet missing for one language should not abort the listing.
            print(f"Skipping missing file: {file_path}")
            continue
        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                entry = json.loads(line)
                print(lang, ':', entry['snippet'])

    # Empty 'Rust' slot printed after the known languages, then a spacer
    print('Rust : ')
    print('\n')