## Download Raw Programming-Language (PL) Data

In [None]:
from collections import defaultdict
from datasets import load_dataset

# Cap on how many code snippets are kept for any single language
max_datapoints = 50000

# Language labels to collect; a record is kept only when its "language"
# field matches one of these strings exactly
languages = [
    "Java",
    "C",
    "Python",
    "HTML",
    "PHP",
    "Markdown",
    "C++",
    "C#",
    "Ruby",
    "Rust",
    "GO",
    "JavaScript",
    "SQL",
]

# Per-language buckets of collected snippets (language -> list of code strings)
language_dict = defaultdict(list)

# Open the train split of the GitHub code dataset in streaming mode
ds = load_dataset(
    "codeparrot/github-code",
    split="train",
    cache_dir='../../data_cache/',
    streaming=True,
)

def all_languages_full(data, target_languages, max_limit: int) -> bool:
    """Return True when every target language holds at least ``max_limit`` items.

    Args:
        data: Mapping from language name to a sized collection of snippets.
        target_languages: Iterable of language names that must all be full.
        max_limit: Minimum number of items a language needs to count as full.

    Returns:
        True if every language in ``target_languages`` has ``max_limit`` or
        more items (vacuously True when ``target_languages`` is empty),
        otherwise False.
    """
    # .get() keeps this check read-only: indexing a defaultdict would insert an
    # empty entry for every language not seen yet, and indexing a plain dict
    # would raise KeyError. A missing language simply counts as zero items.
    return all(len(data.get(lang, ())) >= max_limit for lang in target_languages)

# Number of streamed records between progress reports / early-stop checks
step_interval = 100000

# Single pass over the stream, filling each target language's bucket up to the cap
for idx, item in enumerate(ds):
    # Records missing either field are ignored but still advance idx
    has_fields = "language" in item and "code" in item
    if has_fields and item["language"] in languages:
        bucket = language_dict[item["language"]]
        # Store the snippet only while this language is still below its cap
        if len(bucket) < max_datapoints:
            bucket.append(item["code"])

    # Everything below runs only once per step_interval records
    if (idx + 1) % step_interval != 0:
        continue

    # Debug: report how many snippets each language holds so far
    for language, code_list in language_dict.items():
        print(f"Language: {language}, Number of Snippets: {len(code_list)}")

    # Stop streaming once every target language has hit the cap
    if all_languages_full(language_dict, languages, max_datapoints):
        print(f"All target languages reached {max_datapoints} items. Stopping early.")
        break


## Download Raw English Wikipedia

In [None]:
from collections import defaultdict
from datasets import load_dataset

# Open the "20231101.en" configuration of the Wikipedia dataset in streaming mode
ds = load_dataset("wikimedia/wikipedia", "20231101.en", cache_dir='../../data_cache/', streaming=True, split="train")

# NOTE(review): this rebinds `language_dict`, so anything gathered under that
# name by an earlier cell is dropped unless it was saved beforehand.
language_dict = defaultdict(list)

# Maximum number of article texts to keep
max_datapoints = 50000

# Collect article texts under the single 'Wiki' key.
# Pairing the stream with range() ends the loop after exactly max_datapoints
# records: zip() asks range() first and stops as soon as it is exhausted, so
# no extra record is pulled from the stream just to discover the cap was hit.
wiki_texts = language_dict['Wiki']
for _, item in zip(range(max_datapoints), ds):
    wiki_texts.append(item["text"])

In [None]:
import pickle

def save_dict(data, filename="dict_of_lists.pkl"):
    """Pickle ``data`` to ``filename`` and print a confirmation message.

    Args:
        data: Object to serialize; must be picklable.
        filename: Destination path. The file is opened in "wb" mode, so an
            existing file at that path is overwritten.
    """
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(data)
    print(f"Dictionary saved to {filename}")


def load_dict(filename="dict_of_lists.pkl"):
    """Unpickle and return the object stored in ``filename``.

    Prints a confirmation message after a successful load.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object that was stored in the file.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files this project wrote itself.
    """
    with open(filename, mode="rb") as source:
        restored = pickle.Unpickler(source).load()
    print(f"Dictionary loaded from {filename}")
    return restored

In [None]:
# Persist the current contents of language_dict using save_dict's default filename
save_dict(language_dict)

# Read the default pickle file back into loaded_dict
loaded_dict = load_dict()