In [None]:
# copyright: https://github.com/RUCAIBox/Language-Specific-Neurons/issues/2

In [None]:
import os

# Hugging Face access token, read from the HF_TOKEN environment variable so
# that no credential is stored in the notebook itself. The value is None when
# the variable is unset; the token is then simply not supplied to the hub call
# that receives it.
token = os.environ.get("HF_TOKEN")

In [None]:
# Hugging Face Hub repository id of the checkpoint; it is passed to
# AutoTokenizer.from_pretrained to select which tokenizer is loaded.
llama = "codellama/CodeLlama-7b-hf"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer belonging to the checkpoint named by `llama`.
# The access token is passed with `token=`, which replaces the deprecated
# `use_auth_token=` keyword of `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(llama, token=token)
# Disabled: would reuse the end-of-sequence token as the padding token.
# tokenizer.pad_token =  tokenizer.eos_token

In [None]:
import pickle

def load_dict(filename="dict_of_lists.pkl"):
    """Read one pickled object from ``filename`` and return it.

    Args:
        filename (str): Path of the pickle file to open in binary mode.

    Returns:
        The object that ``pickle.load`` deserializes from the file.

    Note:
        Unpickling can execute arbitrary code, so this must only be used on
        files from a trusted source.
    """
    with open(filename, "rb") as pickle_stream:
        contents = pickle.load(pickle_stream)

    # Confirmation message so the notebook output shows which file was read.
    print(f"Dictionary loaded from {filename}")
    return contents

# Load the raw dataset. The tokenization loop later in this notebook iterates
# it with .items(), using each key as a language name and each value as the
# collection of code snippets for that language.
# NOTE(review): load_dict unpickles the file, and unpickling can execute
# arbitrary code -- only point this at a file from a trusted source.
loaded_dict = load_dict('../datasets/raw/dict_of_lists.pkl')

In [None]:
import torch
from tqdm import tqdm

def tokenize_and_save(data, tokenizer, output_file, max_length=4096):
    """
    Tokenizes a dataset of text and saves the concatenated token IDs to a file.

    Every entry is tokenized on its own, the resulting IDs are appended to one
    flat sequence in input order, and that sequence is written with
    ``torch.save`` as a one-dimensional ``torch.int32`` tensor.

    Args:
        data (iterable of str): The dataset to tokenize.
        tokenizer (Callable): Called as ``tokenizer(entry, truncation=True,
            max_length=max_length, return_tensors=None)``; the result must be
            indexable with ``'input_ids'`` and yield a flat list of ints.
        output_file (str): The file to save the token IDs to.
        max_length (int, optional): Maximum sequence length passed to the
            tokenizer for each entry. Default is 4096.

    Returns:
        int: The number of token IDs saved.
    """
    ids = []  # A flat list to hold all token IDs

    # Tokenization and concatenation
    for entry in tqdm(data, desc="Tokenizing data"):
        tokens = tokenizer(
            entry,
            truncation=True,
            max_length=max_length,  # Ensure no sequence exceeds this limit
            return_tensors=None  # Ask for plain Python lists, not tensors
        )['input_ids']  # Get the input IDs
        ids.extend(tokens)  # Append the token IDs to the flat list

    num_tokens = len(ids)

    # Convert to a torch tensor for efficient storage
    ids_tensor = torch.tensor(ids, dtype=torch.int32)

    # Save to a file
    torch.save(ids_tensor, output_file)

    print(f"Saved {num_tokens} tokens to '{output_file}'")

    # The docstring has always promised the count; actually return it.
    return num_tokens


In [None]:
import os
from tqdm import tqdm

# Directory that receives one token file per language. Kept in a single
# constant so the directory that is created and the directory that is written
# to cannot drift apart.
OUTPUT_DIR = "./data"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create the directory if it doesn't exist

# Iterate over each language and tokenize its data
for lang, code_snippets in tqdm(loaded_dict.items()):
    # Save each language's tokens to a separate file named after the
    # lower-cased language key.
    output_file = f"{OUTPUT_DIR}/id.{lang.lower()}.train.llama"
    print(f"Processing language: {lang}")
    tokenize_and_save(code_snippets, tokenizer, output_file)
