In [None]:
%pip install --upgrade transformers
%pip install wandb

In [None]:
%pip install evaluate

In [None]:
%pip install datasets==3.6.0
%pip install codebleu
%pip install tree-sitter-python==0.21
%pip install accelerate peft bitsandbytes trl==0.12.0 

**Prepare Data**

In [None]:
# Report which `datasets` release the kernel actually imports.
from datasets import __version__ as datasets_version, load_dataset

print("check version")
print(f"datasets version: {datasets_version}")

In [None]:
from datasets import load_dataset

# Training data: the "train/small" split set for six languages.
# `split_set` and `languages` are extra keyword arguments handed to
# load_dataset — presumably consumed by the dataset's own loading script;
# confirm against the dataset card.
# trust_remote_code=True opts in to running code shipped with the dataset repo.
thevault = load_dataset(
    "Fsoft-AIC/the-vault-function",
    split_set=["train/small"],
    languages=["python", "java", "c", "rust", "ruby", "go"],
    trust_remote_code=True,
)

In [None]:
# Held-out data: the "test" split set for the same six languages as the
# training load. trust_remote_code=True opts in to running code shipped
# with the dataset repo.
thevault_test = load_dataset(
    "Fsoft-AIC/the-vault-function",
    split_set=["test"],
    languages=["python", "java", "c", "rust", "ruby", "go"],
    trust_remote_code=True,
)

In [None]:
import re
def contains_url(text):
    """Return True when `text` contains something that looks like a web address.

    A hit starts with ``http://``, ``https://`` or ``www.`` (matched
    case-insensitively) and must be followed by at least two more characters,
    the first of which is not whitespace or one of ``/ $ . ? #``.
    """
    hit = re.search(r'(https?://|www\.)[^\s/$.?#].[^\s]*', text, flags=re.IGNORECASE)
    return hit is not None
def filtering_rules(example):
    """Decide whether a docstring/code pair should be kept.

    Args:
        example: mapping with "docstring" and "code" entries.

    Returns:
        True when the pair passes every rule below, False otherwise.
        A missing (None) docstring or code is treated as empty text and
        therefore rejected by the length rule.
    """
    doc = (example["docstring"] or "").strip()
    code = (example["code"] or "").strip()

    # 1. Valid length: 30-300 characters of docstring, 30-600 of code.
    if not (30 <= len(doc) <= 300 and 30 <= len(code) <= 600):
        return False
    # 2. The docstring must not contain a URL.
    if contains_url(doc):
        return False
    # 3. Not a method inside a class: reject code that uses `self`.
    #    Match `self` as a whole word only, so that text such as "itself"
    #    or "myself" in a comment or string does not discard the sample.
    if re.search(r"\bself\b", code):
        return False
    return True

In [None]:
def normalize(ex):
    """Turn one raw record into an input/output/language triple.

    The stripped docstring becomes "input" and the stripped code becomes
    "output". The language tag is carried over for continual learning and
    falls back to "unknown" when the record has no "language" entry.
    """
    prompt = ex["docstring"].strip()
    solution = ex["code"].strip()
    lang = ex.get("language", "unknown")
    return {"input": prompt, "output": solution, "language": lang}

# Metadata columns dropped from both splits. "docstring", "code" and
# "language" are not listed here, so they remain next to the new
# "input"/"output" fields produced by normalize.
VAULT_UNUSED_COLUMNS = [
    "hexsha", "repo", "path", "license",
    "identifier", "return_type", "original_string", "original_docstring",
    "docstring_tokens", "code_tokens", "short_docstring",
    "short_docstring_tokens", "comment", "parameters", "docstring_params",
]

vault = thevault.map(normalize, remove_columns=VAULT_UNUSED_COLUMNS)
vault_test = thevault_test.map(normalize, remove_columns=VAULT_UNUSED_COLUMNS)


def _vault_splits(ds):
    """Return the datasets held by `ds` as a list.

    A dict of splits (DatasetDict is a dict subclass) yields its values;
    anything else is treated as a single dataset.
    """
    return list(ds.values()) if isinstance(ds, dict) else [ds]


def _vault_num_examples(ds):
    """Total number of rows across every split of `ds`."""
    return sum(len(split) for split in _vault_splits(ds))


def _vault_languages(ds):
    """Set of distinct "language" values across every split of `ds`."""
    found = set()
    for split in _vault_splits(ds):
        found.update(split.unique("language"))
    return found


# Summaries go through the helpers above so they are correct whether the
# mapped object is a single dataset or a dict of splits; on a dict of splits,
# len() would count splits and ["language"] would be looked up as a split name.
print(f"Vault train: {_vault_num_examples(vault)} examples")
print(f"Vault test: {_vault_num_examples(vault_test)} examples")
print(f"Available languages in vault: {_vault_languages(vault)}")
print(f"Available languages in vault_test: {_vault_languages(vault_test)}")

In [None]:
# Show the mapped training object's repr as this cell's output.
vault

In [None]:
from datasets import DatasetDict
def extract_dataset(ds):
    """Unwrap a DatasetDict into a single dataset.

    Anything that is not a DatasetDict is handed back untouched. For a
    DatasetDict, the split stored under its first key is returned; any other
    splits are ignored.
    """
    if not isinstance(ds, DatasetDict):
        return ds
    split_names = list(ds.keys())
    return ds[split_names[0]]

# Reduce both mapped objects to single datasets; extract_dataset returns the
# first split when it is given a DatasetDict and the object itself otherwise.
train_ds = extract_dataset(vault)
test_ds  = extract_dataset(vault_test)

In [None]:
# Spot-check a single mapped training record. The index is hard-coded, so this
# cell fails with an out-of-range error if train_ds has 80010 rows or fewer.
train_ds[80010]


**Prepare Model**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
# processed_data pads to a fixed length and masks pad positions in the labels,
# so a pad token must exist; reuse the EOS token when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def processed_data(examples):
    """Tokenize a batch of input/output text pairs for seq2seq training.

    Args:
        examples: batch mapping with "input" and "output" lists of strings
            and, optionally, a "language" list.

    Returns:
        The tokenizer output for the inputs, extended with "labels" (target
        token ids where every pad id is replaced by -100) and, when present
        in the batch, the untouched "language" values.
    """
    sources = examples["input"]
    targets = examples["output"]

    # Both sides are truncated and padded to the same fixed length.
    encode_opts = dict(max_length=256, truncation=True, padding="max_length")

    batch = tokenizer(sources, **encode_opts)
    target_ids = tokenizer(targets, **encode_opts).input_ids

    # Swap pad ids for -100 so padded positions are excluded from the loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for row in target_ids:
        masked_labels.append([tok if tok != pad_id else -100 for tok in row])
    batch["labels"] = masked_labels

    # Carry the language tag through for continual learning.
    if "language" in examples:
        batch["language"] = examples["language"]

    return batch

print("Tokenizer ready!")

In [None]:
# Tokenize datasets
print("Tokenizing datasets...")

# Both splits go through the same batched tokenization. Each call drops the
# split's pre-tokenization columns, leaving what processed_data returns.
batched_opts = {"batched": True}

tokenized_train = train_ds.map(
    processed_data, remove_columns=train_ds.column_names, **batched_opts
)
tokenized_test = test_ds.map(
    processed_data, remove_columns=test_ds.column_names, **batched_opts
)

print(f"Tokenized - Train: {len(tokenized_train)}, Test: {len(tokenized_test)}")
print(f"Columns: {tokenized_train.column_names}")

In [None]:
# Print the column names of both tokenized splits side by side for comparison.
print("Train columns:", tokenized_train.column_names)
print("Test  columns:", tokenized_test.column_names)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the base model.
# NOTE(review): MODEL_NAME is not referenced in this cell; the weights come
# from the plain "t5-small" checkpoint. Presumably MODEL_NAME is the target
# repo id for the trained adapter — confirm against the later cells.
MODEL_NAME = "SailorDad/t5-small-lora-seq2seq"
base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Freeze every base-model weight so that only the LoRA adapters added below
# are trainable.
for param in base_model.parameters():
    param.requires_grad = False

# Create LoRA config
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,  # Important: set to False for training
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # Attention projections (q, k, v, o) plus the feed-forward layers.
    # "t5-small" uses the non-gated feed-forward block, whose input projection
    # is named "wi"; "wi_0"/"wi_1" only exist in gated variants (T5 v1.1 style)
    # and are kept so the same config also covers those checkpoints.
    target_modules=["q", "k", "v", "o", "wi", "wi_0", "wi_1", "wo"],
)

# Create PEFT model and report how many parameters will be trained.
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

# Enable training mode. The trailing semicolon keeps the notebook from
# printing the full module repr that train() returns.
peft_model.train();

**Task Routing and Embedding System**

This section implements an elastic LoRA pool with task-based routing for code-generation tasks.