In [None]:
!pip install -U transformers datasets accelerate bitsandbytes peft

Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metada

In [None]:
import os,json, math, time, random, subprocess, tempfile
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
from google.colab import drive

# Mount Google Drive so that files written below WORK_DIR outlive the Colab
# runtime. The first call walks through the OAuth prompt.
drive.mount('/content/drive')

# Working folder on Drive. The value ends with a slash, so build paths with
# os.path.join(WORK_DIR, name) rather than f"{WORK_DIR}/name".
WORK_DIR = "/content/drive/MyDrive/"
# Create the folder from the constant itself (instead of a second hard-coded
# shell path) so the two cannot drift apart; a no-op when it already exists.
os.makedirs(WORK_DIR, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Cell 3: config
import os

# Dataset file, base checkpoint and the folder that receives checkpoints/model files.
DATA_JSON = "converted_dataset.json"              # or f"{WORK_DIR}/data.json"
PRETRAINED = "google/bigbird-pegasus-large-bigpatent"
OUTPUT_DIR = "./bigbird_obf_output"  # or f"{WORK_DIR}/bigbird_obf_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Token budgets used when tokenizing sources and targets.
max_source_length, max_target_length = 2048, 1024

# Optimisation settings: one example per device per step, with gradients
# accumulated over 8 steps.
per_device_train_batch_size = per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
num_train_epochs, learning_rate = 2, 3e-5
USE_LORA = True

# Shard and generation parameters for the long-input inference helpers.
shard_max_tokens, shard_overlap_tokens = 1800, 200
num_beams = 6

In [None]:
# Cell 4: imports and helpers
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

def print_gpu():
    """Print the name, total memory and used memory of each GPU reported by nvidia-smi.

    Runs `nvidia-smi` through `subprocess` instead of IPython's `!` shell
    syntax, so the helper is plain Python. If the tool is missing or cannot be
    started, a short message is printed instead of raising.

    Returns:
        None. The report (or the fallback message) is written to stdout.
    """
    cmd = [
        "nvidia-smi",
        "--query-gpu=name,memory.total,memory.used",
        "--format=csv,nounits",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except OSError as exc:
        # Typically FileNotFoundError on machines without the NVIDIA driver/tooling.
        print(f"nvidia-smi is not available: {exc}")
        return
    # On failure nvidia-smi explains itself on stderr; show whichever stream has content.
    print(proc.stdout.strip() or proc.stderr.strip())


In [None]:
# Cell 5: load and preview
# The JSON file is expected to hold a list of records with "input" and
# "output" string fields (the preview below reads exactly those keys).
with open(DATA_JSON, "r", encoding="utf-8") as f:
    raw = json.load(f)
print("Records:", len(raw))
# Preview at most two records; slicing keeps this safe when the file holds fewer.
for i, record in enumerate(raw[:2]):
    print("----- SAMPLE", i)
    print("INPUT (first 400 chars):", record["input"][:400])
    print("OUTPUT (first 400 chars):", record["output"][:400])


Records: 39162
----- SAMPLE 0
INPUT (first 400 chars): function mutate(translate) {
      for (var i = 0; i < TRANSFORMERS.length; i++) {
        var transformer = TRANSFORMERS[i];

        if (isFunction(transformer) && isFunction(transformer().modify)) {
          translate = transformer(Glide, Components, Events).modify(translate);
        } else {
          warn('Transformer should be a function that returns an object with `modify()` method');
   
OUTPUT (first 400 chars): (function (_0x410290, _0x120a7b) {
    var _0x9c56c1 = a0_0x2ca7, _0x123f45 = _0x410290();
    while (!![]) {
        try {
            var _0x1a367a = -parseInt(_0x9c56c1(0x1dd)) / 0x1 * (parseInt(_0x9c56c1(0x1df)) / 0x2) + -parseInt(_0x9c56c1(0x1dc)) / 0x3 + -parseInt(_0x9c56c1(0x1e2)) / 0x4 * (-parseInt(_0x9c56c1(0x1da)) / 0x5) + parseInt(_0x9c56c1(0x1d9)) / 0x6 + parseInt(_0x9c56c1(0x1db)) / 0
----- SAMPLE 1
INPUT (first 400 chars): function mutate(translate) {
      for (var i = 0; i < TRANSFORMERS.length; i

In [None]:
# Cell 6: HF dataset + split
# Wrap the loaded records in a Dataset and carve off a seeded, shuffled 10%
# hold-out in one chained call. `ds` keeps both raw splits for later cells.
ds = Dataset.from_list(raw).train_test_split(test_size=0.1, shuffle=True, seed=42)
train_ds, eval_ds = ds["train"], ds["test"]
print(len(train_ds), len(eval_ds))


35245 3917


In [None]:
# Cell 7: tokenizer + model + LoRA
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED, use_fast=True)
# Padding is required by the preprocessing step; add a pad token only if the
# checkpoint's tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"<pad>"})

model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED)
# Keep the embedding matrix in sync with the tokenizer (covers the added pad token).
model.resize_token_embeddings(len(tokenizer))

if USE_LORA:
    # NOTE(review): the model above is loaded without any quantization config,
    # so confirm this k-bit preparation step is actually wanted here.
    try:
        model = prepare_model_for_kbit_training(model)
    except Exception as exc:
        # Still best-effort, but no longer silent: a failure here changes how
        # the model is trained, so make it visible in the cell output.
        print(f"WARNING: prepare_model_for_kbit_training failed ({exc!r}); continuing with the unprepared model.")
    # Low-rank adapters on the modules named q_proj and v_proj only.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_2_SEQ_LM",
    )
    model = get_peft_model(model, lora_config)
print("Model loaded. Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

Model loaded. Trainable params: 2097152


In [None]:
# Cell 8: tokenize datasets
def preprocess(examples):
    """Tokenize a batch of examples for seq2seq training.

    Intended for `Dataset.map(..., batched=True)`: `examples["input"]` and
    `examples["output"]` are lists of strings.

    Sources are truncated/padded to `max_source_length`, and targets to
    `max_target_length`. Padding positions in the labels are replaced by -100,
    the index ignored by the loss, so the model is not trained to predict
    padding.

    Returns:
        The tokenizer output for the sources with an extra "labels" entry
        (a list of token-id lists, one per example).
    """
    inputs = examples["input"]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding="max_length")
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")
    pad_id = tokenizer.pad_token_id
    # Mask padded label positions so they do not contribute to the loss.
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Always tokenize from the untouched raw splits kept in `ds`, so this cell can
# be re-run safely: mapping the already-tokenized `train_ds`/`eval_ds` again
# would fail because their "input"/"output" columns have been removed.
_raw_columns = ["input", "output"]
train_ds = ds["train"].map(preprocess, batched=True, remove_columns=_raw_columns)
eval_ds = ds["test"].map(preprocess, batched=True, remove_columns=_raw_columns)
# Return torch tensors when the datasets are indexed.
train_ds.set_format(type="torch")
eval_ds.set_format(type="torch")


Map:   0%|          | 0/35245 [00:00<?, ? examples/s]



Map:   0%|          | 0/3917 [00:00<?, ? examples/s]

In [None]:
# Cell 9: trainer setup
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    eval_strategy="epoch",
    learning_rate=learning_rate,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    save_total_limit=3,
    logging_steps=100,
    save_strategy="epoch",
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # `processing_class` supersedes the deprecated `tokenizer=` argument, which
    # triggers a FutureWarning on current transformers releases.
    processing_class=tokenizer,
    data_collator=data_collator
)
print("Trainer ready.")


  trainer = Seq2SeqTrainer(


Trainer ready.


In [None]:
# Cell 10: training
# Run the fine-tuning loop, then write the final model and tokenizer files
# next to the checkpoints in OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Training complete; saved to {OUTPUT_DIR}")
# Checkpoints survive a runtime disconnect only if OUTPUT_DIR points at Drive.
# To continue an interrupted run: trainer.train(resume_from_checkpoint=CHECKPOINT_DIR).

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33medratabaku[0m ([33medratabaku-srh-university-of-applied-sciences-north-rhin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,3.2766,3.013855


Epoch,Training Loss,Validation Loss
1,3.2766,3.013855
2,3.2931,3.042805


Training complete; saved to ./bigbird_obf_output


In [None]:
# Cell 11: sharding & stitching functions (token-based)
def shard_text_to_token_chunks(text, max_tokens=shard_max_tokens, overlap=shard_overlap_tokens):
    """Split `text` into overlapping windows of token ids.

    The text is encoded without special tokens and cut into windows of at most
    `max_tokens` ids. Each window starts `max_tokens - overlap` ids after the
    previous one, so consecutive windows share `overlap` ids.

    Args:
        text: Source string to shard.
        max_tokens: Maximum number of token ids per chunk; must be positive.
        overlap: Number of ids shared by neighbouring chunks; must satisfy
            0 <= overlap < max_tokens.

    Returns:
        A list of token-id lists. Text that encodes to no tokens yields a
        single chunk containing only the pad token id.

    Raises:
        ValueError: If `max_tokens` is not positive, or `overlap` is negative
            or not smaller than `max_tokens`. Previously overlap >= max_tokens
            looped forever and a negative overlap silently skipped tokens.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    tok_ids = tokenizer.encode(text, add_special_tokens=False)
    if not tok_ids:
        return [[tokenizer.pad_token_id]]

    total = len(tok_ids)
    stride = max_tokens - overlap  # > 0 thanks to the validation above
    chunks = []
    for start in range(0, total, stride):
        end = min(start + max_tokens, total)
        chunks.append(tok_ids[start:end])
        if end == total:
            # The last window reached the end; a further window would only
            # repeat the tail.
            break
    return chunks

def merge_strings_with_overlap(a,b,min_overlap_chars=12,max_overlap_chars=800):
    """Concatenate `a` and `b`, dropping the prefix of `b` that repeats the end of `a`.

    Looks for the longest k, with min_overlap_chars <= k <=
    min(max_overlap_chars, len(a), len(b)), such that `a` ends with the first
    k characters of `b`. When such a k exists the duplicate is emitted once;
    otherwise the two strings are joined unchanged.
    """
    longest_candidate = min(max_overlap_chars, len(a), len(b))
    # Scan from the longest candidate down, so the first hit is the maximal overlap.
    candidate_lengths = range(longest_candidate, min_overlap_chars - 1, -1)
    overlap = next((k for k in candidate_lengths if a.endswith(b[:k])), 0)
    if overlap > 0:
        return a + b[overlap:]
    return a + b

def stitch_decoded_chunks(decoded_chunks):
    """Fold a sequence of decoded chunk strings into one string.

    Chunks are merged left to right with `merge_strings_with_overlap`, so text
    repeated at a chunk boundary is emitted once. An empty sequence gives "".
    """
    if not decoded_chunks:
        return ""
    remaining = iter(decoded_chunks)
    stitched = next(remaining)
    for piece in remaining:
        stitched = merge_strings_with_overlap(stitched, piece)
    return stitched


In [None]:
# Cell 12: batched generation
@torch.no_grad()
def generate_for_chunks(chunks_token_ids, batch_size=4, num_beams=num_beams, max_target_len=max_target_length):
    """Generate output text for each token-id chunk, `batch_size` chunks at a time.

    Each chunk is decoded back to text and re-tokenized as a padded batch
    (truncated to `max_source_length`), then passed to beam search.

    The model is put in eval mode for the duration of the call so dropout
    (including the LoRA dropout) is disabled during generation. Its previous
    train/eval mode is restored afterwards.

    Args:
        chunks_token_ids: List of token-id lists, one per chunk.
        batch_size: Number of chunks generated per forward batch.
        num_beams: Beam width passed to `model.generate`.
        max_target_len: `max_length` passed to `model.generate`.

    Returns:
        A list of decoded strings, one per input chunk, in input order.
    """
    decoded = []
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        for i in range(0, len(chunks_token_ids), batch_size):
            batch = chunks_token_ids[i:i+batch_size]
            # Round-trip through text so the tokenizer builds the padded batch
            # and attention mask.
            texts = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for ids in batch]
            enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_source_length).to(device)
            gen_ids = model.generate(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                max_length=max_target_len,
                num_beams=num_beams,
                length_penalty=1.0,
                early_stopping=True,
                no_repeat_ngram_size=3,
                use_cache=True,
            )
            decoded.extend(tokenizer.batch_decode(gen_ids, skip_special_tokens=True))
    finally:
        # Leave the model in the mode the caller had it in.
        if was_training:
            model.train()
    return decoded


In [None]:
# Cell 13: full inference helper
def obfuscate_long_code(code, gen_batch_size=4, num_beams_local=num_beams):
    """Run the shard -> generate -> stitch pipeline on one source string.

    Args:
        code: Source text to transform.
        gen_batch_size: Number of chunks generated per batch.
        num_beams_local: Beam width used for generation.

    Returns:
        The stitched model output for the whole input.
    """
    return stitch_decoded_chunks(
        generate_for_chunks(
            shard_text_to_token_chunks(code, max_tokens=shard_max_tokens, overlap=shard_overlap_tokens),
            batch_size=gen_batch_size,
            num_beams=num_beams_local,
        )
    )


In [None]:
import torch
import json

# Generate a prediction for every record of the raw held-out split and store
# (input, prediction, reference) triples as JSON.
results = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # disable dropout for generation

try:
    for item in ds['test']:
        input_code = item["input"]
        expected_code = item["output"]

        # Tokenize exactly as in training (same truncation/padding length), so
        # inference sees inputs shaped like the ones the model was tuned on.
        inputs = tokenizer(
            input_code,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_source_length
        ).to(device)  # Move to same device as the model

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_length=max_target_length,  # targets were truncated to this length in training
                num_beams=5,        # beam search
                early_stopping=True
            )

        generated_code = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        results.append({
            "input": input_code,
            "predicted_output": generated_code,
            "expected_output": expected_code
        })
finally:
    # Save whatever has been generated so far, even when the loop is
    # interrupted (e.g. KeyboardInterrupt), so partial work is not lost.
    with open("obfuscation_test_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print("Saved test results to obfuscation_test_results.json")


KeyboardInterrupt: 

In [None]:
# Show a bounded preview instead of dumping every record (each holds full
# source files) into the cell output.
print("Generated results:", len(results))
for i, sample in enumerate(results[:2]):
    print("----- RESULT", i)
    for field in ("input", "predicted_output", "expected_output"):
        print(f"{field} (first 400 chars):", sample[field][:400])

[{'input': 'function fixOnBandTicksCoords(axis, ticksCoords, tickCategoryInterval, alignWithLabel, clamp) {\n    var ticksLen = ticksCoords.length;\n\n    if (!axis.onBand || alignWithLabel || !ticksLen) {\n        return;\n    }\n\n    var axisExtent = axis.getExtent();\n    var last;\n    if (ticksLen === 1) {\n        ticksCoords[0].coord = axisExtent[0];\n        last = ticksCoords[1] = {coord: axisExtent[0]};\n    }\n    else {\n        var shift = (ticksCoords[1].coord - ticksCoords[0].coord);\n        each(ticksCoords, function (ticksItem) {\n            ticksItem.coord -= shift / 2;\n            var tickCategoryInterval = tickCategoryInterval || 0;\n            // Avoid split a single data item when odd interval.\n            if (tickCategoryInterval % 2 > 0) {\n                ticksItem.coord -= shift / ((tickCategoryInterval + 1) * 2);\n            }\n        });\n        last = {coord: ticksCoords[ticksLen - 1].coord + shift};\n        ticksCoords.push(last);\n    }\n\n    v