# Config

In [92]:
# Hub checkpoint name used to load the pretrained tokenizer and model.
MODEL_TYPE = "facebook/bart-base"
# Directory the fine-tuned model is written to (used as the trainer's output_dir).
SAVE_MODEL = "BART_2"
# Directory of the fine-tuned model meant for the testing section
# (presumably kept separate so a different run can be tested — confirm).
TEST_MODEL = "BART_2"

# Data

In [93]:
import datasets

In [94]:
# Ask for the "train" split directly instead of indexing the returned DatasetDict.
dataset = datasets.load_dataset("evgenesh/java-obfuscation", split="train")
dataset

Found cached dataset json (/Users/eshevlyakov/.cache/huggingface/datasets/evgenesh___json/evgenesh--java-obfuscation-4846253e475242bb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'decompiled'],
    num_rows: 405
})

# Tokenization

In [95]:
from transformers import BartTokenizer

In [96]:
# Load the tokenizer that belongs to the MODEL_TYPE checkpoint.
tokenizer = BartTokenizer.from_pretrained(MODEL_TYPE)

loading file vocab.json from cache at /Users/eshevlyakov/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/vocab.json
loading file merges.txt from cache at /Users/eshevlyakov/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/eshevlyakov/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "clas

# Data Collator

In [97]:
from transformers import DataCollatorForSeq2Seq

In [98]:
# Pads inputs and labels per batch. The collator's optional `model` argument
# expects a model *instance*; the checkpoint name string that used to be passed
# here is not one, so it is omitted (the model itself is only created later).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Evaluate

In [99]:
import evaluate
import numpy as np
from transformers import EvalPrediction

In [100]:
# sacreBLEU metric; compute_metrics uses it to score predictions against references.
metric = evaluate.load("sacrebleu")

In [101]:
def compute_metrics(eval_preds: EvalPrediction) -> dict|None:
    """Score one evaluation pass with corpus BLEU and mean generated length.

    Args:
        eval_preds: Evaluation output holding ``predictions`` (generated token
            ids, possibly wrapped in a tuple) and ``label_ids`` (reference
            token ids).

    Returns:
        ``{"bleu": ..., "gen_len": ...}`` with both values rounded to 4 places.
        ``gen_len`` is the mean number of non-padding tokens per prediction.
    """
    pad_id = tokenizer.pad_token_id

    # -100 marks label positions the loss ignores. It is not a vocabulary id,
    # so it must be replaced by the pad token before the ids can be decoded.
    label_ids = eval_preds.label_ids
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    refs = [[ref.strip()] for ref in refs]  # sacrebleu wants a list of references per sample

    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    # Predictions may carry -100 padding as well (e.g. when batches of different
    # lengths are concatenated by the trainer), so give them the same treatment.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    # skip_special_tokens keeps <pad>/<s>/</s> markers out of the BLEU input.
    preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    # Metric bleu
    bleu = metric.compute(references=refs, predictions=preds)
    result = {"bleu": bleu["score"]}

    # Metric gen_len: count tokens on the id arrays, not on the decoded strings.
    prediction_lens = [np.count_nonzero(ids != pad_id) for ids in pred_ids]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(float(v), 4) for k, v in result.items()}

# Train the model

In [102]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, PreTrainedTokenizer

Prepare the data

In [103]:
def create_inputs(dataset: datasets.Dataset, tokenizer: PreTrainedTokenizer, test_size=0.2, seed: int | None = 42):
    """Split the dataset and tokenize both parts for seq2seq training.

    The ``decompiled`` column becomes the model input and the ``source`` column
    becomes the target (labels).

    Args:
        dataset: Dataset with ``decompiled`` and ``source`` text columns.
        tokenizer: Tokenizer applied to both columns.
        test_size: Share (or absolute size) of the evaluation split.
        seed: Seed for the shuffle behind the split, so reruns evaluate on the
            same examples. Pass ``None`` for a non-deterministic split.

    Returns:
        ``(tokenized_train_dataset, tokenized_test_dataset)``.
    """
    splits = dataset.train_test_split(test_size=test_size, seed=seed)
    train_dataset, test_dataset = splits["train"], splits["test"]
    print(train_dataset)

    def tokenize(batch):
        # No padding here: sequences are padded per batch by the data collator.
        # Padding to max_length at this point would fill the labels with the pad
        # token id instead of -100, so the loss would also be computed on padding.
        return tokenizer(
            text=batch["decompiled"],
            text_target=batch["source"],
            truncation=True,
        )

    tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
    tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

    return tokenized_train_dataset, tokenized_test_dataset

In [104]:
# Split and tokenize using create_inputs' default test_size.
train_dataset, test_dataset = create_inputs(dataset, tokenizer)

Loading cached split indices for dataset at /Users/eshevlyakov/.cache/huggingface/datasets/evgenesh___json/evgenesh--java-obfuscation-4846253e475242bb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-093e8dc553d396d3.arrow and /Users/eshevlyakov/.cache/huggingface/datasets/evgenesh___json/evgenesh--java-obfuscation-4846253e475242bb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-c38d3a85caa176fa.arrow


Dataset({
    features: ['source', 'decompiled'],
    num_rows: 324
})


Loading cached processed dataset at /Users/eshevlyakov/.cache/huggingface/datasets/evgenesh___json/evgenesh--java-obfuscation-4846253e475242bb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-75735789b34829ff.arrow


Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Model

In [105]:
# Load the pretrained seq2seq checkpoint named by MODEL_TYPE; this is the model that gets fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_TYPE)

loading configuration file config.json from cache at /Users/eshevlyakov/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL

Training args

In [106]:
# Ideas not tried yet: gradient_accumulation_steps, a non-default optimizer
# (the default is AdamW), auto_find_batch_size (requires `accelerate`),
# use_mps_device (Apple Silicon).
training_args = Seq2SeqTrainingArguments(
    output_dir=SAVE_MODEL,
    evaluation_strategy="epoch",
    num_train_epochs=8,  # try to increase
    learning_rate=6e-5,  # try to increase
    weight_decay=0.01,  # maybe useless
    save_total_limit=3,  # maybe useless
    predict_with_generate=True,
    # Without an explicit limit, evaluation generations fall back to the model
    # config's default length, which is far too short for whole methods and
    # distorts the BLEU / gen_len metrics.
    generation_max_length=256,
    log_level="info",
    logging_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Trainer

In [107]:
# Bundle the model, training arguments, collator, tokenized splits, tokenizer
# and the metric callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Run training

In [None]:
# Run fine-tuning and keep whatever trainer.train() returns.
result = trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 324
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 328
  Number of trainable parameters = 139420416


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 81
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 81
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Runnin

In [None]:
# No path is given, so the trainer's default location is used
# (presumably output_dir, i.e. SAVE_MODEL — confirm).
trainer.save_model()

# Testing

In [None]:
# Load the fine-tuned model and its tokenizer from the directory configured for
# testing (TEST_MODEL) rather than reusing the training output constant.
result_model = AutoModelForSeq2SeqLM.from_pretrained(TEST_MODEL)
result_tokenizer = BartTokenizer.from_pretrained(TEST_MODEL)

In [None]:
def deobfuscate_code(model, tokenizer, code, max_new_tokens=256):
    """Generate a deobfuscated version of ``code``, print it and return it.

    Args:
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the input and decode the output.
        code: Decompiled / obfuscated source text.
        max_new_tokens: Upper bound on the number of generated tokens. Without
            it, generation stops at the model's short default length and the
            output is cut off.

    Returns:
        The decoded text with special tokens removed.
    """
    print(f"Start to deobfuscate code:\n \033[94m{code}\033[0;0m")

    # Truncate so over-long inputs cannot exceed what the model accepts, and
    # keep the attention mask together with the ids by passing the whole encoding.
    encoded = tokenizer(code, return_tensors="pt", truncation=True)
    encoded = encoded.to(model.device)
    generated = model.generate(**encoded, max_new_tokens=max_new_tokens)
    deobfuscated_code = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"Deobfuscated:\n \033[94m{deobfuscated_code}\033[0;0m")
    return deobfuscated_code

In [None]:
# Take the sample from `dataset`; no variable named `data` exists in this notebook.
# The trailing ';' keeps the cell from echoing any return value.
deobfuscate_code(model=result_model, tokenizer=result_tokenizer, code=dataset[1]["decompiled"]);

In [None]:
# Baseline: the original MODEL_TYPE checkpoint and tokenizer, loaded without fine-tuning.
default_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_TYPE)
default_tokenizer = BartTokenizer.from_pretrained(MODEL_TYPE)

In [None]:
# Take the sample from `dataset`; no variable named `data` exists in this notebook.
# The trailing ';' keeps the cell from echoing any return value.
deobfuscate_code(model=default_model, tokenizer=default_tokenizer, code=dataset[1]["decompiled"]);