# Config

In [52]:
# Checkpoint identifier handed to the `from_pretrained(MODEL_TYPE)` calls in this notebook.
MODEL_TYPE = "facebook/bart-base"
# Directory name used as the trainer's `output_dir` (where the fine-tuned model is written).
SAVE_MODEL = "BART_2"
# Presumably the name of the saved model to load in the Testing section; kept equal to SAVE_MODEL here.
TEST_MODEL = "BART_2"

# Data

In [53]:
import datasets

In [54]:
# Request the "train" split directly instead of indexing the returned DatasetDict.
dataset = datasets.load_dataset("evgenesh/java-obfuscation", split="train")
dataset

Found cached dataset json (/Users/eshevlyakov/.cache/huggingface/datasets/evgenesh___json/evgenesh--java-obfuscation-4846253e475242bb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'decompiled'],
    num_rows: 405
})

# Tokenization

In [57]:
from transformers import BartTokenizer

In [58]:
# Tokenizer for the MODEL_TYPE checkpoint; reused by the collator, the metric function and the trainer.
tokenizer = BartTokenizer.from_pretrained(MODEL_TYPE)

# Data Collator

In [59]:
from transformers import DataCollatorForSeq2Seq

In [60]:
# Pads each batch dynamically; labels are padded with -100 so padding is ignored by the loss.
# The `model` argument expects a model *instance* (used only to pre-build decoder inputs);
# the original passed the checkpoint name string, which the collator silently ignores, so it
# is omitted here. The model derives decoder inputs from `labels` on its own.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Evaluate

In [61]:
import evaluate
import numpy as np
from transformers import EvalPrediction

In [62]:
# sacreBLEU scorer; compute_metrics reads the "score" entry of its result.
metric = evaluate.load("sacrebleu")

In [63]:
def compute_metrics(eval_preds: EvalPrediction) -> dict|None:
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: Predictions and label ids collected by the trainer. Both
            are arrays of token ids; positions holding -100 are padding.

    Returns:
        A dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 is the label padding value the loss ignores; it is not a real token
    # id, so it must be swapped for the pad token before decoding.
    label_ids = eval_preds.label_ids
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    # skip_special_tokens drops <s>, </s> and <pad>, which would otherwise be
    # part of the decoded text and distort the BLEU score.
    refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    refs = [[ref.strip()] for ref in refs]

    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    # Predictions gathered across batches can be padded with -100 as well.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    bleu = metric.compute(references=refs, predictions=preds)
    result = {"bleu": bleu["score"]}

    # Count tokens on the id arrays. The original counted on the decoded
    # strings, where `str != int` is always True, so every length came out as 1.
    prediction_lens = [np.count_nonzero(ids != pad_id) for ids in pred_ids]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train the model

In [64]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, PreTrainedTokenizer

Prepare the train/test data

In [65]:
def create_inputs(dataset: datasets.Dataset, tokenizer: PreTrainedTokenizer, test_size=0.2, seed=None):
    """Split `dataset` into train/test parts and tokenize both for seq2seq training.

    The "decompiled" column is tokenized as the model input and the "source"
    column as the target (`text_target`), which produces the `labels` field.

    Sequences are truncated but deliberately not padded here. Padding to
    "max_length" at this stage fills `labels` with the real pad token id, so
    padding positions count towards the loss, and every example is stretched to
    the full model length. Padding is left to the batch collator, which pads per
    batch and masks label padding with -100.

    Args:
        dataset: Dataset with "source" and "decompiled" columns.
        tokenizer: Tokenizer applied to both columns.
        test_size: Fraction (or count) of rows placed in the test split.
        seed: Optional seed for the shuffle before splitting; None keeps the
            previous non-deterministic behaviour.

    Returns:
        Tuple `(tokenized_train_dataset, tokenized_test_dataset)`.
    """
    split = dataset.train_test_split(test_size=test_size, seed=seed)
    train_dataset, test_dataset = split["train"], split["test"]
    print(train_dataset)

    def tokenize(example):
        return tokenizer(
            text=example["decompiled"],
            text_target=example["source"],
            truncation=True,
        )

    tokenized_train_dataset = train_dataset.map(tokenize)
    tokenized_test_dataset = test_dataset.map(tokenize)

    return tokenized_train_dataset, tokenized_test_dataset

In [66]:
# Build the tokenized train/test splits using create_inputs' default test_size of 0.2.
train_dataset, test_dataset = create_inputs(dataset, tokenizer)

Dataset({
    features: ['source', 'decompiled'],
    num_rows: 324
})


Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Model

In [67]:
# Load the pretrained MODEL_TYPE checkpoint; this is the model object handed to the trainer.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_TYPE)

Training args

In [71]:
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention.
    output_dir=SAVE_MODEL,
    save_total_limit=3,  # possibly unnecessary
    # Optimisation hyper-parameters.
    num_train_epochs=8,  # candidate for increasing
    learning_rate=6e-5,  # candidate for increasing
    weight_decay=0.01,  # possibly unnecessary
    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    log_level="info",
    # Evaluation runs generate(), so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # Ideas not tried yet:
    #   * gradient_accumulation_steps: number of update steps to accumulate
    #     gradients for before performing a backward/update pass.
    #   * a different optimizer (the default is AdamW).
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Trainer

In [72]:
# Wire the model, data, collator and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [74]:
# Evaluate on eval_dataset with the current weights. This cell sits before the training
# cell, so on a top-to-bottom run it reports the metrics of the not-yet-fine-tuned model.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 81
  Batch size = 16


AttributeError: 'NotebookTrainingTracker' object has no attribute 'value'

Run training

In [73]:
# Fine-tune the model; the value returned by train() is kept in `result`.
result = trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: source, decompiled. If source, decompiled are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 324
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63
  Number of trainable parameters = 139420416


KeyboardInterrupt: 

In [672]:
# Persist the model (and tokenizer files, per the log below). No path is given, so the
# trainer presumably writes to the `output_dir` from its training arguments.
trainer.save_model()

Saving model checkpoint to first_BART_model
Configuration saved in first_BART_model/config.json
Model weights saved in first_BART_model/pytorch_model.bin
tokenizer config file saved in first_BART_model/tokenizer_config.json
Special tokens file saved in first_BART_model/special_tokens_map.json


# Testing

In [30]:
# Load the saved model under test. TEST_MODEL is the config constant dedicated to this
# section; the original read SAVE_MODEL here and left TEST_MODEL unused.
result_model = AutoModelForSeq2SeqLM.from_pretrained(TEST_MODEL)
result_tokenizer = BartTokenizer.from_pretrained(TEST_MODEL)

In [38]:
def deobfuscate_code(model, tokenizer, code, max_length=512):
    """Run `code` through `model` and print the input and the generated output.

    Args:
        model: Object exposing `generate(input_ids, max_length=...)`.
        tokenizer: Object exposing `encode(...)` and `decode(...)`.
        code: Source text to deobfuscate.
        max_length: Upper bound on the generated sequence length. The original
            call set no bound and so relied on the library/checkpoint default
            (20 tokens in stock transformers unless the checkpoint overrides
            it), which cuts a whole class off after a few tokens.

    Returns:
        None; both texts are printed in blue using ANSI escape codes.
    """
    print(f"Start to deobfuscate code:\n \033[94m{code}\033[0;0m")

    # truncation=True keeps over-long inputs within the tokenizer's maximum
    # length, matching how the training data was tokenized.
    input_ids = tokenizer.encode(code, return_tensors="pt", truncation=True)
    output_ids = model.generate(input_ids, max_length=max_length)
    deobfuscated_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"Deobfuscated:\n \033[94m{deobfuscated_code}\033[0;0m")

In [39]:
# Use row 1 of the loaded `dataset`; the original read from `data`, which no cell defines.
deobfuscate_code(model=result_model, tokenizer=result_tokenizer, code=dataset[1]["decompiled"])

Start to deobfuscate code:
 [94mpublic final class Address {

    
    public String f3481d;

    
    public String f3479a = "Holmes, Sherlock";

    
    public String f3480b = "Baker";
    public String c = "London";

    
    public String f3482e = "123456";

    public final String getCity() {
        return this.c;
    }

    public final String getName() {
        return this.f3479a;
    }

    public final String getState() {
        return this.f3481d;
    }

    public final String getStreet() {
        return this.f3480b;
    }

    public final String getZip() {
        return this.f3482e;
    }

    public final void setCity(String str) {
        i.e("<set-?>", str);
        this.c = str;
    }

    public final void setName(String str) {
        i.e("<set-?>", str);
        this.f3479a = str;
    }

    public final void setState(String str) {
        this.f3481d = str;
    }

    public final void setStreet(String str) {
        i.e("<set-?>", str);
        this.f3480b 

In [35]:
# Stock MODEL_TYPE checkpoint and tokenizer, loaded separately from the fine-tuned pair
# so the same sample can be run through both models.
default_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_TYPE)
default_tokenizer = BartTokenizer.from_pretrained(MODEL_TYPE)

In [40]:
# Use row 1 of the loaded `dataset`; the original read from `data`, which no cell defines.
deobfuscate_code(model=default_model, tokenizer=default_tokenizer, code=dataset[1]["decompiled"])

Start to deobfuscate code:
 [94mpublic final class Address {

    
    public String f3481d;

    
    public String f3479a = "Holmes, Sherlock";

    
    public String f3480b = "Baker";
    public String c = "London";

    
    public String f3482e = "123456";

    public final String getCity() {
        return this.c;
    }

    public final String getName() {
        return this.f3479a;
    }

    public final String getState() {
        return this.f3481d;
    }

    public final String getStreet() {
        return this.f3480b;
    }

    public final String getZip() {
        return this.f3482e;
    }

    public final void setCity(String str) {
        i.e("<set-?>", str);
        this.c = str;
    }

    public final void setName(String str) {
        i.e("<set-?>", str);
        this.f3479a = str;
    }

    public final void setState(String str) {
        this.f3481d = str;
    }

    public final void setStreet(String str) {
        i.e("<set-?>", str);
        this.f3480b 