In [None]:
!pip install fairseq

In [None]:
# Apex can be installed to make pretraining and finetuning faster

In [None]:
%%writefile setup.sh
# Build and install NVIDIA Apex from source with its C++/CUDA extensions.
# Abort on the first failing command so that `pip install ./` can never run
# against the wrong directory when the clone or the `cd` fails.
set -e

# Re-running the notebook must not fail just because the checkout already exists.
if [ ! -d apex ]; then
  git clone https://github.com/NVIDIA/apex
fi
cd apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \
  --global-option="--deprecated_fused_adam" --global-option="--xentropy" \
  --global-option="--fast_multihead_attn" ./

In [None]:
# Run the setup.sh script written by the %%writefile cell (clones Apex and pip-installs it).
!sh setup.sh

In [None]:
# Masked-LM pretraining of a roberta_base model with fairseq-train.
# Adjust the dataset path and --save-dir for your environment.
#
# Flags are grouped per line: task/model, masking, batching, precision,
# optimizer, LR schedule, dropout, logging, checkpointing.

!fairseq-train drive/MyDrive/varRecovery-New/data-bin/pretrain-32bit-ideal \
    --task masked_lm --criterion masked_lm --arch roberta_base \
    --mask-prob 0.2 --random-token-prob 0.0 --leave-unmasked-prob 0.0 \
    --sample-break-mode none --tokens-per-sample 512 --max-tokens 512 --update-freq 32 \
    --fp16 \
    --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --weight-decay 0.01 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 0.0001 --warmup-updates 10000 \
    --total-num-update 305000 --max-update 20000 \
    --dropout 0.1 --attention-dropout 0.1 \
    --log-format json --log-interval 10 \
    --no-epoch-checkpoints \
    --save-dir drive/MyDrive/varRecovery-New/test_check/pretrain-32bit-ideal/

In [None]:
# Fine-tune the pretrained checkpoint as a 9-class sentence_prediction classifier.
# Adjust the dataset path, --restore-file and --save-dir for your environment.
# The --reset-* flags discard the optimizer, dataloader and meter state stored
# in the restored checkpoint.
# NOTE(review): checkpoints are written to SavedModels/, so point the inference
# cell at wherever the fine-tuned checkpoint_best.pt actually ends up.

!fairseq-train drive/MyDrive/varRecovery-New/data-bin/finetune-32bitideal-O3/ \
    --restore-file drive/MyDrive/varRecovery-New/test_check/pretrain-32bit-ideal/checkpoint_best.pt \
    --reset-optimizer --reset-dataloader --reset-meters \
    --task sentence_prediction --criterion sentence_prediction --arch roberta_base \
    --num-classes 9 --classification-head-name sentence_prediction \
    --max-positions 512 --batch-size 8 --required-batch-size-multiple 1 --update-freq 4 \
    --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 --weight-decay 0.1 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 1e-05 --warmup-updates 500 --max-epoch 15 \
    --dropout 0.1 --attention-dropout 0.1 \
    --find-unused-parameters \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --no-epoch-checkpoints --save-dir SavedModels/ \
    --log-format=json --log-interval 10

In [None]:
# Restore the fine-tuned classifier through fairseq's hub interface and put it
# into evaluation mode before running predictions.
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained(
  'drive/MyDrive/varRecovery-New/test_check/finetune-32bitideal-O0',
  'checkpoint_best.pt',
  data_name_or_path='drive/MyDrive/varRecovery-New/data-bin/finetune-32bitideal-O0',
  bpe=None,
)
roberta.eval()

In [None]:
# Load evaluation data
import json

# The JSON follows the output format of dataResolver, one entry per function:
# {
#   "func_name": {
#     "X": [<func disas>],
#     "Y": <size>
#   },
#   ...
# }
# Use a context manager so the file handle is closed deterministically, and
# read as UTF-8 explicitly instead of relying on the platform default.
with open('drive/MyDrive/varRecovery-New/data-src/SPEC/SPEC.json', encoding='utf-8') as spec_file:
  data = json.load(spec_file)

In [None]:
def labelFn(label):
  """Return the label string for a predicted class index.

  The index is shifted by the ``nspecial`` attribute of the task's label
  dictionary before being handed to that dictionary's ``string`` method.
  """
  shifted_index = label + roberta.task.label_dictionary.nspecial
  return roberta.task.label_dictionary.string([shifted_index])

In [None]:
# Entries whose recorded "Y" value exceeds this bound are skipped.
MAX_ACTUAL_VALUE = 2048

# Print "<predicted label> <actual value>" for every evaluated function.
for func, entry in data.items():
  # Only entries whose name contains "O0" are evaluated.
  if "O0" not in func:
    continue
  tokens = entry["X"]
  actualValue = entry["Y"]
  if actualValue > MAX_ACTUAL_VALUE:
    continue
  # add_if_not_exist=False: encode_line otherwise appends unseen tokens to the
  # model's dictionary, producing ids the trained embedding table does not cover;
  # with the flag off they map to the unknown-token id instead.
  # .long(): same cast fairseq's own hub `encode` applies to encode_line output.
  encoded_tokens = roberta.task.source_dictionary.encode_line(
    tokens, add_if_not_exist=False
  ).long()
  prediction = roberta.predict('sentence_prediction', encoded_tokens).argmax().item()
  print(labelFn(prediction), actualValue)