In [1]:
! pip install datasets transformers[torch] evaluate




[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Hugging Face Hub dataset id; in this notebook it is only referenced by the
# commented-out load_dataset call.
dataset_name = "cw1521/ember2018-malware"
# Pretrained checkpoint shared by the config, the model, and the tokenizer.
model_checkpoint = "allenai/longformer-base-4096"
# Passed to TrainingArguments as its first positional argument (the output directory).
model_name = "ma-ember-1"

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    LongformerForSequenceClassification,
    Trainer,
    TrainingArguments,
)


# dataset = load_dataset(
#     dataset_name,
#     split="train"
# )

  from .autonotebook import tqdm as notebook_tqdm


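Testing: load a small local sample of the data instead of the full Hub dataset (comment out this cell and enable the `load_dataset(dataset_name, ...)` call above to use the full dataset)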

In [4]:
# Forward slashes keep these relative paths valid on both Windows and POSIX.
data_files = [
    "../../ember2018/data/ember2018_train_1.jsonl",
    "../../ember2018/data/ember2018_test_1.jsonl",
]


# A plain list of files is loaded into a single "train" split; the
# train/validation split is created later with train_test_split.
dataset = load_dataset(
    "json",
    data_files=data_files
)

Found cached dataset json (C:/Users/school/.cache/huggingface/datasets/json/default-96c99c50bebd9cb9/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 64.49it/s]


Display the dataset

In [5]:
# Show the splits, column names, and row counts of the loaded data.
dataset

DatasetDict({
    train: Dataset({
        features: ['x', 'input', 'y', 'sha256', 'appeared', 'label', 'avclass', 'subset'],
        num_rows: 4000
    })
})

In [6]:
# Columns the classifier does not consume; only "input" and "label" are kept.
cols = ["subset", "sha256", "appeared", "x", "y", "avclass"]

dataset = dataset.remove_columns(column_names=cols)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'label'],
        num_rows: 4000
    })
})

In [7]:
# Inspect one raw example: "input" is a single string of space-separated numbers.
dataset["train"][0]

{'input': '0.014676122 0.0042218715 0.0039226813 0.004028752 0.004007151 0.0037750206 0.003824993 0.0038872168 0.0041528773 0.0038037144 0.003805004 0.0037988783 0.0038781895 0.0038949545 0.0037901734 0.0040403586 0.003862392 0.0037489058 0.003708928 0.003776955 0.003807261 0.004002315 0.0037975886 0.0037746981 0.003854654 0.0037469715 0.003823381 0.0037962992 0.0037775997 0.003709895 0.0038278946 0.0037982336 0.0038736758 0.0037688948 0.0037643812 0.003798556 0.0038108074 0.003823381 0.0038769 0.0037998455 0.0037821133 0.003689906 0.0037256929 0.0037643812 0.0038872168 0.0037514851 0.0037621243 0.0038375668 0.0038701296 0.0037853373 0.0038149985 0.0039749104 0.0037972664 0.003823381 0.0038062937 0.0037353649 0.0037962992 0.0038317635 0.0038246706 0.0038839928 0.0038266052 0.003893665 0.0037798565 0.0038807688 0.004232833 0.0038465941 0.0039014027 0.003933965 0.003854654 0.004070342 0.0038291842 0.0038056488 0.0038685175 0.0038266052 0.0037327856 0.003736977 0.0038472388 0.003875288 0.

In [8]:
# Hold out 20% of the rows for validation. The fixed seed makes the split
# identical on every run, so metrics are comparable between runs.
# NOTE: this rebinds `dataset`; re-running the cell without re-running the
# cells above would split the already-reduced train split again.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_ds = dataset["train"]
valid_ds = dataset["test"]

In [9]:


# NOTE(review): per the transformers documentation, num_labels=1 configures a
# single-output (regression, MSE loss) head. This notebook reports accuracy,
# which suggests classification was intended — confirm the values/dtype of the
# "label" column and consider num_labels=2.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=1
)
# Load the pretrained Longformer weights under a sequence-classification head
# built from the config above.
model = LongformerForSequenceClassification.from_pretrained(
    model_checkpoint,
    config=config
)
# model.config

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weigh

In [10]:
# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Set max_input_length, max_output_length, and batch_size

In [11]:
# Token length every example is padded/truncated to during tokenization.
max_input_length = 4096
# NOTE(review): not referenced by any cell visible in this notebook —
# possibly a leftover; confirm before removing.
max_output_length = 512
# Used both for Dataset.map batching and for the per-device train/eval batch sizes.
batch_size = 1

In [12]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of examples and attach the Longformer model inputs.

    Args:
        batch: Mapping of column name to list of values, as supplied by
            ``Dataset.map(..., batched=True)``. Reads the ``"input"`` (text)
            and ``"label"`` columns.

    Returns:
        The same ``batch`` mapping with ``input_ids``, ``attention_mask``,
        ``global_attention_mask`` and ``labels`` added. Every sequence is
        padded/truncated to ``max_input_length`` tokens.
    """
    inputs = tokenizer(
        batch["input"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token only. Each sample gets its own
    # independent mask row (no shared list objects), so a later in-place edit
    # of one row cannot leak into the others.
    batch["global_attention_mask"] = [
        [int(position == 0) for position in range(len(token_ids))]
        for token_ids in batch["input_ids"]
    ]

    # The Trainer expects the target column to be named "labels".
    batch["labels"] = batch["label"]

    return batch

Tokenize the train and validation splits, then convert them to torch format

In [13]:
# Tokenize the training split and drop the raw text/label columns.
train = train_ds.map(
    function=process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["input", "label"],
)

                                                               

In [14]:
# Tokenize the validation split and drop the raw text/label columns.
valid = valid_ds.map(
    function=process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["input", "label"],
)

                                                             

In [15]:
# Expose only the model inputs, as torch tensors, on both tokenized splits.
tensor_columns = ["input_ids", "attention_mask", "global_attention_mask", "labels"]
for tokenized_split in (train, valid):
    tokenized_split.set_format(type="torch", columns=tensor_columns)

Metrics

In [16]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the Trainer's evaluation predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` produced by the Trainer.
            ``predictions`` may be an array of per-class logits, an array with
            a single output per example, or a tuple whose first element is
            that array.

    Returns:
        The dict returned by the ``accuracy`` metric.
    """
    predictions, labels = eval_pred
    # When the model returns several outputs, the first one holds the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    if predictions.ndim > 1 and predictions.shape[-1] > 1:
        # One logit per class: pick the highest-scoring class.
        predicted = np.argmax(predictions, axis=-1)
    else:
        # Single output per example (e.g. num_labels=1): argmax would always
        # return 0, so threshold the score at the midpoint between the 0 and 1
        # targets instead.
        predicted = (predictions.reshape(-1) >= 0.5).astype(int)

    # The accuracy metric compares integer class ids.
    references = np.asarray(labels).reshape(-1).astype(int)
    return accuracy.compute(predictions=predicted, references=references)

Train the Model

In [17]:


training_args = TrainingArguments(
    model_name,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-4,
    weight_decay=0.001,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    save_steps=100,
    save_total_limit=3,
    # Gradients are accumulated over 4 forward/backward passes per optimizer
    # step; checkpointing trades compute for memory on the 4096-token inputs.
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    num_train_epochs=1
)

# Wire the model, the tokenized splits, and the accuracy metric into the Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=valid,
)

In [18]:
# Fine-tune, then persist the final model and the trainer state.
trainer.train()
trainer.save_model()
trainer.save_state()

  0%|          | 0/200 [00:00<?, ?it/s]You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/200 [00:26<1:28:42, 26.75s/it]

KeyboardInterrupt: 