In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!pip install pytorch-crf
!pip install transformers[torch]
!pip install accelerate -U
!pip install seqeval
!pip install datasets



In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf

import transformers
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import  TokenClassifierOutput
from torch import nn
from torch.nn import CrossEntropyLoss
import torch
from torchcrf import CRF
from transformers import TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback

from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [21]:
df = pd.read_csv('/content/drive/MyDrive/BERT Variations/all_in_one.csv', encoding='utf-8')
df = df.fillna(method='ffill')
df.head(-5)

Unnamed: 0,Sentence ID,Word,Tag
0,1,The,O
1,1,admin@338,B-HackOrg
2,1,has,O
3,1,largely,O
4,1,targeted,O
...,...,...,...
175670,6592,TEaM,I-Tool
175671,6592,Shell,I-Tool
175672,6592,"""",O
175673,6592,",",O


In [22]:
df['Sentence ID'].nunique(), df.Word.nunique(), df.Tag.nunique()

(6582, 9529, 27)

In [23]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['ner_tags'] = le.fit_transform(df['Tag'])

In [24]:
df.groupby('ner_tags').size().reset_index(name='counts')

Unnamed: 0,ner_tags,counts
0,0,2531
1,1,1331
2,2,1025
3,3,4214
4,4,1579
5,5,1662
6,6,1361
7,7,918
8,8,1660
9,9,1327


In [25]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["ner_tags"].values.tolist())]
        self.grouped = self.data.groupby("Sentence ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["{}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [26]:
get_data = SentenceGetter(df)

sentences = [[word[0] for word in sentence] for sentence in get_data.sentences]
labels = [[s[1] for s in sentence] for sentence in get_data.sentences]

print(sentences[0])
print(labels[0])
print(len(sentences[0]))

['The', 'admin@338', 'has', 'largely', 'targeted', 'organizations', 'involved', 'in', 'financial', ',', 'economic', 'and', 'trade', 'policy', ',', 'typically', 'using', 'publicly', 'available', 'RATs', 'such', 'as', 'Poison', 'Ivy', ',', 'as', 'well', 'some', 'non-public', 'backdoors', '.']
[26, 3, 26, 26, 26, 26, 26, 26, 4, 26, 4, 26, 4, 17, 26, 26, 26, 11, 24, 24, 26, 26, 11, 24, 26, 26, 26, 26, 11, 24, 26]
31


In [27]:
from datasets import Dataset
data = {
    'tokens': sentences,
    'ner_tags': labels
}
dataset = Dataset.from_dict(data)

dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 6582
})

In [28]:
dataset['ner_tags'][0:5]

[[26,
  3,
  26,
  26,
  26,
  26,
  26,
  26,
  4,
  26,
  4,
  26,
  4,
  17,
  26,
  26,
  26,
  11,
  24,
  24,
  26,
  26,
  11,
  24,
  26,
  26,
  26,
  26,
  11,
  24,
  26],
 [26,
  3,
  26,
  26,
  0,
  13,
  6,
  19,
  26,
  26,
  26,
  26,
  26,
  4,
  26,
  4,
  26,
  26,
  0,
  13,
  26,
  0,
  26],
 [26, 0, 3, 16, 16, 26, 26, 6, 19, 19, 26, 26, 26, 26],
 [26, 3, 26, 26, 6, 19, 19, 26, 26, 26, 26],
 [26,
  10,
  23,
  26,
  26,
  3,
  26,
  12,
  25,
  25,
  26,
  26,
  26,
  26,
  26,
  26,
  6,
  19,
  26,
  26,
  26,
  26,
  26,
  26,
  26,
  26,
  26]]

In [29]:
checkpoint = "bert-base-cased"

In [30]:
from transformers import BertTokenizerFast, Trainer, TrainingArguments,BertTokenizer, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [35]:
inputs = tokenizer(dataset["tokens"][0], is_split_into_words=True)
print(inputs.tokens())
print(len(inputs.tokens()))

['[CLS]', 'The', 'ad', '##min', '@', '33', '##8', 'has', 'largely', 'targeted', 'organizations', 'involved', 'in', 'financial', ',', 'economic', 'and', 'trade', 'policy', ',', 'typically', 'using', 'publicly', 'available', 'RA', '##T', '##s', 'such', 'as', 'Po', '##ison', 'Ivy', ',', 'as', 'well', 'some', 'non', '-', 'public', 'back', '##do', '##ors', '.', '[SEP]']
44


In [33]:
inputs.word_ids()

[None,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 19,
 19,
 20,
 21,
 22,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 28,
 28,
 29,
 29,
 29,
 30,
 None]

In [36]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [39]:
labels = dataset["ner_tags"][0]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))
print(len(align_labels_with_tokens(labels, word_ids)))

[26, 3, 26, 26, 26, 26, 26, 26, 4, 26, 4, 26, 4, 17, 26, 26, 26, 11, 24, 24, 26, 26, 11, 24, 26, 26, 26, 26, 11, 24, 26]
[-100, 26, 3, 4, 4, 4, 4, 26, 26, 26, 26, 26, 26, 4, 26, 4, 26, 4, 17, 26, 26, 26, 11, 24, 24, 24, 24, 26, 26, 11, 12, 24, 26, 26, 26, 26, 11, 12, 12, 24, 24, 24, 26, -100]
44


In [63]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length",max_length= 107,
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [64]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

Map:   0%|          | 0/6582 [00:00<?, ? examples/s]

In [65]:
tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6582
})

In [66]:
print(tokenized_datasets["tokens"][0:4])
print(tokenized_datasets["ner_tags"][0:4])
print(tokenized_datasets["input_ids"][0:4])
print(tokenized_datasets["token_type_ids"][0:4])
print(tokenized_datasets["attention_mask"][0:4])
print(tokenized_datasets["labels"][0:4])

[['The', 'admin@338', 'has', 'largely', 'targeted', 'organizations', 'involved', 'in', 'financial', ',', 'economic', 'and', 'trade', 'policy', ',', 'typically', 'using', 'publicly', 'available', 'RATs', 'such', 'as', 'Poison', 'Ivy', ',', 'as', 'well', 'some', 'non-public', 'backdoors', '.'], ['The', 'admin@338', 'started', 'targeting', 'Hong', 'Kong', 'media', 'companies', ',', 'probably', 'in', 'response', 'to', 'political', 'and', 'economic', 'challenges', 'in', 'Hong', 'Kong', 'and', 'China', '.'], ['Multiple', 'China-based', 'cyber', 'threat', 'groups', 'have', 'targeted', 'international', 'media', 'organizations', 'in', 'the', 'past', '.'], ['The', 'admin@338', 'has', 'targeted', 'international', 'media', 'organizations', 'in', 'the', 'past', '.']]
[[26, 3, 26, 26, 26, 26, 26, 26, 4, 26, 4, 26, 4, 17, 26, 26, 26, 11, 24, 24, 26, 26, 11, 24, 26, 26, 26, 26, 11, 24, 26], [26, 3, 26, 26, 0, 13, 6, 19, 26, 26, 26, 26, 26, 4, 26, 4, 26, 26, 0, 13, 26, 0, 26], [26, 0, 3, 16, 16, 26, 26

In [67]:
from sklearn.model_selection import train_test_split


train_dataset, test_dataset = train_test_split(tokenized_datasets, test_size = 0.2, random_state = 2018)
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 5265
})
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1317
})


In [68]:
class BertLstmCRF(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config, dropout_prob=0.3, lstm_hidden_size=128):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(dropout_prob)

        lstm_hidden_size = lstm_hidden_size or (config.hidden_size) // 2
        self.bilstm = nn.LSTM(config.hidden_size, lstm_hidden_size, dropout=dropout_prob, batch_first=True, bidirectional=True)

        self.classifier = nn.Linear(lstm_hidden_size * 2, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, hc = self.bilstm(sequence_output)
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            log_likelihood, tags = self.crf(logits, labels), self.crf.decode(logits)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits)
        tags = torch.Tensor(tags)

        if not return_dict:
            output = (tags,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return loss, tags

In [69]:
model = BertLstmCRF.from_pretrained(checkpoint, num_labels=27)

Some weights of BertLstmCRF were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['bilstm.bias_hh_l0', 'bilstm.bias_hh_l0_reverse', 'bilstm.bias_ih_l0', 'bilstm.bias_ih_l0_reverse', 'bilstm.weight_hh_l0', 'bilstm.weight_hh_l0_reverse', 'bilstm.weight_ih_l0', 'bilstm.weight_ih_l0_reverse', 'classifier.bias', 'classifier.weight', 'crf.end_transitions', 'crf.start_transitions', 'crf.transitions']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
import seqeval
from seqeval.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    preds = pred.predictions.flatten()
    f1 = f1_score(labels, preds, average='macro')
    print(classification_report(labels, preds))
    return {
        'f1': f1
    }

In [73]:
from transformers import TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback
from transformers import DataCollatorForTokenClassification

learning_rate = 3e-5
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.01,
)

num_warmup_steps = 10

scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=len(train_dataset) // 64 * 3,
)

model.optimizer = optimizer
model.lr_scheduler = scheduler


training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch",
                                  num_train_epochs=10,              # total number of training epochs
                                  per_device_train_batch_size=8,  # batch size per device during training
                                  per_device_eval_batch_size=8,   # batch size for evaluation
                                 )

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


trainer.train()

results = trainer.evaluate()
print(results)



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
