In [1]:
!pip install pytorch-crf
!pip install datasets
!pip install transformers[torch]
!pip install accelerate>=0.20.1

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
print('Tensorflow version: ', tf.__version__)

Tensorflow version:  2.15.0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/MyDrive/BERT Variations/all_in_one.csv', encoding='utf-8')
df = df.fillna(method='ffill')
df.head(-5)

Unnamed: 0,Sentence ID,Word,Tag
0,1,The,O
1,1,admin@338,B-HackOrg
2,1,has,O
3,1,largely,O
4,1,targeted,O
...,...,...,...
175670,6592,TEaM,I-Tool
175671,6592,Shell,I-Tool
175672,6592,"""",O
175673,6592,",",O


In [5]:
df['Sentence ID'].nunique(), df.Word.nunique(), df.Tag.nunique()

(6582, 9529, 27)

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['ner_tags'] = le.fit_transform(df['Tag'])
df['ner_tags'] = df['ner_tags'] + 1

In [7]:
df.groupby('ner_tags').size().reset_index(name='counts')

Unnamed: 0,ner_tags,counts
0,1,2531
1,2,1331
2,3,1025
3,4,4214
4,5,1579
5,6,1662
6,7,1361
7,8,918
8,9,1660
9,10,1327


In [8]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["ner_tags"].values.tolist())]
        self.grouped = self.data.groupby("Sentence ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["{}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
getter = SentenceGetter(df)

In [10]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
sentences[0]

['The',
 'admin@338',
 'has',
 'largely',
 'targeted',
 'organizations',
 'involved',
 'in',
 'financial',
 ',',
 'economic',
 'and',
 'trade',
 'policy',
 ',',
 'typically',
 'using',
 'publicly',
 'available',
 'RATs',
 'such',
 'as',
 'Poison',
 'Ivy',
 ',',
 'as',
 'well',
 'some',
 'non-public',
 'backdoors',
 '.']

In [11]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]
print(labels[0])

[27, 4, 27, 27, 27, 27, 27, 27, 5, 27, 5, 27, 5, 18, 27, 27, 27, 12, 25, 25, 27, 27, 12, 25, 27, 27, 27, 27, 12, 25, 27]


In [12]:
def split_tokens_and_labels(tokens_list, labels_list, max_length=75):
    new_tokens_list = []
    new_labels_list = []

    for tokens, labels in zip(tokens_list, labels_list):
        if len(tokens) <= max_length:
            new_tokens_list.append(tokens)
            new_labels_list.append(labels)
        else:
            start = 0
            end = max_length
            while start < len(tokens):
                new_tokens_list.append(tokens[start:end])
                new_labels_list.append(labels[start:end])
                start = end
                end = start + max_length

    return new_tokens_list, new_labels_list

In [13]:
new_tokens_list, new_labels_list = split_tokens_and_labels(sentences, labels, max_length=75)

In [14]:
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import  TokenClassifierOutput
from torch import nn
from torch.nn import CrossEntropyLoss
import torch
from torchcrf import CRF

In [15]:
class BertCRF(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            log_likelihood, tags = self.crf(logits, labels), self.crf.decode(logits)
            loss = 0 - log_likelihood
        else:
            tags = self.crf.decode(logits)
        tags = torch.Tensor(tags)

        if not return_dict:
            output = (tags,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return loss, tags

In [16]:
from datasets import Dataset
data = {
    'tokens': new_tokens_list,
    'ner_tags': new_labels_list
}
dataset = Dataset.from_dict(data)

In [17]:
from sklearn.metrics import classification_report, f1_score
from transformers import BertTokenizerFast, Trainer, TrainingArguments,BertTokenizer
from transformers.trainer_utils import IntervalStrategy
from sklearn.model_selection import train_test_split


train_dataset, test_dataset = train_test_split(dataset, test_size = 0.2, random_state = 2018)
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

train_dataset = train_dataset.rename_column('ner_tags', 'label_ids')
test_dataset = test_dataset.rename_column('ner_tags', 'label_ids')

In [22]:
model = BertCRF.from_pretrained('bert-base-cased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Some weights of BertCRF were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'crf.end_transitions', 'crf.start_transitions', 'crf.transitions']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def tokenize(batch):
    result = {
        'label_ids': [],
        'input_ids': [],
        'token_type_ids': [],
    }
    max_length = 512

    for tokens, label in zip(batch['tokens'], batch['label_ids']):
        tokenids = tokenizer(tokens, add_special_tokens=False, truncation=True, max_length=max_length)  # Apply truncation

        token_ids = []
        label_ids = []
        num_labels = len(label)  # Track the number of actual labels

        for ids, lab in zip(tokenids['input_ids'], label):
            token_ids.extend(ids)
            label_ids.append(lab)

        # Ensure consistent padding (pad with -100)
        diff = max_length - len(token_ids)
        label_ids += [-100] * diff

        token_type_ids = tokenizer.create_token_type_ids_from_sequences(token_ids)
        token_ids = tokenizer.build_inputs_with_special_tokens(token_ids)
        result['input_ids'].append(token_ids)
        result['label_ids'].append(label_ids)
        result['token_type_ids'].append(token_type_ids)

    result = tokenizer.pad(result, padding='longest', max_length=max_length, return_attention_mask=True, )

    return result




train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
train_dataset.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label_ids'])
test_dataset.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label_ids'])


def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    preds = pred.predictions.flatten()
    f1 = f1_score(labels, preds, average='macro')
    print(classification_report(labels, preds))
    return {
        'f1': f1
    }


Map:   0%|          | 0/5266 [00:00<?, ? examples/s]

Map:   0%|          | 0/1317 [00:00<?, ? examples/s]

In [30]:
from transformers import TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback


learning_rate = 3e-5
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.01,
)

num_warmup_steps = 200

scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=len(train_dataset) // 64 * 3,
)

model.optimizer = optimizer
model.lr_scheduler = scheduler

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    save_strategy=IntervalStrategy.EPOCH,
    evaluation_strategy=IntervalStrategy.EPOCH,
    logging_dir='./logs',

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="tensorboard",
    run_name="my_experiment",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


trainer.train()

results = trainer.evaluate()
print(results)

RuntimeError: stack expects each tensor to be equal size, but got [503] at entry 0 and [506] at entry 1