In [None]:
# importing required libraries
import pandas as pd
import spacy
from tqdm import tqdm
from torch import cuda
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from sklearn.metrics import f1_score

In [None]:
# Install the Russian spaCy pipeline loaded below and the Hugging Face stack.
# NOTE(review): on a fresh kernel this cell has to run before the import cell,
# and the packages are unpinned, so results may drift between runs.
# NOTE(review): `seqeval` is installed here but never imported in the visible cells.
!python3 -m spacy download ru_core_news_md
!pip install transformers seqeval[gpu]

In [None]:
# Loading the dataset
# 'train.jsonl' is parsed as JSON Lines (one record per line). Later cells read
# the `sentences` and `ners` columns of the resulting frame.
jsonObj = pd.read_json(path_or_buf='train.jsonl', lines=True)
data = jsonObj  # alias only: both names point at the same DataFrame, no copy is made

In [None]:
# Load the pre-trained Russian spaCy model
# `nlp` is used as the word-level tokenizer throughout the notebook: for label
# generation, for sub-word tokenisation, and again when building the submission.
nlp = spacy.load("ru_core_news_md")

In [None]:
# Generating labels
# For every sentence, build one BIO tag per spaCy token from the character
# spans stored in `ners` (each span is unpacked as (start, end, label)).
labels_ = []
for sentence, entities in tqdm(zip(data['sentences'], data['ners']), total=data.shape[0]):
  doc = nlp(sentence)
  sentence_labels = ['O'] * len(doc) # default label
  for start, end, label in entities:
      for token in doc:
          if token.idx == start:
              sentence_labels[token.i] = 'B-' + label # beginning of entity
          elif start < token.idx < end:
              sentence_labels[token.i] = 'I-' + label # inside of entity
  # When spans overlap, the span listed later overwrites earlier tags.
  labels_.append(sentence_labels)

100%|██████████| 519/519 [01:20<00:00,  6.41it/s]


In [None]:
# Create a df for features(sentences) and labels
# The frame is called `labels_df` rather than `dataset`: the notebook later
# defines a `dataset` class, and sharing the name meant that re-running this
# cell silently replaced the class with a DataFrame.
labels_df = pd.DataFrame({'labels': labels_})
X = data['sentences']
y = labels_df['labels']

In [None]:
# Check if CUDA is available; fall back to the CPU otherwise
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# Constants defining the configuration for training and data processing
MAX_LEN = 128  # fixed length (in wordpieces, special tokens included) every example is padded/truncated to
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 25
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10  # max gradient norm passed to clip_grad_norm_ in the training function
# Initializing the BertTokenizer from Hugging Face's transformers
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
# Function to tokenize a sentence and preserve labels for each token
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Split a sentence into wordpieces and repeat each word's label per piece.

    The sentence is first split into words with the notebook-level spaCy
    pipeline ``nlp``; every word is then split into sub-words by ``tokenizer``
    and each sub-word inherits the label of the word it came from.

    Args:
        sentence: Raw sentence text.
        text_labels: One label per spaCy token of ``sentence``, in token order.
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.

    Returns:
        ``(tokenized_sentence, labels)``: two lists of equal length holding the
        wordpieces and their labels. Words that yield no wordpieces (e.g.
        whitespace-only tokens) contribute nothing to either list.
    """
    tokenized_sentence = []
    labels = []

    # The sentence is deliberately NOT stripped. The word-level labels are
    # produced from the unstripped text, and spaCy emits a separate token for
    # leading whitespace; stripping here would drop that token and shift every
    # label one word to the left.
    doc = nlp(sentence)
    for word, label in zip(doc, text_labels):
        # Process words into subwords
        tokenized_word = tokenizer.tokenize(word.text)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * len(tokenized_word))

    return tokenized_sentence, labels

In [None]:
# Combine features and labels into a single DataFrame
df = pd.concat([X,y],axis=1)
# Distinct tags in order of first appearance, with 'O' pinned to id 0 because
# the Dataset class always emits 'O' for [CLS]/[SEP]/[PAD] positions.
# The tags must be de-duplicated: enumerating the raw flattened list gave
# `id2label` one entry per token in the corpus, so `len(id2label)` (used as
# `num_labels`) was the corpus size instead of the number of tag types.
labels = list(dict.fromkeys(['O'] + [l for sublist in df.labels.to_list() for l in sublist]))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

In [None]:
# Dataset class for loading data
class dataset(Dataset):
    """Map-style dataset that turns (sentence, word labels) rows into tensors.

    Each item is tokenised into wordpieces, wrapped in ``[CLS]``/``[SEP]``,
    padded or truncated to ``max_len`` and converted to id tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the source frame and tokenisation settings.

        Args:
            dataframe: Frame with ``sentences`` and ``labels`` columns; rows
                are looked up by index label, so the index is expected to be
                0..len-1.
            tokenizer: Tokenizer exposing ``tokenize`` and
                ``convert_tokens_to_ids``.
            max_len: Fixed sequence length of every returned example.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return a dict with ``ids``, ``mask`` and ``targets`` long tensors of length ``max_len``."""
        sentence = self.data.sentences[index]
        word_labels = self.data.labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # Add the special tokens together with an outside label for each.
        # The [SEP] label has to be appended: `insert(-1, ...)` places it
        # *before* the last element, which shifted the final word's label
        # onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + list(labels) + ["O"]

        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # Too long: cut both sequences to the same length.
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # Too short: pad tokens with [PAD] and labels with 'O'.
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # Attend to every position that is not padding.
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

In [None]:
# Splitting data into training and testing datasets
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
# Whatever was not sampled for training becomes the held-out split.
test_dataset = df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

for split_message in (
    "FULL Dataset: {}".format(df.shape),
    "TRAIN Dataset: {}".format(train_dataset.shape),
    "TEST Dataset: {}".format(test_dataset.shape),
):
    print(split_message)

# Creating DataLoader instances for loading data in batches
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (519, 2)
TRAIN Dataset: (415, 2)
TEST Dataset: (104, 2)


In [None]:
# Model initialization for token classification
# The size of the classification head is taken from `id2label`, so that mapping
# must contain exactly one entry per distinct tag.
model = BertForTokenClassification.from_pretrained('DeepPavlov/rubert-base-cased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [None]:
# Optimizer setup
# Plain Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function
def train(epoch):
    """Run one training epoch over the notebook-level ``training_loader``.

    Uses the notebook globals ``model``, ``optimizer``, ``device`` and
    ``MAX_GRAD_NORM``. Prints the running loss every 100 steps and, at the
    end, the mean loss and the mean of the per-batch weighted F1 scores
    (an average of batch scores, not a corpus-level F1).

    Args:
        epoch: Index of the current epoch; accepted for the caller's
            convenience and not used inside the function.
    """
    tr_loss, tr_f1_score = 0, 0
    nb_tr_steps = 0
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training f1_score on non-padding positions only; this is
        # bookkeeping, so it is kept out of the autograd graph
        with torch.no_grad():
            flattened_targets = targets.view(-1)
            flattened_predictions = torch.argmax(tr_logits.view(-1, model.num_labels), dim=1)
            active_positions = mask.view(-1) == 1
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_f1_score += f1_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy(),average='weighted')

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must sit between backward() and step(). Clipping
        # before zero_grad()/backward() only touched the previous step's
        # gradients, which were then discarded, so updates were never clipped.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    if nb_tr_steps == 0:
        # Nothing was iterated; avoid dividing by zero below.
        print("Training skipped: the loader produced no batches.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_f1_score = tr_f1_score / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training F1 score epoch: {tr_f1_score}")

In [None]:
# Run the configured number of epochs; `train` prints its own per-epoch metrics.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 4.262213230133057
Training loss per 100 training steps: 2.0993667911775042
Training loss epoch: 2.074096041230055
Training f1_score epoch: 0.4321871796061021
Training epoch: 2
Training loss per 100 training steps: 1.4427820444107056
Training loss per 100 training steps: 1.1169813375661868
Training loss epoch: 1.1115459960240583
Training f1_score epoch: 0.6748330467604545
Training epoch: 3
Training loss per 100 training steps: 0.86073899269104
Training loss per 100 training steps: 0.7596120704518686
Training loss epoch: 0.7542113352280396
Training f1_score epoch: 0.7829484527689086
Training epoch: 4
Training loss per 100 training steps: 0.7127864956855774
Training loss per 100 training steps: 0.5831721681769532
Training loss epoch: 0.5836013718866385
Training f1_score epoch: 0.8316125091889167
Training epoch: 5
Training loss per 100 training steps: 0.44461172819137573
Training loss per 100 training steps: 0.4661581194636845
Trainin

In [None]:
# Validation function
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating weights.

    Prints the running loss every 100 steps, then the mean loss and the mean
    of the per-batch weighted F1 scores. Only positions whose attention mask
    is 1 are scored and returned.

    Args:
        model: Token-classification model called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` and
            exposing ``num_labels``.
        testing_loader: Iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        ``(labels, predictions)``: two flat lists of tag strings (gold and
        predicted) covering every non-padding position. Both are empty when
        the loader yields no batches.
    """
    model.eval()

    eval_loss, eval_f1_score = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation f1_score on non-padding positions only
            flattened_targets = targets.view(-1)
            flattened_predictions = torch.argmax(eval_logits.view(-1, model.num_labels), dim=1)
            active_positions = mask.view(-1) == 1
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

            # Store plain Python ints: one bulk transfer per batch instead of
            # keeping a 0-dim tensor per token and calling .item() on each.
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_f1_score += f1_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy(),average='weighted')

    if nb_eval_steps == 0:
        # An empty loader would otherwise raise ZeroDivisionError below.
        print("Validation skipped: the loader produced no batches.")
        return [], []

    labels = [id2label[label_id] for label_id in eval_labels]
    predictions = [id2label[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_f1_score = eval_f1_score / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation F1 Score: {eval_f1_score}")

    return labels, predictions

In [None]:
# Evaluate on the held-out split. Note that this rebinds the notebook-level
# name `labels` to the flat list of gold tag strings returned by `valid`.
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.6518080830574036
Validation Loss: 0.6322644799947739
Validation Accuracy: 0.8860312816064683


In [None]:
# prediction on new test data (for submission)
testObj = pd.read_json(path_or_buf='test_x.jsonl', lines=True)
# Normalise the text column name to `sentences`. Presumably the test file
# spells it 'senences' — TODO confirm; the rename is a no-op if no such column exists.
testObj=testObj.rename(columns={'senences':'sentences'})

In [None]:
# Predict one tag per spaCy token for every test sentence.
# The sentence is split into wordpieces word by word, exactly as
# `tokenize_and_preserve_labels` does for training, and the position of each
# word's first wordpiece is recorded. That gives an exact word <-> wordpiece
# alignment instead of re-deriving it heuristically from the tokenizer output,
# which could drift or index past the end of the spaCy doc.
y_pred = []
model.eval()  # make sure dropout is off even if this cell is run on its own
for row in range(len(testObj)):
  sentence = testObj['sentences'].iloc[row].replace('\n',' ').replace('  ',' ')
  doc = nlp(sentence)

  pieces = ['[CLS]']
  first_piece = []  # per word: index of its first wordpiece, or None if it has none
  for word in doc:
    word_pieces = tokenizer.tokenize(word.text)
    first_piece.append(len(pieces) if word_pieces else None)
    pieces.extend(word_pieces)
  # Truncate to the training length, always ending with [SEP].
  pieces = pieces[:MAX_LEN - 1] + ['[SEP]']
  sep_position = len(pieces) - 1

  ids = torch.tensor([tokenizer.convert_tokens_to_ids(pieces)], dtype=torch.long, device=device)
  mask = torch.ones_like(ids)  # no padding is added, so every position is attended to

  # forward pass (inference only, no autograd graph)
  with torch.no_grad():
    logits = model(input_ids=ids, attention_mask=mask).logits

  piece_predictions = [id2label[label_id] for label_id in torch.argmax(logits[0], dim=-1).tolist()]

  # A word takes the prediction of its first wordpiece; words without
  # wordpieces, or cut off by truncation, default to 'O'.
  word_level_predictions = [
    (word.text, piece_predictions[pos] if pos is not None and pos < sep_position else 'O')
    for word, pos in zip(doc, first_piece)
  ]
  y_pred.append(word_level_predictions)

In [None]:
# constructing the final submission
# Each entry of y_pred[row] is a (word_text, tag) tuple, so the tag has to be
# unpacked before its B-/I- prefix is inspected. Spans are emitted as
# [first_char, last_char, entity_type] with an inclusive last_char, taken from
# each token's own character offset.
# NOTE(review): offsets refer to the whitespace-normalised text below (the same
# normalisation used when predicting); confirm this matches the offsets the
# submission format expects.
answers= []
for row in range(len(y_pred)):
  text = testObj.iloc[row].sentences.replace('\n',' ').replace('  ',' ')
  answer = []
  open_type = None  # entity type of the span the previous token belonged to, if any
  for token, (_, tag) in zip(nlp(text), y_pred[row]):
    prefix, entity_type = tag[:1], tag[2:]
    last_char = token.idx + len(token.text) - 1
    if prefix == 'I' and open_type == entity_type:
      # continuation: stretch the current span to the end of this token
      answer[-1][1] = last_char
    elif prefix in ('B', 'I'):
      # a B- tag, or an I- tag with nothing to continue, starts a new span
      answer.append([token.idx, last_char, entity_type])
      open_type = entity_type
    else:
      open_type = None
  answers.append(answer)

In [None]:
# Saving the submission
# One JSON record per line with the predicted spans (`ners`) and the row `id`
# taken from the test frame; the file is then zipped via the shell.
submission = pd.DataFrame({'ners':answers, 'id':testObj.id})
submission.to_json('test.jsonl',orient='records',lines=True)
!zip test test.jsonl

  adding: test.jsonl (deflated 87%)
