In [1]:
!pip install evaluate
!pip install seqeval

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=02345e6be580b823b95c9b0a61810d

In [2]:
import os
import string
from typing import List
import warnings
import numpy as np
import pandas as pd
from itertools import chain
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import nltk
import spacy
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier

import torch
from torch.utils.data import DataLoader, random_split, Dataset
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

%matplotlib inline



<a id='Token_classification_with_Transformers'></a>
# Token classification with Transformers

In this notebook we'll attempt to use tranformers on our mountain dataset 

In [3]:
import pandas as pd

df_mountains = pd.read_csv('/kaggle/input/mountain-ner-dataset/mountain_dataset_with_markup.csv', converters={'marker': eval})


In [4]:
df_mountains

Unnamed: 0,text,marker
0,A visit to a science museum for hands-on learn...,[]
1,Voice surface coach set democratic time year. ...,[]
2,Parent according maybe activity activity finis...,[]
3,A visit to a sculpture garden with intriguing ...,[]
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]"
...,...,...
1579,They never audience meet. Appear region allow ...,[]
1580,Witnessing the mesmerizing Northern Lights dan...,"[(75, 97)]"
1581,Consumer join stage. Best likely center they p...,[]
1582,Hospital real school cover hotel over. Any tra...,[]


In [5]:
# Our synthetic data is pretty clean so no need in such brutal cleaning
# Yet this is a great example of how to process data from different sources like 
# Telegram Twitter Instagram etc.
def preprocess_text(text):
    # Remove links
    text = re.sub(r'http\S+|www.\S+', '', text)

    # Special remove telegram links
    pattern = r"(?:https?:\/\/)?(?:www\.)?(?:t\.me\/\S+|telegram\.me\/\S+|telegram\.dog\/\S+)"
    text = re.sub(pattern, '', text)

    # Remove phone numbers
    phone_regex = r'\(?\+?\d{0,3}\)?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}'
    text = re.sub(phone_regex, '', text)

    # Remove special characters
    text = re.sub(r'[\n\t\r]', ' ', text)

    # Remove tags
    text = re.sub(r'@\w+', '', text)

    # Remove emojis
    emoji_pattern = re.compile(
        pattern="["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Remove multiple spaces
    text = re.sub(r' +', ' ', text)

    return text

In [6]:
df_mountains.shape

(1584, 2)

In [7]:
def divide_markers(text, markers):
    divided_markers = []
    
    for start, end in markers:
        # Extract the mountain name from the text based on the marker
        mountain_name = text[start:end]

        # Split the mountain name into individual words
        words = mountain_name.split(" ")

        # Generate divided markers for each word
        for word in words:
            word_start = text.find(word, start)
            word_end = word_start + len(word)
            divided_markers.append((word_start, word_end))

    return divided_markers


# Apply the divide_markers function to the DataFrame
df_mountains['loc_markers'] = df_mountains.apply(lambda row: divide_markers(row['text'], row['marker']), axis=1)

In [8]:
df_mountains

Unnamed: 0,text,marker,loc_markers
0,A visit to a science museum for hands-on learn...,[],[]
1,Voice surface coach set democratic time year. ...,[],[]
2,Parent according maybe activity activity finis...,[],[]
3,A visit to a sculpture garden with intriguing ...,[],[]
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]","[(11, 15)]"
...,...,...,...
1579,They never audience meet. Appear region allow ...,[],[]
1580,Witnessing the mesmerizing Northern Lights dan...,"[(75, 97)]","[(75, 87), (88, 97)]"
1581,Consumer join stage. Best likely center they p...,[],[]
1582,Hospital real school cover hotel over. Any tra...,[],[]


In [9]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df_mountains['loc_markers'] = df_mountains['loc_markers'].apply(lambda x: [[y[0], y[1], 'LOC'] for y in x])

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['text'],
        "label": row['loc_markers']
    }
    doc = nlp(data["text"])
    ents = []

    # Sort the spans based on their start positions
    sorted_spans = sorted(data["label"], key=lambda x: x[0])

    for start, end, label in sorted_spans:
        span = doc.char_span(start, end, label=label)

        # Check for overlaps with existing spans
        if span is not None:
            if not any(span.start >= ent.start and span.end <= ent.end for ent in ents):
                ents.append(span)
        else:
            pass
            # TODO: fix not align to token case
            # print("Skipping span (does not align to tokens):", start, end, label, doc.text[start:end])

    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df_mountains['conll'] = df_mountains.progress_apply(convert_to_conll, axis=1)

  0%|          | 0/1584 [00:00<?, ?it/s]

In [10]:
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}

df_mountains['tokens'] = df_mountains.conll.str['tokens']
df_mountains['ner_tags'] = df_mountains.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_mountains['is_valid'] = 0
df_mountains.loc[df_mountains.index > 1200, 'is_valid'] = 1

df_train = df_mountains[df_mountains.is_valid == 0]
df_valid = df_mountains[df_mountains.is_valid == 1]

In [11]:
df_mountains[['tokens', 'ner_tags']].to_json(
    'train_processed.json', orient='records', lines=True)
df_mountains[['tokens', 'ner_tags']].to_json(
    'valid_processed.json', orient='records', lines=True)

In [12]:
from datasets import load_dataset

raw_datasets = load_dataset(
    "json",
    data_files={
        'train': 'train_processed.json',
        'val': 'valid_processed.json'
    }
)
raw_datasets

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-7e2e1697bb44d3b5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7e2e1697bb44d3b5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1584
    })
    val: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1584
    })
})

In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large'
#                                           , add_prefix_space=True
                                         )

Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [14]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [15]:
tokenized_datasets_ua = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
from transformers import TrainingArguments

args = TrainingArguments(
    "roberta-base",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5
)

In [17]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.roberta.parameters()), 'lr':  0.0000016},
    {'params': list(model.classifier.parameters()), 'lr':  0.00012}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*3*(tokenized_datasets_ua['train'].num_rows/8),
    num_training_steps=3*(tokenized_datasets_ua['train'].num_rows/8)
)



In [18]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

label_names = list(label2id.keys())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [19]:
!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [20]:
my_api_key = '9d4d0591e05d13690d35330a36ffa0de42a04006'

In [21]:
from transformers import Trainer
import wandb

wandb.login(key=my_api_key)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets_ua["train"],
    eval_dataset=tokenized_datasets_ua["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)
trainer.train()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgencgeray[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.053923,0.0,0.0,0.0,0.978682
2,No log,0.029125,0.490085,0.42928,0.457672,0.988587
3,0.222000,0.025578,0.553435,0.719603,0.625674,0.993525
4,0.222000,0.025578,0.553435,0.719603,0.625674,0.993525
5,0.222000,0.025578,0.553435,0.719603,0.625674,0.993525


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=990, training_loss=0.1258959856900302, metrics={'train_runtime': 174.5843, 'train_samples_per_second': 45.365, 'train_steps_per_second': 5.671, 'total_flos': 141587377365744.0, 'train_loss': 0.1258959856900302, 'epoch': 5.0})

In [22]:
trainer.save_model("roberta-base")

There will be no saved wheights as roberta (even base one) is pretty heavy

In [23]:
# For inference if needed
# from transformers import pipeline

# # Replace this with your own checkpoint
# model_checkpoint = "roberta-base"
# token_classifier = pipeline(
#     "token-classification", model=model_checkpoint, aggregation_strategy="simple"
# )

### Notebook Summary:

1. **Model and Tokenizer Loading:**
   - Loaded a token classification model (`AutoModelForTokenClassification`) and tokenizer (`AutoTokenizer`) from the Hugging Face Transformers library.
   - Used the `xlm-roberta-base` model with specific label mapping.

2. **Data Processing:**
   - Defined functions for aligning labels with tokens and tokenizing input data.
   - Applied these functions to preprocess the dataset, aligning labels with tokenized inputs.

3. **Training Setup:**
   - Defined training arguments using `TrainingArguments`, specifying batch sizes, evaluation strategy, and number of epochs.
   - Configured an optimizer (`AdamW`) and a learning rate scheduler.

4. **Evaluation Metrics:**
   - Utilized the `seqeval` library for computing precision, recall, F1 score, and accuracy during model evaluation.

5. **WandB Integration:**
   - Integrated WandB for experiment tracking during model training.

6. **Trainer Setup and Training:**
   - Configured the `Trainer` with the loaded model, training arguments, datasets, data collator, and evaluation metrics.
   - Initiated model training using the `trainer.train()` method.

7. **Conclusion and Saving Model:**
   - Logged in to WandB using the provided API key.
   - Executed training and saved the trained model.

8. **Inference (Optional):**
   - Provided a commented-out section for setting up a pipeline for token classification inference.

9. **Summary and Future Steps:**
   - The notebook focuses on fine-tuning a token classification model for the task of Named Entity Recognition (NER) on mountain names.
   - It saves the trained model for future use.

### Conclusion:

The notebook covers essential aspects of training a token classification model for NER. It demonstrates proficiency in using the Hugging Face Transformers library and includes experiment tracking with WandB. 