In [1]:
!pip install transformers datasets tokenizers seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import json
import random
import datasets
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from transformers import TrainingArguments, Trainer

In [8]:
# Files paths
first_json_file = "/content/data_200_mountains.json"
second_json_file = "/content/data_200_no_mountains.json"
output_path = "/content/result_data.json"

In [9]:
# First file is already loaded, we should write the second file with kagle dataset samples

# Loading the dataset
dataset = load_dataset('rjac/kaggle-entity-annotated-corpus-ner-dataset')

# Select the first 200 samples
subset = dataset['train'].select(range(200))

# Add custom IDs from 201 to 400 and adjust ner_tags
subset_filtered = subset.map(lambda x, idx: {
    'id': str(201 + idx),
    'tokens': x['tokens'],
    'ner_tags': [0 if tag != 0 else tag for tag in x['ner_tags']]
}, with_indices=True)

# Remove any other fields
subset_filtered = subset_filtered.remove_columns([col for col in subset_filtered.column_names if col not in ['id', 'tokens', 'ner_tags']])

# Save as JSON
subset_filtered.to_json(second_json_file, orient='records', lines=False)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

52073

In [10]:
# Loading files
def load_json(file_path):
    """
    Function for loading json file by specified file_path
    """
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Concat json files
def merge_json_files(file1_path, file2_path, output_path):
    """
    Function of merging two json files
    """
    data1 = load_json(file1_path)
    data2 = load_json(file2_path)

    # Concatenate them
    merged_data = data1 + data2

    # Write the merged data to a new JSON file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(merged_data, output_file, ensure_ascii=False, indent=4)

In [11]:
output_path = 'result_data.json'
# Merge the JSON files
merge_json_files(first_json_file, second_json_file, output_path)

In [12]:
#Load and shuffle final data
data = load_json(output_path)
random.shuffle(data)

In [20]:
#Set up tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")



In [14]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """
    Tokenizes input text and aligns named entity recognition (NER) labels with the tokens,
    handling subwords based on the label_all_tokens flag.
    Returns tokenized inputs with aligned labels.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get the word IDs of the tokens
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens like [CLS] and [SEP] don't correspond to any label
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Check if word_idx is valid
                if word_idx < len(label):  # Ensure it is within range
                    label_ids.append(label[word_idx])
                else:
                    print(f"Warning: word_idx {word_idx} out of range for example {i}.")
                    label_ids.append(-100)
            else:
                # For subwords, we either keep the same label or ignore them (-100)
                if label_all_tokens:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [15]:
#Tokenizing dataset
dataset = datasets.Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/401 [00:00<?, ? examples/s]

In [17]:
# Getting pretrained Bert model
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_preds):
    """
    Returns metrics for training
    """
    pred_logits, labels = eval_preds
    pred_logits = np.argmax(pred_logits, axis=2)

    # Use list comprehensions to create filtered predictions and true labels
    predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    # Ensure predictions are non-empty before computing metrics
    if predictions and true_labels:
        results = metric.compute(predictions=predictions, references=true_labels)
    else:
        results = {"overall_precision": 0, "overall_recall": 0, "overall_f1": 0, "overall_accuracy": 0}

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Data collator for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Load seqeval metric for evaluation
metric = evaluate.load("seqeval")

In [None]:
# Define the list of labels
label_list = ['O', 'B-MOUNT', 'I-MOUNT']

In [None]:
# Define training arguments
args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    logging_strategy="steps",
    report_to="none",
)

In [19]:
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset.select(range(350)),  # Select first 150 samples for training
    eval_dataset=tokenized_dataset.select(range(350, len(tokenized_datasets))),  # Remaining for evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2094,0.098996,0.125,0.12,0.122449,0.965481
2,0.0886,0.072361,0.666667,0.8,0.727273,0.983264
3,0.0555,0.063639,0.645161,0.8,0.714286,0.982218
4,0.0311,0.060252,0.655172,0.76,0.703704,0.983264
5,0.0375,0.062476,0.666667,0.8,0.727273,0.98431
6,0.0261,0.061865,0.7,0.84,0.763636,0.986402
7,0.0261,0.057077,0.714286,0.8,0.754717,0.987448
8,0.0217,0.060781,0.677419,0.84,0.75,0.985356
9,0.0168,0.062101,0.677419,0.84,0.75,0.985356
10,0.02,0.062275,0.7,0.84,0.763636,0.986402


TrainOutput(global_step=220, training_loss=0.06718920909545638, metrics={'train_runtime': 420.7539, 'train_samples_per_second': 8.318, 'train_steps_per_second': 0.523, 'total_flos': 914546916864000.0, 'train_loss': 0.06718920909545638, 'epoch': 10.0})

In [21]:
# Saving trained model
model.save_pretrained("ner_mountains_resulting")

In [22]:
# Creating dictionaries for config
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [23]:
id2label

{'0': 'O', '1': 'B-MOUNT', '2': 'I-MOUNT'}

In [24]:
label2id

{'O': '0', 'B-MOUNT': '1', 'I-MOUNT': '2'}

In [25]:
#Modifying model config file
config = json.load(open("ner_mountains_resulting/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_mountains_resulting/config.json","w"))

In [26]:
#Downloading trained model
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_mountains_resulting")

In [27]:
# NER pipeline using the fine-tuned model
ner_mountains = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer, aggregation_strategy="simple")

example = "Mountains stand as monumental sentinels of the Earth, their towering peaks reaching toward the heavens. Each range tells a story of geological forces that shaped our planet over millions of years. From the jagged spires of the Himalayas, home to the world's highest peak, Mount Everest."

# Run NER on the example text
ner_results = ner_mountains(example)

# Print results
for result in ner_results:
    print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'entity_group': 'MOUNT', 'score': 0.68168813, 'word': 'himalayas', 'start': 227, 'end': 236}
{'entity_group': 'MOUNT', 'score': 0.9066627, 'word': 'mount everest', 'start': 272, 'end': 285}


In [28]:
!pip freeze > requirements.txt