In [2]:
import pandas as pd
from datasets import load_dataset

In [3]:
from google.colab import files
uploaded = files.upload()

Saving NER dataset.csv to NER dataset.csv


In [4]:
import pandas as pd

# Try with Windows-1252 first
df = pd.read_csv("NER dataset.csv", encoding="windows-1252")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [6]:
df = df.drop(columns=["POS"])

In [7]:
df["Sentence #"] = df["Sentence #"].fillna(method="ffill")

  df["Sentence #"] = df["Sentence #"].fillna(method="ffill")


In [8]:
# ===== Group into sentences =====
sentences = []
labels = []

# Fill any potential NaN values in 'Word' with empty strings before grouping
df["Word"] = df["Word"].fillna("")

for _, group in df.groupby("Sentence #"):
    words = group["Word"].tolist()
    tags = group["Tag"].tolist()
    sentences.append(words)
    labels.append(tags)

In [9]:
# Create label list
label_list = sorted(list(set(tag for doc in labels for tag in doc)))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

In [10]:
# ===== Prepare dict for HuggingFace Dataset =====
data_dict = {
    "tokens": sentences,  # list of lists of strings
    "ner_tags": [[label_to_id[tag] for tag in doc] for doc in labels]  # list of lists of ints
}

# Ensure all types are correct before creating dataset
assert all(isinstance(tok, str) for doc in data_dict["tokens"] for tok in doc)
assert all(isinstance(tag, int) for doc in data_dict["ner_tags"] for tag in doc)

# ===== Convert to Dataset =====
dataset = Dataset.from_dict(data_dict)

# ===== Train-test split =====
train_test = dataset.train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    "train": train_test["train"],
    "test": train_test["test"]
})

print(dataset)
print("Labels:", label_list)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 43163
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4796
    })
})
Labels: ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [11]:
dataset["train"] = dataset["train"].select(range(20000))
dataset["test"] = dataset["test"].select(range(2000))


In [12]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=bf022e4df6d34e5c41b332f1c8bb1cbe9ae16cc29c24cfa4d929885bdb4f4ef6
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np

In [14]:
# =============================
# 4. Tokenizer
# =============================
model_checkpoint = "distilbert-base-cased"  # You can change to any compatible model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Align labels with tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Ignore sub-tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
# =============================
# 5. Model
# =============================
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# =============================
# 6. Data Collator
# =============================
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [17]:
# =============================
# 7. Metrics
# =============================
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id_to_label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    report = classification_report(true_labels, true_predictions, digits=4)
    print(report)

    return {
        "f1": float(report.split()[-2]),  # last F1 score in report
    }


In [18]:
# =============================
# 8. Training Arguments
# =============================
args = TrainingArguments(
    "ner-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)


In [19]:
# =============================
# 9. Trainer
# =============================
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# =============================
# 10. Train
# =============================
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.1016,0.096605,0.8207


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art     0.0000    0.0000    0.0000        22
         eve     0.5000    0.3333    0.4000        12
         geo     0.8578    0.8813    0.8694      1575
         gpe     0.9189    0.9563    0.9372       640
         nat     0.4286    0.5000    0.4615         6
         org     0.6854    0.6769    0.6811       811
         per     0.7558    0.8003    0.7774       696
         tim     0.8232    0.8607    0.8415       833

   micro avg     0.8135    0.8335    0.8234      4595
   macro avg     0.6212    0.6261    0.6210      4595
weighted avg     0.8086    0.8335    0.8207      4595



Epoch,Training Loss,Validation Loss,F1
1,0.1016,0.096605,0.8207
2,0.0754,0.089656,0.8298
3,0.0662,0.09313,0.836


              precision    recall  f1-score   support

         art     0.6667    0.0909    0.1600        22
         eve     0.3636    0.3333    0.3478        12
         geo     0.8616    0.8895    0.8754      1575
         gpe     0.9668    0.9563    0.9615       640
         nat     0.6667    0.3333    0.4444         6
         org     0.7026    0.6671    0.6844       811
         per     0.7373    0.8147    0.7741       696
         tim     0.8629    0.8535    0.8582       833

   micro avg     0.8278    0.8357    0.8317      4595
   macro avg     0.7285    0.6173    0.6382      4595
weighted avg     0.8271    0.8357    0.8298      4595

              precision    recall  f1-score   support

         art     0.6000    0.1364    0.2222        22
         eve     0.3636    0.3333    0.3478        12
         geo     0.8638    0.8978    0.8804      1575
         gpe     0.9652    0.9547    0.9599       640
         nat     0.7500    0.5000    0.6000         6
         org     0.7000 

TrainOutput(global_step=7500, training_loss=0.0984513845761617, metrics={'train_runtime': 18672.1481, 'train_samples_per_second': 3.213, 'train_steps_per_second': 0.402, 'total_flos': 641494886798832.0, 'train_loss': 0.0984513845761617, 'epoch': 3.0})

In [20]:
# =============================
# 11. Evaluate
# =============================
trainer.evaluate()

              precision    recall  f1-score   support

         art     0.6000    0.1364    0.2222        22
         eve     0.3636    0.3333    0.3478        12
         geo     0.8638    0.8978    0.8804      1575
         gpe     0.9652    0.9547    0.9599       640
         nat     0.7500    0.5000    0.6000         6
         org     0.7000    0.7078    0.7039       811
         per     0.7759    0.7859    0.7809       696
         tim     0.8597    0.8535    0.8566       833

   micro avg     0.8330    0.8416    0.8373      4595
   macro avg     0.7348    0.6462    0.6690      4595
weighted avg     0.8322    0.8416    0.8360      4595



{'eval_loss': 0.0931297168135643,
 'eval_f1': 0.836,
 'eval_runtime': 152.435,
 'eval_samples_per_second': 13.12,
 'eval_steps_per_second': 1.64,
 'epoch': 3.0}

In [2]:
import gradio as gr

def predict_ner(text):
    # Ensure the checkpoint path is correct
    checkpoint = "/content/ner-model/checkpoint-7500" # Update this if your checkpoint path is different
    token_classifier = pipeline(
        "token-classification", model=checkpoint, aggregation_strategy="simple"
    )
    return token_classifier(text)

iface = gr.Interface(fn=predict_ner, inputs="text", outputs="json")
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://857bad2e24191a93bb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


