In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving NER dataset.csv to NER dataset.csv


In [3]:
import pandas as pd

In [4]:
import pandas as pd

# Try with Windows-1252 first
df = pd.read_csv("NER dataset.csv", encoding="windows-1252")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [6]:
df = df.drop(columns=["POS"])

In [7]:
df["Sentence #"] = df["Sentence #"].fillna(method="ffill")

  df["Sentence #"] = df["Sentence #"].fillna(method="ffill")


In [8]:
# ===== Group into sentences =====
sentences = []
labels = []

# Fill any potential NaN values in 'Word' with empty strings before grouping
df["Word"] = df["Word"].fillna("")

for _, group in df.groupby("Sentence #"):
    words = group["Word"].tolist()
    tags = group["Tag"].tolist()
    sentences.append(words)
    labels.append(tags)

In [9]:
# Create label list
label_list = sorted(list(set(tag for doc in labels for tag in doc)))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

In [10]:
# ===== Prepare dict for HuggingFace Dataset =====
data_dict = {
    "tokens": sentences,  # list of lists of strings
    "ner_tags": [[label_to_id[tag] for tag in doc] for doc in labels]  # list of lists of ints
}

# Ensure all types are correct before creating dataset
assert all(isinstance(tok, str) for doc in data_dict["tokens"] for tok in doc)
assert all(isinstance(tag, int) for doc in data_dict["ner_tags"] for tag in doc)

# ===== Convert to Dataset =====
dataset = Dataset.from_dict(data_dict)

# ===== Train-test split =====
train_test = dataset.train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    "train": train_test["train"],
    "test": train_test["test"]
})

print(dataset)
print("Labels:", label_list)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 43163
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4796
    })
})
Labels: ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [11]:
dataset["train"] = dataset["train"].select(range(10000))
dataset["test"] = dataset["test"].select(range(2000))

In [12]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=2f2d9db814fe1c8bcb78184b5e4bf2e49edd13f28cc7c01bb22d59a464b8c723
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np

In [15]:
# =============================
# 4. Tokenizer
# =============================
model_checkpoint = "roberta-base"  # You can change to any compatible model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Align labels with tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Ignore sub-tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [16]:
# =============================
# 5. Model
# =============================
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# =============================
# 6. Data Collator
# =============================
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [18]:
# =============================
# 7. Metrics
# =============================
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id_to_label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    report = classification_report(true_labels, true_predictions, digits=4)
    print(report)

    return {
        "f1": float(report.split()[-2]),  # last F1 score in report
    }


In [19]:
# =============================
# 8. Training Arguments
# =============================
args = TrainingArguments(
    "ner-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)


In [20]:
# =============================
# 9. Trainer
# =============================
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# =============================
# 10. Train
# =============================
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.1119,0.115732,0.8005


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art     0.0000    0.0000    0.0000        22
         eve     0.0000    0.0000    0.0000        12
         geo     0.8232    0.8959    0.8580      1575
         gpe     0.9341    0.9297    0.9319       640
         nat     0.0000    0.0000    0.0000         6
         org     0.6291    0.6128    0.6209       811
         per     0.7424    0.7744    0.7581       696
         tim     0.8564    0.8235    0.8397       833

   micro avg     0.7986    0.8113    0.8049      4595
   macro avg     0.4982    0.5045    0.5011      4595
weighted avg     0.7910    0.8113    0.8005      4595



Epoch,Training Loss,Validation Loss,F1
1,0.1119,0.115732,0.8005
2,0.1074,0.103887,0.8165


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art     0.0000    0.0000    0.0000        22
         eve     0.6667    0.3333    0.4444        12
         geo     0.8578    0.8692    0.8635      1575
         gpe     0.9060    0.9484    0.9267       640
         nat     0.0000    0.0000    0.0000         6
         org     0.6594    0.6757    0.6675       811
         per     0.7730    0.7974    0.7850       696
         tim     0.8501    0.8439    0.8470       833

   micro avg     0.8145    0.8239    0.8192      4595
   macro avg     0.5891    0.5585    0.5668      4595
weighted avg     0.8095    0.8239    0.8165      4595



KeyboardInterrupt: 

In [None]:
# =============================
# 11. Evaluate
# =============================
trainer.evaluate()

In [22]:
import gradio as gr

def predict_ner(text):
    # Ensure the checkpoint path is correct
    checkpoint = "/content/ner-model/checkpoint-625" # Update this if your checkpoint path is different
    token_classifier = pipeline(
        "token-classification", model=checkpoint, aggregation_strategy="simple"
    )
    return token_classifier(text)

iface = gr.Interface(fn=predict_ner, inputs="text", outputs="json")
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c9813fffea546306e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


