In [163]:
from datasets import load_dataset
dataset = load_dataset("wikiann", "en")  # or "fr", "ar", etc.


In [164]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [165]:
# Tag names read from the "ner_tags" feature metadata of the train split;
# a tag's position in this list is the integer id stored in each row.
tags=dataset["train"].features["ner_tags"].feature.names


In [166]:
len(tags)

7

In [167]:
tags

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [168]:
# Select the row first and then the column, so only one example is read rather
# than going through the whole "tokens" column; this also matches how
# `first_row_tags` is obtained.
first_row_tokens=dataset["train"][0]["tokens"]

In [169]:
first_row_tags=dataset["train"][0]["ner_tags"]

In [170]:
first_row_tokens

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [207]:
first_row_tags

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

In [209]:
def get_text_tags(row_tokens,row_tags):
    """Print a row's tokens on one line and their tag names on the next.

    Args:
        row_tokens: Sequence of token strings for one example.
        row_tags: Sequence of integer tag ids, looked up in the module-level
            ``tags`` list.

    Tokens and tags are paired with ``zip``, so pairing stops at the shorter
    input. Every printed item is followed by a single space. Nothing is
    returned.
    """
    words = []
    names = []
    for word, tag_id in zip(row_tokens, row_tags):
        names.append(tags[tag_id] + ' ')
        words.append(word + ' ')
    print("".join(words))
    print("".join(names))


In [173]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer and the model loaded later.
model_chekpoint="bert-base-cased"
# NOTE(review): `offset_mapping` and `return_special_tokens_mask` look like
# per-call tokenizer options (e.g. `return_offsets_mapping=True` when calling
# the tokenizer) rather than `from_pretrained` options — confirm they have any
# effect here, and drop them if they do not.
tokenizer=AutoTokenizer.from_pretrained(model_chekpoint,offset_mapping=True,return_special_tokens_mask=True)

In [222]:
# Tokenize the first training row. The row is already a list of words, hence
# `is_split_into_words=True`; the result is used below to inspect `word_ids()`.
inputs=tokenizer(dataset["train"][0]['tokens'],is_split_into_words=True,return_tensors="pt")

In [227]:
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

In [179]:
import evaluate
metric=evaluate.load("seqeval")

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label per sub-token.

    Args:
        labels: Label ids, one per word of the example.
        word_ids: For each sub-token, the index of the word it came from, or
            ``None`` for positions that belong to no word.

    Returns:
        A list with one entry per item of ``word_ids``:
        * ``-100`` where the word id is ``None``;
        * the word's label for the first sub-token of a word;
        * for the following sub-tokens of the same word, the word's label,
          bumped by one when it is odd. With this notebook's tag list the odd
          ids are the ``B-`` tags and the next id is the matching ``I-`` tag.
    """
    aligned = []
    previous = None
    for wid in word_ids:
        starts_new_word = wid != previous
        previous = wid
        if wid is None:
            # Positions without a word never carry a real label.
            aligned.append(-100)
            continue
        tag = labels[wid]
        if not starts_new_word and tag % 2 == 1:
            # Continuation of a word: shift an odd id to the next id.
            tag += 1
        aligned.append(tag)
    return aligned

In [230]:
labels=dataset["train"][0]["ner_tags"]
words=inputs.word_ids()
res=align_labels_with_tokens(labels,words)

In [231]:
res

[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]

In [232]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach aligned labels.

    Args:
        examples: Batch with a ``"tokens"`` entry (one word list per example)
            and a ``"ner_tags"`` entry (one label-id list per example).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding, for each example, the result of ``align_labels_with_tokens``
        applied to that example's labels and word ids.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(row_labels, encoded.word_ids(row_idx))
        for row_idx, row_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [235]:
# Run tokenization + label alignment over every split, in batches. The original
# columns are removed, so each split keeps only what
# `tokenize_and_align_labels` returns (tokenizer outputs plus "labels").
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


In [236]:
from transformers import DataCollatorForTokenClassification
data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)

In [242]:
batch=data_collator([tokenized_datasets["train"][i] for i in range(2)])


In [262]:
labels=dataset["train"][0]["ner_tags"]

labels

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

In [272]:
res=[tags[i] for i in labels]
res


['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']

In [275]:
predictions=res.copy()
predictions[1]="O"
predictions

['B-ORG', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']

In [280]:
print(res)
print([res])

['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']
[['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']]


In [279]:
# Store the demo scores under their own name. Rebinding `metric` to the
# returned results would replace the seqeval metric object, and every later
# `metric.compute(...)` call (e.g. inside `compute`) would then fail.
demo_scores=metric.compute(predictions=[predictions],references=[res])

In [None]:
# Integer tag id -> tag name; a tag's id is its position in `tags`.
id2label=dict(enumerate(tags))
id2label



{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [None]:
# Inverse of `id2label`: tag name -> integer tag id.
label2id=dict(zip(id2label.values(), id2label.keys()))


In [286]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6}

In [287]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint for token classification, passing both label mappings.
# The classifier weights are newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions are meaningful.
model=AutoModelForTokenClassification.from_pretrained(
    model_chekpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [294]:
import os

from huggingface_hub import login

# Never store an access token in the notebook: anyone who can read the file can
# use it. The token that used to be hardcoded here must be treated as leaked
# and revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# `token=None` makes `login` ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [295]:
from transformers import TrainingArguments

# Fine-tuning configuration: 3 epochs, evaluating and logging once per epoch,
# with fp16 enabled and pushing to the Hub turned on.
args = TrainingArguments(
    "token_classification_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    eval_strategy="epoch",
    logging_strategy="epoch",
    push_to_hub=True,
)

In [None]:
import numpy as np
def compute(outputs):
    """Convert evaluation outputs into overall seqeval scores.

    Args:
        outputs: A ``(logits, labels)`` pair. ``logits`` is arg-maxed over its
            last axis to obtain predicted label ids; ``labels`` holds the
            reference label ids, with ``-100`` marking positions to ignore.
            Both are iterated sequence by sequence.
            (assumes matching batch/sequence dimensions — TODO confirm)

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the metric.

    Relies on two module-level names: ``tags`` (label id -> tag name) and
    ``metric``, which must be the seqeval metric object exposing ``compute``.
    """
    logits,labels=outputs
    predictions=np.argmax(logits,axis=-1)
    # One list of tag names per sequence, skipping ignored (-100) positions.
    true_predictions=[
        [tags[p] for (p,l) in zip(pred,label) if l!=-100]
        for pred, label in zip(predictions,labels)
    ]
    # References must be built per sequence too, so they line up with the
    # predictions one-to-one.
    true_labels=[
        [tags[l] for l in label if l!=-100]
        for label in labels
    ]
    all_metrics=metric.compute(
        predictions=true_predictions,references=true_labels
    )
    return {
        "precision":all_metrics["overall_precision"],
        "recall":all_metrics["overall_recall"],
        "f1":all_metrics["overall_f1"],
        "accuracy":all_metrics["overall_accuracy"]
    }

: 

In [None]:
from transformers import Trainer

# Build the Trainer from the model, training arguments, tokenized train /
# validation splits, data collator and tokenizer defined in earlier cells.
# NOTE(review): no `compute_metrics` argument is passed, so evaluation reports
# only the loss (the epoch table printed by `trainer.train()` has just the
# training and validation loss columns) and the `compute` function defined
# earlier is never called.
# TODO: pass `compute_metrics=compute` to get precision / recall / F1 / accuracy
# per epoch.
trainer=Trainer(
model=model,
args=args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
processing_class=tokenizer
)



In [297]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mabderraoufheboul[0m ([33mabderraoufheboul-devformm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.3648,0.267417
2,0.2063,0.278565
3,0.1375,0.312378


TrainOutput(global_step=7500, training_loss=0.236201220703125, metrics={'train_runtime': 744.5566, 'train_samples_per_second': 80.585, 'train_steps_per_second': 10.073, 'total_flos': 791438327074560.0, 'train_loss': 0.236201220703125, 'epoch': 3.0})