In [1]:
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
    pipeline, TrainingArguments, Trainer, DataCollatorForTokenClassification)
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

Загружаем датасет и преобразуем

In [2]:
data = pd.read_csv("ner.csv")
data.dropna(inplace=True)
del data['POS']
del data['Sentence #']
data = data.rename(columns={'Sentence': 'tokens','Tag': 'ner_tags'})

In [3]:
%%capture output
import ast

for i in range(len(data)):
    tags = ast.literal_eval(data['ner_tags'][i])
    data['ner_tags'][i] = [str(word.upper()) for word in tags]

In [4]:
data

Unnamed: 0,tokens,ner_tags
0,Thousands of demonstrators have marched throug...,"[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,Families of soldiers killed in the conflict jo...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,They marched from the Houses of Parliament to ...,"[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO..."
3,"Police put the number of marchers at 10,000 wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,The protest comes on the eve of the annual con...,"[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,..."
...,...,...
47954,Indian border security forces are accusing the...,"[B-GPE, O, O, O, O, O, O, B-GPE, O, O, O, O, O..."
47955,Indian officials said no one was injured in Sa...,"[B-GPE, O, O, O, O, O, O, O, B-TIM, O, O, O, O..."
47956,Two more landed in fields belonging to a nearb...,"[O, O, O, O, O, O, O, O, O, O, O]"
47957,They say not all of the rockets exploded upon ...,"[O, O, O, O, O, O, O, O, O, O, O]"


Находим уникальные метки слов в датасете для задачи NER

In [5]:
entity_codes = set([val for sublist in data['ner_tags'].values for val in sublist])
entity_codes

{'B-ART',
 'B-EVE',
 'B-GEO',
 'B-GPE',
 'B-NAT',
 'B-ORG',
 'B-PER',
 'B-TIM',
 'I-ART',
 'I-EVE',
 'I-GEO',
 'I-GPE',
 'I-NAT',
 'I-ORG',
 'I-PER',
 'I-TIM',
 'O'}

## Метки



| Код в датасете | Описание | Код в предобученной модели |
|------|------|------|
| ORG | Organization | ORG |
| PER | Person| PER |
| GEO | Geographical Entity| LOC |
| GPE | Geopolitical Entity| MISC - Miscellaneous entity |
| ART | Artifact| O |
| EVE | Event| O |
| NAT | Natural Phenomenon| O |
| TIM | Time indicator| O |
| O | Outside of a named entity| O |

Проверка предобученной модели трансформера для задачи NER

In [6]:
%%capture output

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

pipe = pipeline("ner", model="dslim/bert-base-NER", tokenizer=tokenizer,device=0)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
%%capture cap --no-stderr
from tqdm import tqdm
preds = []
for i in tqdm(data.values):
  pred = pipe(i[0])
  preds.append(pred)

100%|████████████████████████████████████████████████████████████████████████████| 47959/47959 [13:26<00:00, 59.45it/s]


Процент предсказаний, где верно предсказано количество именованных сущностей

In [9]:
k=0
for v1,v2 in zip(data.values, preds):
    true = list(filter(lambda x: x != 'O', v1[1]))
    if len(true)==len(v2):
        k+=1
"{:.2%}".format(k/len(data.values))

'36.07%'

Проведем дообучение модели трансформера

In [10]:
label2id = {}
dict([ (elem, 0) for elem in entity_codes ])
k=0
for i in entity_codes:
    label2id[i]=k
    k+=1
id2label = {y: x for x, y in label2id.items()}
print(label2id,"\n",id2label)

{'I-NAT': 0, 'B-TIM': 1, 'B-ART': 2, 'I-EVE': 3, 'B-GEO': 4, 'I-PER': 5, 'B-EVE': 6, 'B-ORG': 7, 'I-GEO': 8, 'B-NAT': 9, 'I-TIM': 10, 'I-GPE': 11, 'B-GPE': 12, 'I-ART': 13, 'I-ORG': 14, 'B-PER': 15, 'O': 16} 
 {0: 'I-NAT', 1: 'B-TIM', 2: 'B-ART', 3: 'I-EVE', 4: 'B-GEO', 5: 'I-PER', 6: 'B-EVE', 7: 'B-ORG', 8: 'I-GEO', 9: 'B-NAT', 10: 'I-TIM', 11: 'I-GPE', 12: 'B-GPE', 13: 'I-ART', 14: 'I-ORG', 15: 'B-PER', 16: 'O'}


In [11]:
%%capture output
k=0
data['labels'] = data['ner_tags']
for i in range(len(data)):
    data['tokens'][i] = data['tokens'][i].split()
    data['labels'][i] = [label2id[x] for x in data['labels'][i]]
    if len(data['ner_tags'][i]) != len(data['tokens'][i]):
        data.drop([i],inplace=True)
        k+=1
data['labels'] = data['labels'].apply(lambda x: [int(i) for i in x])

In [12]:
data

Unnamed: 0,tokens,ner_tags,labels
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO...","[16, 16, 16, 16, 16, 16, 4, 16, 16, 16, 16, 16..."
1,"[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
2,"[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4..."
3,"[Police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
4,"[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4..."
...,...,...,...
47954,"[Indian, border, security, forces, are, accusi...","[B-GPE, O, O, O, O, O, O, B-GPE, O, O, O, O, O...","[12, 16, 16, 16, 16, 16, 16, 12, 16, 16, 16, 1..."
47955,"[Indian, officials, said, no, one, was, injure...","[B-GPE, O, O, O, O, O, O, O, B-TIM, O, O, O, O...","[12, 16, 16, 16, 16, 16, 16, 16, 1, 16, 16, 16..."
47956,"[Two, more, landed, in, fields, belonging, to,...","[O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
47957,"[They, say, not, all, of, the, rockets, explod...","[O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"


In [14]:
print("Количество некорректных значений: ",k)

Количество некорректных значений:  4


Делим данные для проверки и обучения

In [24]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(data, test_size=0.3)
data_val, data_test = train_test_split(data_test, test_size=0.5)

In [25]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(data_train).remove_columns(["__index_level_0__"])
ds['validation'] = Dataset.from_pandas(data_val).remove_columns(["__index_level_0__"])

ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'labels'],
        num_rows: 33568
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'labels'],
        num_rows: 7193
    })
})

In [26]:
ds['train'].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [27]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"labels"]):
        labels.append(label)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
tokenized_ds

Map:   0%|          | 0/33568 [00:00<?, ? examples/s]

Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 33568
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7193
    })
})

In [28]:
tokenized_ds['train'].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", num_labels=17, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768])

Дообучаем модель

In [21]:
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"].remove_columns(["ner_tags",'tokens']),
    eval_dataset=tokenized_ds["validation"].remove_columns(["ner_tags",'tokens']),
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1898,0.150947
2,0.1321,0.102621
3,0.0819,0.073359


TrainOutput(global_step=6294, training_loss=0.1598629974205228, metrics={'train_runtime': 3215.9139, 'train_samples_per_second': 31.314, 'train_steps_per_second': 1.957, 'total_flos': 2350625902925664.0, 'train_loss': 0.1598629974205228, 'epoch': 3.0})

In [22]:
model_name_or_path = "model/checkpoint-6294"
pipe = pipeline("ner", model=model_name_or_path,tokenizer=tokenizer,device=0)

In [29]:
preds = []
for i in tqdm(data_test.values):
  pred = pipe(' '.join(i[0]))
  preds.append(pred)

100%|██████████████████████████████████████████████████████████████████████████████| 7194/7194 [05:30<00:00, 21.79it/s]


Процент верных ответов

In [30]:
k=0
for v1,v2 in zip(data_test.values, preds):
    true = list(filter(lambda x: x != 'O', v1[1]))
    if true==[x.get("entity") for x in v2]:
        k+=1
"{:.2%}".format(k/len(data.values))

'8.17%'

Обучим модель трансформер distilbert/distilbert-base-uncased

In [31]:
new_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
new_data_collator = DataCollatorForTokenClassification(tokenizer=new_tokenizer)
new_model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=17, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = new_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"labels"]):
        labels.append(label)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
new_tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
new_tokenized_ds

Map:   0%|          | 0/33568 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/7193 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 33568
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7193
    })
})

In [33]:
new_training_args = TrainingArguments(
    output_dir="new_model",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

new_trainer = Trainer(
    model=new_model,
    args=new_training_args,
    train_dataset=new_tokenized_ds["train"].remove_columns(["ner_tags",'tokens']),
    eval_dataset=new_tokenized_ds["validation"].remove_columns(["ner_tags",'tokens']),
    tokenizer=new_tokenizer,
    data_collator=new_data_collator
)

new_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2007,0.169679
2,0.1381,0.137951
3,0.0907,0.125691


TrainOutput(global_step=6294, training_loss=0.16770618561149817, metrics={'train_runtime': 938.3223, 'train_samples_per_second': 107.323, 'train_steps_per_second': 6.708, 'total_flos': 1145799483401952.0, 'train_loss': 0.16770618561149817, 'epoch': 3.0})

In [35]:
model_name_or_path = "new_model/checkpoint-6294"
pipe = pipeline("ner", model=model_name_or_path,tokenizer=new_tokenizer,device=0)
preds = []
for i in tqdm(data_test.values):
  pred = pipe(' '.join(i[0]))
  preds.append(pred)

100%|█████████████████████████████████████████████████████████████████████████████| 7194/7194 [01:04<00:00, 110.69it/s]


Процент верных ответов

In [36]:
k=0
for v1,v2 in zip(data_test.values, preds):
    true = list(filter(lambda x: x != 'O', v1[1]))
    if true==[x.get("entity") for x in v2]:
        k+=1
"{:.2%}".format(k/len(data_test.values))

'46.34%'