In [86]:
# Goal: obtain the prediction probability (confidence score) of each entity found in an NER task

In [79]:
from transformers import pipeline
from transformers import AutoTokenizer

# Model checkpoint taken from the Hugging Face course
# (a BERT model fine-tuned for NER, judging by its name).
model_checkpoint = "huggingface-course/bert-finetuned-ner"

# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Token-classification pipeline. With aggregation_strategy="simple" the results
# come back as grouped entities ('entity_group', 'score', 'word', 'start', 'end'),
# as the recorded output of this cell shows.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, tokenizer=tokenizer, aggregation_strategy="simple"
)
# Smoke test on a sample sentence.
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.9988506,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9647625,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9986118,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [80]:
def min_score(input, default=None):
    """Return the lowest ``'score'`` among a list of pipeline predictions.

    Args:
        input: Iterable of dicts that each carry a ``'score'`` key, e.g. the
            list returned by ``token_classifier``. (The name shadows the
            built-in ``input``; it is kept so keyword callers keep working.)
        default: Value returned when ``input`` is empty, for example when no
            entity was detected in a sentence. Defaults to ``None``.

    Returns:
        The smallest ``'score'`` value, or ``default`` if ``input`` is empty.

    Raises:
        KeyError: If an item has no ``'score'`` key.
    """
    # ``default=`` keeps min() from raising ValueError on an empty sequence.
    return min((item['score'] for item in input), default=default)

In [81]:
# Lowest entity confidence for an ad-hoc sentence.
min_score(token_classifier("I am Dekai from New England"))

0.99894774

### Test SeqEval package

In [82]:
import evaluate

# Load the seqeval metric through the `evaluate` library.
metric = evaluate.load("seqeval")

In [83]:
# Toy example: reference tags vs. predicted tags for one nine-token sentence.
# The prediction misses the first B-MISC entity (index 2) and is otherwise correct.
y_true = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
y_pred = ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
# Each tag sequence is wrapped in a list, i.e. a batch holding a single sentence is passed.
metric.compute(predictions=[y_pred], references=[y_true])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

### Test datasets

In [84]:
from datasets import load_dataset

# Load the CoNLL-2003 dataset through the `datasets` library and show its splits.
raw_datasets = load_dataset("conll2003")
raw_datasets

Found cached dataset conll2003 (C:/Users/dekai/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 511.98it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [68]:
# Shortcut to the training split.
train = raw_datasets['train']
# Full BIO label names, kept for reference:
# ner_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
# Active mapping: the list index is the integer tag id and the B-/I- prefixes are
# dropped, so both ids of an entity type map to the same name.
# NOTE(review): presumably this order matches train.features['ner_tags'] — verify.
ner_tags = ['O', 'PER', 'PER', 'ORG', 'ORG', 'LOC', 'LOC', 'MISC', 'MISC']

In [91]:
# Build {index: sentence} for the first few CoNLL-2003 training sentences.
NUM_SAMPLE_SENTENCES = 10

# Slice the rows first: train[:n] reads only the requested sentences, whereas
# train['tokens'][x] indexes the whole 'tokens' column on every iteration.
train_dict = {
    idx: ' '.join(tokens)
    for idx, tokens in enumerate(train[:NUM_SAMPLE_SENTENCES]['tokens'])
}

In [99]:
# Report the lowest entity confidence of every stored sentence as "<index>:<score>".
for idx in range(len(train_dict)):
    lowest = min_score(token_classifier(train_dict[idx]))
    # print() applies str() to each argument, matching the explicit str() concatenation.
    print(idx, lowest, sep=":")

0:0.99900407
1:0.99960923
2:0.99928546
3:0.99849707
4:0.9986235
5:0.99832624
6:0.99901825
7:0.9946227
8:0.99853235
9:0.99893945


### Test slicing go_emotions dataset

In [134]:
# Load GoEmotions without naming a config; according to the log printed by this
# cell, that falls back to the "simplified" configuration.
go_emotions = load_dataset("go_emotions")

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (C:/Users/dekai/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)
100%|██████████| 3/3 [00:00<00:00, 999.68it/s]


In [129]:
# Show the available splits with their columns and row counts.
go_emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [145]:
from datasets import DatasetDict

# Number of leading examples kept from each split.
MINI_SPLIT_SIZES = {'train': 1000, 'validation': 100, 'test': 100}

# Create the container in this cell: assigning into go_emotions_mini[...] without
# defining it first raises NameError on a fresh kernel (Restart & Run All).
go_emotions_mini = DatasetDict({
    split: go_emotions[split].select(range(size))
    for split, size in MINI_SPLIT_SIZES.items()
})

In [147]:
# Peek at the first example of the reduced training split.
go_emotions_mini['train'][0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [148]:
# Peek at the first example of the reduced test split.
go_emotions_mini['test'][0]

{'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!',
 'labels': [25],
 'id': 'eecwqtt'}

In [55]:
def id_to_tags(id_list, tag_names=None):
    """Translate integer NER tag ids into their tag names.

    Args:
        id_list: Iterable of integer tag ids.
        tag_names: Optional sequence in which position ``i`` holds the name of
            tag id ``i``. When omitted, the notebook-level ``ner_tags`` list is
            used, which is the original behaviour.

    Returns:
        A list with one tag name per id, in the same order as ``id_list``.

    Raises:
        IndexError: If an id lies outside the range of the tag list.
    """
    if tag_names is None:
        # Resolved at call time so a redefined ``ner_tags`` is picked up.
        tag_names = ner_tags
    return [tag_names[tag_id] for tag_id in id_list]


In [69]:
# Map the tag ids of the first training sentence to tag names.
# Index the row first: ['ner_tags'][0] would pull the whole column just to
# read a single sentence.
id_to_tags(raw_datasets['train'][0]['ner_tags'])

['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']

In [67]:
# Show the tokens of training sentence 3 next to their tag names.
# Row-first indexing (train[3][...]) reads one sentence instead of a whole column.
# NOTE(review): the recorded output of this cell shows B-/I- prefixed tags, which
# the prefix-free ner_tags list cannot produce; it looks like it was saved
# while the BIO-prefixed list was active. Re-run to refresh it.
print(train[3]['tokens'])
print(id_to_tags(train[3]['ner_tags']))

['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [74]:
# Rebuild the plain-text sentence from its tokens.
# Row-first indexing avoids reading the full 'tokens' column.
' '.join(train[3]['tokens'])

'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .'

In [100]:
# Inspect the entities predicted for training sentence 7.
# Row-first indexing avoids reading the full 'tokens' column.
token_classifier(' '.join(train[7]['tokens']))

[{'entity_group': 'ORG',
  'score': 0.9946227,
  'word': 'EU',
  'start': 33,
  'end': 35},
 {'entity_group': 'PER',
  'score': 0.99949455,
  'word': 'Franz Fischler',
  'start': 54,
  'end': 68}]