### Machine Learning dan spaCy

In [1]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Names of the components that make up the loaded pipeline.
print(nlp.pipe_names)

# Labels of the 'ner' component: first how many there are, then the labels.
ner_lst = nlp.pipe_labels['ner']
print(len(ner_lst), ner_lst, sep='\n')

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
18
['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


In [8]:
import spacy
# Load the small English pipeline into `nlp`.
nlp = spacy.load('en_core_web_sm')

# Sample text to run named-entity recognition on.
article = """It’s that time of the year when Open Web builders 
head to the Web3 castle in Denver, Colorado. ETHDenver is upon us, 
and the NEAR community will be there with a full slate of events 
(February 15-20) focused on building a multi-chain future. 

Why is the NEAR community attending ETHDenver? Good question. 
The NEAR community believes in a collaborative, decentralized, 
multi-chain ecosystem of Open Web and Metaverse platforms. Projects 
like Aurora and Rainbow Bridge, for example, have helped build 
simple and secure bridges between NEAR and Ethererum, allowing 
users and developers to freely move assets between the two networks."""

# Process the text, then print every detected entity followed by its label.
doc = nlp(article)
for entity in doc.ents:
    print(entity.text, entity.label_)


# NER label reference: https://nlp.cs.nyu.edu/ene/

Phoenix GPE
Arecaceae PERSON
Africa LOC
the Middle East LOC
South Asia LOC
Phoenix GPE
12–19 CARDINAL


In [3]:
# Sumber: https://stackoverflow.com/questions/69181078/spacy-how-do-you-add-custom-ner-labels-to-a-pre-trained-model
# dengan modifikasi
#
import spacy
import random
from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language
from pathlib import Path
import os

nlp = spacy.load('en_core_web_sm')

def print_doc_entities(_doc: Doc):
    """Print the entities of a processed document, one per line.

    Each entity is written as its text followed by its label, indented by
    five spaces. When the document has no entities, a single indented
    ``NONE`` line is printed instead.

    Args:
        _doc: Object exposing an ``ents`` sequence whose items have ``text``
            and ``label_`` attributes.
    """
    entities = _doc.ents
    if not entities:
        print("     NONE")
        return
    for entity in entities:
        print(f"     {entity.text} {entity.label_}")

# Tiny demonstration training set in spaCy's (text, entity offsets) form.
# Offsets are character positions: text[25:30] == 'Festy'.
train_data = [
    ('We need to deliver it to Festy.', [(25, 30, 'DISTRICT')]),
    ('', [])
]

# The same sentence is evaluated before and after training so that the two
# printed results are directly comparable.
eval_text = 'I need a taxi to Festy.'

# Result before training
print("\nResult BEFORE training:")
print_doc_entities(nlp(eval_text))

print("Proses training ...")

# Seed the random number generators (including Python's `random`, which
# drives the shuffle below) so that repeated runs train identically.
util.fix_random_seed(0)

# Only the NER component is being trained, so every other active component is
# switched off for the duration of the block. The context manager re-enables
# them on exit, even if an update raises.
with nlp.select_pipes(enable='ner'):
    optimizer = nlp.create_optimizer()
    for _ in range(25):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            # Tokenize only (no pipeline components) and attach the gold
            # entity annotations to build a training example.
            example = Example.from_dict(
                nlp.make_doc(raw_text), {"entities": entity_offsets}
            )
            nlp.update([example], sgd=optimizer)

# Result after training
print("Result AFTER training:")
doc = nlp(eval_text)
print_doc_entities(doc)

# Save the updated pipeline to ./zimera.model under the working directory.
current_dir = os.getcwd()
output_dir = Path(current_dir) / 'zimera.model'
nlp.to_disk(output_dir)


Result BEFORE training:
     NONE
Proses training ...
Result AFTER training:
     Festy DISTRICT


Akan ada folder model bernama `zimera.model`:
```bash
yrich@dll:~/Documents/GitHub/zimera/novice/03-09/zimera.model$ ls -la
total 136
drwxrwxr-x 10 yrich yrich  4096 Feb 14 20:14 .
drwxrwxr-x  3 yrich yrich  4096 Feb 14 20:14 ..
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 attribute_ruler
-rw-rw-r--  1 yrich yrich  5400 Feb 14 20:14 config.cfg
drwxrwxr-x  3 yrich yrich  4096 Feb 14 20:14 lemmatizer
-rw-rw-r--  1 yrich yrich 10184 Feb 14 20:14 meta.json
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 ner
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 parser
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 senter
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 tagger
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 tok2vec
-rw-rw-r--  1 yrich yrich 77777 Feb 14 20:14 tokenizer
drwxrwxr-x  2 yrich yrich  4096 Feb 14 20:14 vocab
```

In [5]:
import spacy
from pathlib import Path
import os

# Locate the saved pipeline directory under the current working directory.
current_dir = os.getcwd()
model_dir = Path(current_dir, 'zimera.model/')

# Load the pipeline from that directory into `nlp`.
nlp = spacy.load(model_dir)

# Run it on a sample sentence and show (text, label) for every entity found.
doc = nlp(u'I need a taxi to Festy.')
found_entities = [(span.text, span.label_) for span in doc.ents]
print("Entities", found_entities)

Entities [('Festy', 'DISTRICT')]


In [9]:
from spacy.lang.id.examples import sentences

doc = nlp.pipe(sentences)