In [2]:
import spacy

In [3]:
# Load the "en_core_web_sm" pipeline by name; every cell below reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [32]:
# Run the full pipeline on one sentence and keep the result in `doc`.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

# One row per recognised entity: its text, its label, and spaCy's
# human-readable description of that label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Tesla | ORG | Companies, agencies, institutions, etc.
Twitter | PRODUCT | Objects, vehicles, foods, etc. (not services)
$45 billion | MONEY | Monetary values, including unit


In [6]:
from spacy import displacy

# Visualise the entities found in `doc` using displaCy's entity ("ent") style.
displacy.render(doc, style="ent")

In [9]:
# `nlp` is already pre-trained, so these are the entity labels its 'ner' component supports:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [10]:
# Labels the 'tagger' component can assign.
nlp.pipe_labels['tagger']

['$',
 "''",
 ',',
 '-LRB-',
 '-RRB-',
 '.',
 ':',
 'ADD',
 'AFX',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'HYPH',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NFP',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 'XX',
 '_SP',
 '``']

In [11]:
# Labels the 'parser' component can assign.
nlp.pipe_labels['parser']

['ROOT',
 'acl',
 'acomp',
 'advcl',
 'advmod',
 'agent',
 'amod',
 'appos',
 'attr',
 'aux',
 'auxpass',
 'case',
 'cc',
 'ccomp',
 'compound',
 'conj',
 'csubj',
 'csubjpass',
 'dative',
 'dep',
 'det',
 'dobj',
 'expl',
 'intj',
 'mark',
 'meta',
 'neg',
 'nmod',
 'npadvmod',
 'nsubj',
 'nsubjpass',
 'nummod',
 'oprd',
 'parataxis',
 'pcomp',
 'pobj',
 'poss',
 'preconj',
 'predet',
 'prep',
 'prt',
 'punct',
 'quantmod',
 'relcl',
 'xcomp']

In [28]:
# Second example: the same surface form ("Bloomberg") appears as a person and
# as a company, and the recorded output shows how the model labels each mention.
#
# The result is bound to its own name on purpose. The cells further down index
# `doc[0]`, `doc[2:5]` and tokens 0 and 5 of the Tesla sentence; rebinding `doc`
# here would make them operate on this text when the notebook is run top to
# bottom (Restart Kernel -> Run All).
doc_bloomberg = nlp("Michael Bloomberg founded Bloomberg L.P in 1982. Now Bloomberg company is worth 100 billion.")

# Print each entity with its character offsets into the original string.
for ent in doc_bloomberg.ents:
    print(ent.text, "|", ent.start_char, "|", ent.end_char, "|", ent.label_)

Michael Bloomberg | 0 | 17 | PERSON
Bloomberg L.P | 26 | 39 | PERSON
1982 | 43 | 47 | DATE
Bloomberg | 53 | 62 | PERSON
100 billion | 80 | 91 | MONEY


In [33]:
# Integer indexing on `doc` gives a single token, slicing gives a span.
# Show each piece followed by its Python type.
for piece in (doc[0], doc[2:5]):
    print(piece)
    print(type(piece))


Tesla
<class 'spacy.tokens.token.Token'>
going to acquire
<class 'spacy.tokens.span.Span'>


In [34]:
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.language import Language
from spacy.util import filter_spans
from spacy.tokens import Doc
from spacy.tokens import Token
from spacy.tokens import DocBin

In [35]:
# Hand-label the single-token spans starting at tokens 0 and 5 as ORG.
s1, s2 = (Span(doc, start, start + 1, label="ORG") for start in (0, 5))

# default="unmodified": tokens not covered by the new spans keep whatever
# entity annotation they already had.
doc.set_ents([s1, s2], default="unmodified")

In [36]:
# List the entities currently set on `doc`, one "text | label" row each.
for ent in doc.ents:
    print(f"{ent.text} | {ent.label_}")

Tesla | ORG
Twitter | ORG
$45 billion | MONEY
