In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [2]:
# A named entity is a “real-world object” that’s assigned a name – 
# for example, a person, a country, a product or a book title. 
# spaCy can recognizevarious types of named entities in a document, by asking the model for a prediction. 
# Because models are statistical and strongly depend on the examples they were trained on, 
# this doesn’t always work perfectly and might need some tuning later, depending on your use case.

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [8]:
## python -m spacy download en_core_web_lg  很大。。。

import spacy

nlp = spacy.load('en_core_web_lg')
tokens = nlp(u'dog cat banana afskfsd')

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [9]:
# 根据word vector计算similarity
tokens = nlp(u'dog cat banana')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327643
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154364
banana dog 0.24327643
banana cat 0.28154364
banana banana 1.0


## pipelines

text -> tokenizer -> tagger -> parser -> ner -> ... -> Doc

tokenizer 把text分割成token

tagger assign part-of-speech tags (POS tagging / POST)

parser   assign dependency labels

ner      detect and label named entities

textcat  文本分类

In [None]:
from spacy import displacy

doc = nlp(u"This is a sentence.")
displacy.serve(doc, style="dep")


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [25/Apr/2019 12:21:49] "GET / HTTP/1.1" 200 3057
127.0.0.1 - - [25/Apr/2019 12:21:49] "GET /favicon.ico HTTP/1.1" 200 3057
