In [3]:
import spacy

nlp = spacy.blank("en")

doc = nlp ("how do you do?")

for token in doc:
    print(token.text)


how
do
you
do
?


In [4]:
token = doc[1]
print (token.text)

do


In [5]:
span = doc [2:4]
print (span.text)

you do


In [7]:
doc = nlp("It costs $5.")
print("index: ",[token.i for token in doc])
print("text: ", [token.text for token in doc])

print ("is_alpha: ", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num: ", [token.like_num for token in doc])

index:  [0, 1, 2, 3, 4]
text:  ['It', 'costs', '$', '5', '.']
is_alpha:  [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num:  [False, False, False, True, False]


In [13]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("She goes to school every day")

for token in doc:
    print(token.text,token.pos_)

She PRON
goes VERB
to ADP
school NOUN
every DET
day NOUN


In [9]:
for token in doc:
    print (token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj goes
goes VERB ROOT goes
to ADP prep goes
school NOUN pobj to
every DET det day
day NOUN npadvmod goes


In [14]:
from spacy import displacy

doc = nlp("she goes to school")

displacy.render (doc, style="dep", jupyter=True)

In [17]:
text = """ 
Guido van Rossum began working on Python in the late 1980s, as a successor to the ABC programming language, and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000 and introduced new features, such as list comprehensions and a cycle-detecting garbage collection system (in addition to reference counting). Python 3.0 was released in 2008 and was a major revision of the language that is not completely backward-compatible. Python 2 was discontinued with version 2.7.18 in 2020.
"""
doc = nlp(text)

displacy.render(doc, style= "ent", jupyter=True)

In [19]:
doc = nlp("Apple is looking at buying U.K. Startup for $1 Billion")

for ent in doc.ents:
    print(ent.text, ent.label_)


Apple ORG
U.K. GPE
Startup PRODUCT
$1 Billion MONEY


In [20]:
displacy.render (doc,style="ent",jupyter=True)

In [21]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [23]:
spacy.explain("MONEY")

'Monetary values, including unit'

In [27]:
text = "it's official Apple is the first U.S. public company to reach a $1 trillion market value"

doc = nlp(text)

for token in doc:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_

    print(token_text, token_pos, token_dep)

it PRON nsubj
's AUX ROOT
official ADJ amod
Apple PROPN nsubj
is AUX ccomp
the DET det
first ADJ amod
U.S. PROPN nmod
public ADJ amod
company NOUN attr
to PART aux
reach VERB relcl
a DET det
$ SYM quantmod
1 NUM compound
trillion NUM nummod
market NOUN compound
value NOUN dobj


In [30]:
text = "Upcomming iPhone X release date leaked as Apple reveals pre-orders"

doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

iphone_x = doc[1:3]

print("Missing entity: ", iphone_x)

Apple ORG
Missing entity:  iPhone X


In [32]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

pattern = [{"TEXT": "iphone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

doc = nlp("Upcomming Iphone X release date leaked")

matches = matcher(doc)

print(matches)

[]


In [33]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print (matched_span)

In [42]:
pattern = [
    {"IS_DIGIT":True},
    {"LOWER": "fiva"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]
doc = nlp ("2018 FIVA World Cup: France won!")
matcher.add("ANIMAL_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

[(18232475512666784510, 0, 5)]
2018 FIVA World Cup:


In [36]:
pattern = [
    {"LEMMA": "love", "POS":"VERB"},
    {"POS":"NOUN"}
]

doc = nlp("I loved dogs but now I love cats more.")
matcher.add("ANIMAL_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)


loved dogs
love cats


In [38]:
pattern = [
    {"LEMMA":"buy"},
    {"POS":"DET", "OP": "?"},
    {"POS": "NOUN"}
]

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matcher.add("PHONE_PATTERN",[pattern])
matches = matcher(doc)
for match_id,start,end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

bought a smartphone
buying apps
