## spaCy Basics - Document, Tokens, Span

In [1]:
from spacy.lang.en import English


In [2]:
# Instantiate the English language class imported above. The resulting `nlp`
# object is called on raw text in the next cell to produce a document.
nlp = English()

In [3]:
# Sample text made up of several short sentences; used as the input document.
articles = 'The match was very good. Sachin did not play well at all. The match was pathetic. Virat had a poor run. The computer shut down automatically'
# Calling the `nlp` object on the text returns the processed document.
doc = nlp(articles)

In [4]:
# Slice the document by token index: tokens 2, 3 and 4 (the end index 5 is
# exclusive). The slice is stored as `span` and reused by the next cell.
span = doc[2:5]
# Print the text covered by the slice ("was very good" in the recorded output).
print(span.text)

was very good


In [5]:
# Report a handful of lexical attributes for every token in the span.
for token in span:
    # Pair each printed label with the attribute it describes, then emit
    # them in the same order as the recorded output.
    for label, value in (
        ("Token Index: ", token.i),
        ("Token Text: ", token.text),
        ("is Alphabetical: ", token.is_alpha),
        ("is Punctuation: ", token.is_punct),
        ("is Number: ", token.like_num),
    ):
        print(label, value)


Token Index:  2
Token Text:  was
is Alphabetical:  True
is Punctuation:  False
is Number:  False
Token Index:  3
Token Text:  very
is Alphabetical:  True
is Punctuation:  False
is Number:  False
Token Index:  4
Token Text:  good
is Alphabetical:  True
is Punctuation:  False
is Number:  False


## spaCy Statistical Models

In [13]:
import spacy
# Load the 'en_core_web_sm' model. Note that this rebinds `nlp`, replacing the
# English() object created earlier in the notebook.
nlp = spacy.load('en_core_web_sm')
sentence = 'Walmart acquired Flipkart for $25 billion in 2018. This was the biggest acuqistion deal in India'

# Run the loaded pipeline over the sentence.
document = nlp(sentence)

# Print each named entity's text and label.
# In the recorded output "Walmart" does not appear among the entities.
for ent in document.ents:
    print(ent.text,ent.label_)
    
# Print, for each token: its text, part-of-speech tag, dependency label,
# and the text of its syntactic head.
for token in document:
    print(token.text,token.pos_,token.dep_,token.head.text)

Flipkart ORG
$25 billion MONEY
2018 DATE
India GPE
Walmart PROPN nsubj acquired
acquired VERB ROOT acquired
Flipkart PROPN dobj acquired
for ADP prep acquired
$ SYM quantmod billion
25 NUM compound billion
billion NUM pobj for
in ADP prep billion
2018 NUM pobj in
. PUNCT punct acquired
This DET nsubj was
was AUX ROOT was
the DET det deal
biggest ADJ amod deal
acuqistion NOUN compound deal
deal NOUN attr was
in ADP prep deal
India PROPN pobj in


In [7]:
# Ask spaCy for a human-readable description of the 'GPE' entity label
# (the label assigned to "India" in the entity output above).
spacy.explain('GPE')

'Countries, cities, states'

In [12]:
# Ask spaCy for a human-readable description of the 'ADP' part-of-speech tag
# (the tag assigned to "for" and "in" in the token output above).
spacy.explain('ADP')

'adposition'

In [14]:
# Headline used to show a case where the model's entities miss a product name.
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
# (the recorded output lists "New iPhone" as EVENT and "Apple" as ORG;
# "iPhone X" is not among them)
for ent in doc.ents:
    # print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X" by slicing the doc manually:
# tokens 1 and 2 (the end index 3 is exclusive)
iphone_x = doc[1:3]

# Print the span text
print('Missing entity:', iphone_x.text)

New iPhone EVENT
Apple ORG
Missing entity: iPhone X


In [15]:
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text here so this cell uses its own `text` instead of relying
# on whatever `doc` was left in the kernel by an earlier cell
doc = nlp(text)

# Import the Matcher and initialize it with the shared vocabulary
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

# Add the pattern to the matcher.
# Patterns are passed as a list in the second argument: this form is accepted
# by spaCy 2.2.2+ and is the only one accepted by spaCy 3.x (the older
# `add(name, None, pattern)` call raises an error there).
matcher.add('IPHONE_X_PATTERN', [pattern])

# Use the matcher on the doc; each match is a (match_id, start, end) tuple
# whose start/end token indices are used to slice the doc
matches = matcher(doc)
print('Matches:', [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [16]:
# Text mentioning several iOS versions, plus one bare "iOS" with no number.
doc = nlp("After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.")

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact token "iOS" followed by a token made up of digits
pattern = [{'TEXT': 'iOS'}, {'IS_DIGIT': True}]

# Add the pattern to the matcher and apply the matcher to the doc.
# `matcher` is the instance created in an earlier cell, so it still holds the
# patterns registered there as well.
# Patterns are passed as a list in the second argument: this form is accepted
# by spaCy 2.2.2+ and is the only one accepted by spaCy 3.x.
matcher.add('IOS_VERSION_PATTERN', [pattern])
matches = matcher(doc)
print('Total matches found:', len(matches))

Total matches found: 3


In [17]:
# Informal forum-style text containing several forms of the verb "download".
doc = nlp("i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack it... do I also need to download Winzip?")

# Write a pattern that matches a form of "download" plus proper noun:
# a token whose lemma is "download" followed by a token tagged PROPN
pattern = [{'LEMMA': 'download'}, {'POS': 'PROPN'}]

# Add the pattern to the matcher and apply the matcher to the doc.
# `matcher` is the instance created in an earlier cell, so it still holds the
# patterns registered there as well.
# Patterns are passed as a list in the second argument: this form is accepted
# by spaCy 2.2.2+ and is the only one accepted by spaCy 3.x.
matcher.add('DOWNLOAD_THINGS_PATTERN', [pattern])
matches = matcher(doc)
print('Total matches found:', len(matches))

Total matches found: 3


In [18]:
# Look up the hash for the string "Fortnite" in the vocabulary's string store
# (the variable is named `cat_hash`, but the string looked up is 'Fortnite')
cat_hash = nlp.vocab.strings['Fortnite']
print(cat_hash)

# Look up the hash to get the original string back
# NOTE(review): presumably the reverse lookup succeeds only because "Fortnite"
# is already known to this vocab (it appears in the text processed in the
# previous cell) - confirm before running this cell on a fresh kernel
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

16174960241667155899
Fortnite
