# <font color=blue>Basic Attributes</font>

In [3]:
import spacy

# Load the English pipeline by its full package name. The 'en' shortcut link
# only exists in spaCy v2 and was removed in v3.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm — confirm.
nlp = spacy.load("en_core_web_sm")
doc = nlp("I am learning spaCy.")

In [6]:
# The document itself, a single indexed element, and a slice each have their own type.
tuple(type(part) for part in (doc, doc[0], doc[1:3]))

(spacy.tokens.doc.Doc, spacy.tokens.token.Token, spacy.tokens.span.Span)

## Token attributes
* _token_.i
* _token_.text
* _token_.is_alpha (same as token.text.isalpha())
* _token_.is_punct
* _token_.is_stop
* _token_.like_num

## Part-of-speech (_token_.pos_)

* ADJ: adjective
* ADP: adposition (preposition or postposition)
* ADV: adverb
* AUX: auxiliary
* CONJ: conjunction
    * CCONJ: coordinating conjunction
    * SCONJ: subordinating conjunction - ex) if, while
* DET: determiner
* INTJ: interjection
* NOUN: noun
* NUM: numeral
* PART: particle
* PRON: pronoun
* PROPN: proper noun
* PUNCT: punctuation
* SYM: symbol
* VERB: verb
* X: other

## Dependency (_token_.dep_, _token_.head, _token_.children, _token_.ancestors)

* acomp: adjectival complement
* amod: adjectival modifier
* aux: auxiliary 
* compound
* conj: conjunct
* det: determiner 
* dobj: direct object 
* nsubj: nominal subject 
* pobj: object of a preposition

## Named Entities (_doc_.ents)

In [22]:
doc = nlp("Facebook, Inc. is an American social media conglomerate corporation based in Menlo Park, California.")
entities = doc.ents

# Show the type of one entity.
print(type(entities[0]))

# Pair every entity with its label string.
print([(entity, entity.label_) for entity in entities])

<class 'spacy.tokens.span.Span'>
[(Facebook, Inc., 'ORG'), (American, 'NORP'), (Menlo Park, 'GPE'), (California, 'GPE')]


# <font color=blue>spaCy functions</font>

## spacy.explain()

In [23]:
# Look up a short human-readable description of a label such as an entity type.
spacy.explain('NORP')

'Nationalities or religious or political groups'

## spacy.matcher.Matcher()

In [45]:
from spacy.matcher import Matcher

# Build the matcher on the vocabulary of the `nlp` pipeline loaded at the top
# of the notebook; loading the model a second time is unnecessary.
matcher = Matcher(nlp.vocab)

# "best", then optionally one alphabetic token, then "phones" (case-insensitive).
pattern = [{"LOWER": "best"}, {"IS_ALPHA": True, "OP": "?"}, {"LOWER": "phones"}]

# Patterns are passed as a list of patterns. The older
# `add(key, None, pattern)` call form is rejected by spaCy v3.
matcher.add("BEST_PHONES", [pattern])

doc = nlp("Best phones are all here. These are the best Samsung phones.")

# Each match is a (match_id, start, end) triple; slice the doc to get the text.
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)

Best phones
best Samsung phones


See https://spacy.io/usage/rule-based-matching#matcher

Available token pattern keys:

* ORTH
* LOWER
* LENGTH
* IS_ALPHA, IS_ASCII, IS_DIGIT
* IS_LOWER, IS_UPPER, IS_TITLE
* IS_PUNCT, IS_SPACE, IS_STOP
* LIKE_NUM, LIKE_URL, LIKE_EMAIL
* POS, TAG, DEP, LEMMA, SHAPE
* ENT_TYPE

In [58]:
### Pattern examples:

# A token whose lemma is "like" or "love", followed by a noun.
# Matches "love cats" or "likes flowers"
pattern = [
    {"LEMMA": {"IN": ["like", "love"]}},
    {"POS": "NOUN"},
]

# A single token of at least ten characters.
# Matches tokens of length >= 10
pattern = [
    {"LENGTH": {">=": 10}},
]

# "facebook", a form of "be" (or "'s"), any number of adverbs, then an adjective.
# Matches "Facebook is annoying", "Facebook's very nice"
pattern = [
    {"LOWER": "facebook"},
    {"LEMMA": {"IN": ["be", "'s"]}},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]