# spaCy Basics

In [2]:
import spacy

In [14]:
# Load the small English pipeline and bind it to the conventional name `nlp`.
# The 'en_core_web_sm' package has to be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')  # small version of the English core library

In [15]:
# Create a Doc object by running the pipeline on raw text.
# A plain str is all that is needed: in Python 3 every str is already Unicode,
# so the legacy u'' prefix adds nothing.
# NOTE: "looing" (sic) is kept as-is so the recorded outputs below stay valid.
doc = nlp('Tesla is looing at buying U.S. startup for $6 million')

In [16]:
# Iterating over a Doc yields its tokens; show each token's text on its own line.
print("\n".join(token.text for token in doc))

Tesla
is
looing
at
buying
U.S.
startup
for
$
6
million


In [17]:
# .pos (no trailing underscore) is the part-of-speech as an integer ID
for token in doc:
    print(f"{token.text} {token.pos}")

Tesla 96
is 87
looing 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93


In [18]:
# .pos_ (trailing underscore) is the same part-of-speech as a readable label
for token in doc:
    print(f"{token.text} {token.pos_}")

Tesla PROPN
is AUX
looing VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


In [19]:
# .pos_ -> part-of-speech label
# .dep_ -> syntactic dependency label
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
looing VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


## Pipeline

In [20]:
# List of (name, component) tuples for the loaded pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x163b091f0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x163b08ad0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x163686650>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x16340d7d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x163a55090>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x163686730>)]

In [21]:
# Just the component names, as a list of strings
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Tokenization

In [23]:
# Second example: a sentence containing a contraction ("isn't")
doc2 = nlp("Tesla isn't looking into startup anymore.")

In [24]:
# .pos_ -> part-of-speech label
# .dep_ -> syntactic dependency label
for token in doc2:
    print(f"{token.text} {token.pos_} {token.dep_}")

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startup NOUN pobj
anymore ADV advmod
. PUNCT punct


In [25]:
# A Doc can be indexed by position; index 0 is its first token
doc2[0]

Tesla

In [26]:
# Part-of-speech label of the first token
doc2[0].pos_

'PROPN'

## Dependencies

In [31]:
# Syntactic dependency label of the first token
doc2[0].dep_

'nsubj'

In [35]:
# spacy.explain turns a label into a short human-readable description
spacy.explain('PROPN')

'proper noun'

In [36]:
# The same lookup works for dependency labels
spacy.explain('nsubj')

'nominal subject'

## Additional Token Attributes
For now we just want to illustrate some of the other information that spaCy assigns to tokens:

|Attribute|Description|Value for `doc2[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist only of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [37]:
# Lemma (base form) of the token at index 4, "into"
doc2[4].lemma_

'into'

In [39]:
# The contraction "isn't" was split into two tokens; index 2 is the "n't" part
doc2[2].text

"n't"

In [47]:
# Simple part-of-speech: the "n't" token is tagged as a particle (PART)

doc2[2].pos_

'PART'

In [48]:
# Detailed tag of the token at index 4, followed by its description
print(f"{doc2[4].tag_} / {spacy.explain(doc2[4].tag_)}")

IN / conjunction, subordinating or preposition


In [49]:
# Word shapes: letters are abstracted to X/x while punctuation is kept.
# The second line deliberately reads from `doc` (the first example), where
# index 5 is the token "U.S.".
print(f"{doc2[0].text}: {doc2[0].shape_}")
print(f"{doc[5].text} : {doc[5].shape_}")

Tesla: Xxxxx
U.S. : X.X.


In [50]:
# Boolean flags of the first token: alphabetic? stop word?
print(doc2[0].is_alpha, doc2[0].is_stop, sep="\n")

True
False


## Spans
Large Doc objects can be hard to work with at times. A **span** is a slice of a Doc object in the form `Doc[start:stop]`.

In [51]:
# A longer example text. It is assembled from adjacent string literals, so no
# backslash line-continuations are needed inside the string itself.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [52]:
# Tokens 16..29 are the quoted sentence, including both quotation marks
# (the stop index 30 is exclusive, as with any Python slice).
life_quote = doc3[16:30]


In [53]:
# Printing a span shows the text it covers
print(life_quote)

"Life is what happens to us while we are making other plans"


In [54]:
# Slicing a Doc gives a Span ...
type(life_quote)

spacy.tokens.span.Span

In [55]:
# ... whereas the object returned by nlp() is a Doc
type(doc3)

spacy.tokens.doc.Doc

In [56]:
# Three short sentences, used to demonstrate sentence segmentation
doc4 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [57]:
# doc4.sents iterates over the sentences of the Doc; print one per line
print(*doc4.sents, sep="\n")

This is the first sentence.
This is another sentence.
This is the last sentence.


In [60]:
# Token 6 is the "This" that opens the second sentence
doc4[6]

This

In [61]:
# is_sent_start tells whether a token begins a sentence
doc4[6].is_sent_start

True

In [63]:
# Token 8 sits in the middle of the second sentence
doc4[8]

another

In [64]:
# A mid-sentence token does not start a sentence
doc4[8].is_sent_start

False