In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
introdu_doc = nlp("""N early ten years had passed since the Dursleys had woken up to find their
nephew on the front step, but Privet Drive had hardly changed at all. The sun
rose on the same tidy front gardens and lit up the brass number four on the
Dursleys’ front door; it crept into their living room, which was almost exactly
the same as it had been on the night when Mr. Dursley had seen that fateful news
report about the owls. Only the photographs on the mantelpiece really showed
how much time had passed. Ten years ago, there had been lots of pictures of
what looked like a large pink beach ball wearing different-colored bonnets —
but Dudley Dursley was no longer a baby, and now the photographs showed a
large blond boy riding his first bicycle, on a carousel at the fair, playing a
computer game with his father, being hugged and kissed by his mother. The
room held no sign at all that another boy lived in the house, too.
Yet Harry Potter was still there, asleep at the moment, but not for long.
His Aunt Petunia was awake and it was her shrill voice that made the first noise
of the day.""")

In [4]:
[token.text for token in introdu_doc][:10]

['N',
 'early',
 'ten',
 'years',
 'had',
 'passed',
 'since',
 'the',
 'Dursleys',
 'had']

In [5]:
import pathlib
docs = nlp(pathlib.Path("./example.txt").read_text(encoding="utf-8"))
print([token.text for token in docs][:10])

['THE', 'VANISHING', 'GLASS', '\n', 'N', 'early', 'ten', 'years', 'had', 'passed']


Sentence DEtection

In [6]:
sentences  = list(introdu_doc.sents)
sentences[:10]

[N early ten years had passed since the Dursleys had woken up to find their
 nephew on the front step, but Privet Drive had hardly changed at all.,
 The sun
 rose on the same tidy front gardens and lit up the brass number four on the
 Dursleys’ front door; it crept into their living room, which was almost exactly
 the same as it had been on the night when Mr. Dursley had seen that fateful news
 report about the owls.,
 Only the photographs on the mantelpiece really showed
 how much time had passed.,
 Ten years ago, there had been lots of pictures of
 what looked like a large pink beach ball wearing different-colored bonnets —
 but Dudley Dursley was no longer a baby, and now the photographs showed a
 large blond boy riding his first bicycle, on a carousel at the fair, playing a
 computer game with his father, being hugged and kissed by his mother.,
 The
 room held no sign at all that another boy lived in the house, too.,
 Yet Harry Potter was still there, asleep at the moment, but not 

Creating a custom seperator

In [7]:
from spacy.language import Language

ellipsis_text =  """Gus, can you, ... never mind, I forgot
    what I was saying. So, do you think
    we should...
"""

@Language.component("custm_sep")
def custm(doc):
    tokens = list(doc)
    for token in tokens[::-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

custm_nlp = spacy.load("en_core_web_sm")
custm_nlp.add_pipe("custm_sep", before="parser")

custm_nlp_doc = custm_nlp(ellipsis_text)


custm_nlp_sents = list(custm_nlp_doc.sents)

print(custm_nlp_sents)



[Gus, can you, ..., never mind, I forgot
    what I was saying., So, do you think
    we should..., 
]


Tokens

In [8]:
tokens = list(docs)
for token in tokens[:10]:
    print(token, token.idx)

THE 0
VANISHING 4
GLASS 14

 19
N 20
early 22
ten 28
years 32
had 38
passed 42


About tokens

In [9]:
for token in tokens:
    print(f"{token} => {token.text_with_ws} , {token.is_alpha}, {token.is_punct}, {token.is_stop}")

THE => THE  , True, False, True
VANISHING => VANISHING  , True, False, False
GLASS => GLASS , True, False, False

 => 
 , False, False, False
N => N  , True, False, False
early => early  , True, False, False
ten => ten  , True, False, True
years => years  , True, False, False
had => had  , True, False, True
passed => passed  , True, False, False
since => since  , True, False, True
the => the  , True, False, True
Dursleys => Dursleys  , True, False, False
had => had  , True, False, True
woken => woken  , True, False, False
up => up  , True, False, True
to => to  , True, False, True
find => find  , True, False, False
their => their , True, False, True

 => 
 , False, False, False
nephew => nephew  , True, False, False
on => on  , True, False, True
the => the  , True, False, True
front => front  , True, False, True
step => step , True, False, False
, => ,  , False, True, False
but => but  , True, False, True
Privet => Privet  , True, False, False
Drive => Drive  , True, False, False
had =

Stop Words

In [10]:
list(spacy.lang.en.STOP_WORDS)[:10]

['former', '‘s', 'am', 'per', 'his', 'everyone', 'always', 'back', 'i', 'very']

filtering tokens

In [11]:
[token.text for token in docs if not token.is_stop][:10]

['VANISHING',
 'GLASS',
 '\n',
 'N',
 'early',
 'years',
 'passed',
 'Dursleys',
 'woken',
 'find']

lemmatizaion

In [12]:
for token in docs:
    if str(token) != str(token.lemma_):
        print(token, token.lemma_)

THE the
VANISHING vanish
N n
years year
had have
passed pass
had have
woken wake
had have
changed change
The the
rose rise
gardens garden
lit light
’ '
crept creep
was be
had have
been be
had have
seen see
owls owl
Only only
photographs photograph
showed show
had have
passed pass
Ten ten
years year
had have
been be
lots lot
pictures picture
looked look
wearing wear
bonnets bonnet
was be
longer long
photographs photograph
showed show
riding ride
playing play
being be
hugged hug
kissed kiss
The the
held hold
lived live
Yet yet
was be
His his
was be
was be
made make
“ "
Up up
Get get
Now now
” "
woke wake
His his
rapped rap
“ "
Up up
” "
screeched screech
heard hear
her she
walking walk
frying fry
being be
He he
rolled roll
tried try
had have
been be
having have
It it
had have
been be
There there
had have
been be
flying fly
He he
had have
had have
His his
was be
“ "
Are be
” "
demanded demand
“ "
Nearly nearly
” "
said say
“ "
Well well
And and
” "
groaned groan
“ "
What what
did do
” "
s

Word Frequency

In [13]:
from collections import Counter

words = [token.text for token in docs if not token.is_stop and not token.is_punct  ]
frq = Counter(words).most_common(10)
frq

[('\n', 294),
 ('Harry', 79),
 ('Dudley', 54),
 ('said', 23),
 ('Aunt', 22),
 ('Petunia', 22),
 ('Vernon', 20),
 ('Uncle', 18),
 ('snake', 14),
 ('Dursleys', 13)]

Part-of-Speech 

In [14]:
[(token.text, token.tag_, spacy.explain(token.tag_)) for token in docs if not token.is_stop and not token.is_punct]

[('VANISHING', 'VBG', 'verb, gerund or present participle'),
 ('GLASS', 'NNP', 'noun, proper singular'),
 ('\n', '_SP', 'whitespace'),
 ('N', 'CD', 'cardinal number'),
 ('early', 'JJ', 'adjective (English), other noun-modifier (Chinese)'),
 ('years', 'NNS', 'noun, plural'),
 ('passed', 'VBN', 'verb, past participle'),
 ('Dursleys', 'NNPS', 'noun, proper plural'),
 ('woken', 'VBN', 'verb, past participle'),
 ('find', 'VB', 'verb, base form'),
 ('\n', '_SP', 'whitespace'),
 ('nephew', 'NN', 'noun, singular or mass'),
 ('step', 'NN', 'noun, singular or mass'),
 ('Privet', 'NNP', 'noun, proper singular'),
 ('Drive', 'NNP', 'noun, proper singular'),
 ('hardly', 'RB', 'adverb'),
 ('changed', 'VBN', 'verb, past participle'),
 ('sun', 'NN', 'noun, singular or mass'),
 ('\n', '_SP', 'whitespace'),
 ('rose', 'VBD', 'verb, past tense'),
 ('tidy', 'JJ', 'adjective (English), other noun-modifier (Chinese)'),
 ('gardens', 'NNS', 'noun, plural'),
 ('lit', 'VBD', 'verb, past tense'),
 ('brass', 'NN', 

Nouns and Adjective

In [15]:
nouns = [token for token in docs if token.pos_ == "NOUN" ]
adjectives = [token for token in docs if token.pos_ == "ADJ" ]
nouns[:10], adjectives[:10]

([years, nephew, step, sun, gardens, brass, number, door, living, room],
 [early, front, same, tidy, front, front, same, fateful, much, large])

Visualizing

In [27]:
from spacy import displacy

k = displacy.render(docs, options={"distance": 70})
k