In [1]:
import spacy

In [2]:

nlp = spacy.load('en_core_web_sm')
introduction_text = ('This tutorial is about Natural'
    ' Language Processing in Spacy.')
introduction_doc = nlp(introduction_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_doc])
file_name = 'introduction.txt'
introduction_file_text = open(file_name).read()
introduction_file_doc = nlp(introduction_file_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_file_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


FileNotFoundError: [Errno 2] No such file or directory: 'introduction.txt'

In [3]:
about_text = ('Gus Proto is a Python developer currently'
              ' working for a London-based Fintech'
              ' company. He is interested in learning'
              ' Natural Language Processing.')
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

for sentence in sentences:
    print (sentence)

Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [4]:
def set_custom_boundaries(doc):
    # Adds support to use `...` as the delimiter for sentence detection
    for token in doc[:-1]:
        if token.text == '...':
            doc[token.i+1].is_sent_start = True
    return doc

ellipsis_text = ('Gus, can you, ... never mind, I forgot'
                 ' what I was saying. So, do you think'
                 ' we should ...')
# Load a new model instance
custom_nlp = spacy.load('en_core_web_sm')
custom_nlp.add_pipe(set_custom_boundaries, before='parser')
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)




# Sentence Detection with no customization
ellipsis_doc = nlp(ellipsis_text)
ellipsis_sentences = list(ellipsis_doc.sents)
for sentence in ellipsis_sentences:
    print(sentence)


Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...
Gus, can you, ... never mind, I forgot what I was saying.
So, do you think we should ...


In [5]:
for token in about_doc:
    print (token, token.idx)


Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [6]:
for token in about_doc:
    print (token, token.idx, token.text_with_ws,
           token.is_alpha, token.is_punct, token.is_space,
           token.shape_, token.is_stop)

Gus 0 Gus  True False False Xxx False
Proto 4 Proto  True False False Xxxxx False
is 10 is  True False False xx True
a 13 a  True False False x True
Python 15 Python  True False False Xxxxx False
developer 22 developer  True False False xxxx False
currently 32 currently  True False False xxxx False
working 42 working  True False False xxxx False
for 50 for  True False False xxx True
a 54 a  True False False x True
London 56 London True False False Xxxxx False
- 62 - False True False - False
based 63 based  True False False xxxx False
Fintech 69 Fintech  True False False Xxxxx False
company 77 company True False False xxxx False
. 84 .  False True False . False
He 86 He  True False False Xx True
is 89 is  True False False xx True
interested 92 interested  True False False xxxx False
in 103 in  True False False xx True
learning 106 learning  True False False xxxx False
Natural 115 Natural  True False False Xxxxx False
Language 123 Language  True False False Xxxxx False
Processing 132 Pro

In [7]:
import re
import spacy
from spacy.tokenizer import Tokenizer
custom_nlp = spacy.load('en_core_web_sm')
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
infix_re = re.compile(r'''[-~]''')
def customize_tokenizer(nlp):
    # Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None
                     )


custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)
print([token.text for token in custom_tokenizer_about_doc])

['Gus', 'Proto', 'is', 'a', 'Python', 'developer', 'currently', 'working', 'for', 'a', 'London', '-', 'based', 'Fintech', 'company', '.', 'He', 'is', 'interested', 'in', 'learning', 'Natural', 'Language', 'Processing', '.']


In [8]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

also
bottom
sometimes
while
’m
anyway
here
some
can
full


In [9]:
for token in about_doc:
     if not token.is_stop:
         print (token)

Gus
Proto
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


In [10]:
conference_help_text = ('Gus is helping organize a developer'
    'conference on Applications of Natural Language'
    ' Processing. He keeps organizing local Python meetups'
    ' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    print (token, token.lemma_)


Gus Gus
is be
helping help
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He -PRON-
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his -PRON-
workplace workplace
. .


In [12]:
from collections import Counter
complete_text = ('Gus Proto is a Python developer currently'
    'working for a London-based Fintech company. He is'
    ' interested in learning Natural Language Processing.'
    ' There is a developer conference happening on 21 July'
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number '
    ' available at +1-1234567891. Gus is helping organize it.'
    ' He keeps organizing local Python meetups and several'
    ' internal talks at his workplace. Gus is also presenting'
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    ' Apart from his work, he is very passionate about music.'
    ' Gus is learning to play the Piano. He has enrolled '
    ' himself in the weekend batch of Great Piano Academy.'
    ' Great Piano Academy is situated in Mayfair or the City'
    ' of London and has world-class piano instructors.')

complete_doc = nlp(complete_text)
# Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
         if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)

# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]
['Proto', 'currentlyworking', 'based', 'company', 'interested', 'conference', 'happening', '21', 'July', '2019', 'titled', 'Applications', 'helpline', 'number', 'available', '+1', '1234567891', 'helping', 'organize', 'keeps', 'organizing', 'local', 'meetups', 'internal', 'talks', 'workplace', 'presenting', 'introduce', 'reader', 'Use', 'cases', 'Apart', 'work', 'passionate', 'music', 'play', 'enrolled', 'weekend', 'batch', 'situated', 'Mayfair', 'City', 'world', 'class', 'piano', 'instructors']
