In [1]:
!pip install spacy

Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/90/f0/0133b684e18932c7bf4075d94819746cee2c0329f2569db526b0fa1df1df/spacy-3.7.2-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.7.2-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Obtaining dependency information for murmurhash<1.1.0,>=0.28.0 from https://files.pythonhosted.org/packages/71/46/af01a20ec368bd9cb49a1d2df15e3eca113bbf6952cc1f2a47f1c6801a7f/murmurhash-1.0.10-cp311

In [10]:
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
import spacy

In [15]:
# Loading the language model instance in spacy
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x1bbeb248cd0>

In [19]:
# Constructing a Doc Object with String as an input argument
introduction_doc = nlp(' This is a tutorial about NLP in spacy')
type(introduction_doc)

spacy.tokens.doc.Doc

In [18]:
'''
Doc object is a sequence of token objects,
- Each token object has an innformation about a particular piece '''
[token.text for token in introduction_doc] # On each token object, we called .text attribute to get the text contained within that object

[' ', 'This', 'is', 'a', 'tutorial', 'about', 'NLP', 'in', 'spacy']

In [21]:
import pathlib
file_name = 'NLP.txt'

In [22]:
# Reading from the file
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding='utf-8'))
print([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'spacy']


In [24]:
about_text = '''
Chem is an aspiring ML developer. He wants to master NLP and computer vision. He looks forward to work as a remote developer for some foreign company someday'''

about_doc = nlp(about_text) # Creating a Doc object of the above string
sentences = list(about_doc.sents) # .sents property is used to extract sentences from the doc object created above
len(sentences) # checking the number of sentences

3

In [25]:
for sentence in sentences:
    print(f"{sentence[:5]}...")


Chem is an aspiring...
He wants to master NLP...
He looks forward to work...


In [27]:
'''
Customization of sentence detection behaviour can also be done by using custom delimiters '''

ellipsis_text = (
     "Gus, can you, ... never mind, I forgot"
     " what I was saying. So, do you think"
     " we should ..."
)
from spacy.language import Language
@Language.component("set_custom_boundaries")
# used the @Language.component("set_custom_boundaries") decorator to define a new function that takes a Doc object as an argument

def set_custom_boundaries(doc):
    """Add support to use `...` as a delimiter for sentence detection"""
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


In [31]:
about_text = ("Chem is an aspiring Ml engineer currently learning Ml and NLP. He is very much interested in learning NLP and computer vision.")
about_doc = nlp(about_text)
for token in about_doc:
    print(token,token.idx)

Chem 0
is 5
an 8
aspiring 11
Ml 20
engineer 23
currently 32
learning 42
Ml 51
and 54
NLP 58
. 61
He 63
is 66
very 69
much 74
interested 79
in 90
learning 93
NLP 102
and 106
computer 110
vision 119
. 125


In [42]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':15}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
)
for token in about_doc:
    print(
        f"{str(token.text_with_ws):22}"     
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
)

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Chem                  True           False             False
is                    True           False             True
an                    True           False             True
aspiring              True           False             False
Ml                    True           False             False
engineer              True           False             False
currently             True           False             False
learning              True           False             False
Ml                    True           False             False
and                   True           False             True
NLP                   True           False             False
.                     False          True              False
He                    True           False             True
is                    True           False             True
very                  True           False             True
much                 

In [43]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [45]:
for stopword in list(spacy_stopwords)[:10]:
    print(stopword)

still
sometimes
show
done
from
ourselves
anything
that
herself
fifty


In [48]:
custom_about_text = ''' Chem is learning ML. Chem believes that he can get better 1% everyday. He learns from various resources such as codebasics, RealPython and
other articles.'''
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[ , Chem, learning, ML, ., Chem, believes, better, 1, %, everyday, ., learns, resources, codebasics, ,, RealPython, 
, articles, .]


In [49]:
conference_help_text = ("Gus is helping organize a developer"
     " conference on Applications of Natural Language"
     " Processing. He keeps organizing local Python meetups"
     " and several internal talks at his workplace.")
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")

                  is : be
                  He : he
               keeps : keep
          organizing : organize
             meetups : meetup
               talks : talk


In [55]:
from collections import Counter
complete_text = (
     "Gus Proto is a Python developer currently"
     " working for a London-based Fintech company. He is"
     " interested in learning Natural Language Processing."
     " There is a developer conference happening on 21 July"
     ' 2019 in London. It is titled "Applications of Natural'
     ' Language Processing". There is a helpline number'
     " available at +44-1234567891. Gus is helping organize it."
     " He keeps organizing local Python meetups and several"
     " internal talks at his workplace. Gus is also presenting"
     ' a talk. The talk will introduce the reader about "Use'
     ' cases of Natural Language Processing in Fintech".'
     " Apart from his work, he is very passionate about music."
     " Gus is learning to play the Piano. He has enrolled"
     " himself in the weekend batch of Great Piano Academy."
     " Great Piano Academy is situated in Mayfair or the City"
     " of London and has world-class piano instructors."
)
complete_doc = nlp(complete_text)
words = [token.text for token in complete_doc if not token.is_stop and not token.is_punct]
print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [64]:
for token in about_doc:
     print(f"""TOKEN: {str(token)}\n=====\nAG: {str(token.tag_):10} POS: {token.pos_}\nEXPLANATION: {spacy.explain(token.tag_)}\n""")

TOKEN:  
=====
AG: _SP        POS: SPACE
EXPLANATION: whitespace

TOKEN: Chem
=====
AG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: is
=====
AG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: learning
=====
AG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: ML
=====
AG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: .
=====
AG: .          POS: PUNCT
EXPLANATION: punctuation mark, sentence closer

TOKEN: Chem
=====
AG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: believes
=====
AG: VBZ        POS: VERB
EXPLANATION: verb, 3rd person singular present

TOKEN: that
=====
AG: IN         POS: SCONJ
EXPLANATION: conjunction, subordinating or preposition

TOKEN: he
=====
AG: PRP        POS: PRON
EXPLANATION: pronoun, personal

TOKEN: can
=====
AG: MD         POS: AUX
EXPLANATION: verb, modal auxiliary

TOKEN: get
=====
AG: VB         POS: VERB
EXPLANATION: verb, base form

TOKEN:

In [70]:
complete_text = (
     "Gus Proto is a Python developer currently"
     " working for a London-based Fintech company. He is"
     " interested in learning Natural Language Processing."
     " There is a developer conference happening on 21 July"
     ' 2019 in London. It is titled "Applications of Natural'
     ' Language Processing". There is a helpline number'
     " available at +44-1234567891. Gus is helping organize it."
     " He keeps organizing local Python meetups and several"
     " internal talks at his workplace. Gus is also presenting"
     ' a talk. The talk will introduce the reader about "Use'
     ' cases of Natural Language Processing in Fintech".'
     " Apart from his work, he is very passionate about music."
     " Gus is learning to play the Piano. He has enrolled"
     " himself in the weekend batch of Great Piano Academy."
     " Great Piano Academy is situated in Mayfair or the City"
     " of London and has world-class piano instructors.")

complete_doc = nlp(complete_text)

def is_token_allowed(token):
     return bool(token and str(token).strip() and not token.is_stop and not token.is_punct)
    
def preprocess_token(token):
    return token.lemma_.strip().lower()

complete_filtered_tokens = [preprocess_token(token) for token in complete_doc if is_token_allowed(token)]

complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']