In [3]:
doc = "Hi, There ! This is a notebook on Tokenization"
for i,token in enumerate(doc.split(" ")):
    print("Token {} - {}".format(i,token))

Token 0 - Hi,
Token 1 - There
Token 2 - !
Token 3 - This
Token 4 - is
Token 5 - a
Token 6 - notebook
Token 7 - on
Token 8 - Tokenization


In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

#tokenizigng
doc = nlp("Hi, There ! This is a notebook on Tokenization")
for token in doc:
    print("Token: {}".format(token))

Token: Hi
Token: ,
Token: There
Token: !
Token: This
Token: is
Token: a
Token: notebook
Token: on
Token: Tokenization


In [5]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    "Hi, This is our first Tokenizer Notebook",
    "Glad to see you here.",
    "What are you upto ?"
]

2024-06-10 09:14:22.112574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-10 09:14:22.407758: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-10 09:14:23.358046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-10 09:14:23.358152: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [6]:
tokenizer = Tokenizer(num_words=20)
tokenizer.fit_on_texts(sentences)
word_idx = tokenizer.word_index
print(word_idx)

{'you': 1, 'hi': 2, 'this': 3, 'is': 4, 'our': 5, 'first': 6, 'tokenizer': 7, 'notebook': 8, 'glad': 9, 'to': 10, 'see': 11, 'here': 12, 'what': 13, 'are': 14, 'upto': 15}


In [7]:
# converting each tokenized sentence into sequence
sequences = tokenizer.texts_to_sequences(sentences)
for seq in sequences:
    print(seq) 

[2, 3, 4, 5, 6, 7, 8]
[9, 10, 11, 1, 12]
[13, 14, 1, 15]


In [8]:
# to ensure that each sequence contains same number of tokens which are a primary need for any NN. We'll pad
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_sequences = pad_sequences(sequences, padding='post')
for seq in padded_sequences:
    print(seq)

[2 3 4 5 6 7 8]
[ 9 10 11  1 12  0  0]
[13 14  1 15  0  0  0]


In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

words = ["program","programming","programmer","programmed","programmatically"]

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,stemmer.stem(word)))

Original Word: program & Stem: program
Original Word: programming & Stem: program
Original Word: programmer & Stem: programm
Original Word: programmed & Stem: program
Original Word: programmatically & Stem: programmat


In [13]:
# from sentences
sentence = "Only a troubled programmer uses troubling methods of programming to write a better program with which others are not troubled"
words = word_tokenize(sentence)

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,stemmer.stem(word))) 

Original Word: Only & Stem: onli
Original Word: a & Stem: a
Original Word: troubled & Stem: troubl
Original Word: programmer & Stem: programm
Original Word: uses & Stem: use
Original Word: troubling & Stem: troubl
Original Word: methods & Stem: method
Original Word: of & Stem: of
Original Word: programming & Stem: program
Original Word: to & Stem: to
Original Word: write & Stem: write
Original Word: a & Stem: a
Original Word: better & Stem: better
Original Word: program & Stem: program
Original Word: with & Stem: with
Original Word: which & Stem: which
Original Word: others & Stem: other
Original Word: are & Stem: are
Original Word: not & Stem: not
Original Word: troubled & Stem: troubl


In [14]:
# similarly in place of porter stemmer we can use Snowball stemmer
# let's compare the result of snowball stemmer.MARTIN_EXTENSIONS
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

snowball_stemmer = SnowballStemmer('english')
words = ["program","programming","programmer","programmed","programmatically"]

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,snowball_stemmer.stem(word)))

Original Word: program & Stem: program
Original Word: programming & Stem: program
Original Word: programmer & Stem: programm
Original Word: programmed & Stem: program
Original Word: programmatically & Stem: programmat


In [15]:
sentence = "Only a troubled programmer uses troubling methods of programming to write a better program with which others are not troubled"
words = word_tokenize(sentence)

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,snowball_stemmer.stem(word))) 

Original Word: Only & Stem: onli
Original Word: a & Stem: a
Original Word: troubled & Stem: troubl
Original Word: programmer & Stem: programm
Original Word: uses & Stem: use
Original Word: troubling & Stem: troubl
Original Word: methods & Stem: method
Original Word: of & Stem: of
Original Word: programming & Stem: program
Original Word: to & Stem: to
Original Word: write & Stem: write
Original Word: a & Stem: a
Original Word: better & Stem: better
Original Word: program & Stem: program
Original Word: with & Stem: with
Original Word: which & Stem: which
Original Word: others & Stem: other
Original Word: are & Stem: are
Original Word: not & Stem: not
Original Word: troubled & Stem: troubl


In [16]:
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

lancaster_stemmer = LancasterStemmer()
words = ["program","programming","programmer","programmed","programmatically"]

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,lancaster_stemmer.stem(word)))

Original Word: program & Stem: program
Original Word: programming & Stem: program
Original Word: programmer & Stem: program
Original Word: programmed & Stem: program
Original Word: programmatically & Stem: program


In [17]:
sentence = "Only a troubled programmer uses troubling methods of programming to write a better program with which others are not troubled"
words = word_tokenize(sentence)

for word in words:
    print(("Original Word: {} & Stem: {}").format(word,lancaster_stemmer.stem(word))) 

Original Word: Only & Stem: on
Original Word: a & Stem: a
Original Word: troubled & Stem: troubl
Original Word: programmer & Stem: program
Original Word: uses & Stem: us
Original Word: troubling & Stem: troubl
Original Word: methods & Stem: method
Original Word: of & Stem: of
Original Word: programming & Stem: program
Original Word: to & Stem: to
Original Word: write & Stem: writ
Original Word: a & Stem: a
Original Word: better & Stem: bet
Original Word: program & Stem: program
Original Word: with & Stem: with
Original Word: which & Stem: which
Original Word: others & Stem: oth
Original Word: are & Stem: ar
Original Word: not & Stem: not
Original Word: troubled & Stem: troubl


In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [19]:
text = "The boy was going for a trip where he could say that he hiked, danced, sung, swam, surfed and cooked."

In [20]:
output = nlp(text)
for token in output:
    print("Text - {} and its lemma is {}".format(token.text, token.lemma_))

Text - The and its lemma is the
Text - boy and its lemma is boy
Text - was and its lemma is be
Text - going and its lemma is go
Text - for and its lemma is for
Text - a and its lemma is a
Text - trip and its lemma is trip
Text - where and its lemma is where
Text - he and its lemma is he
Text - could and its lemma is could
Text - say and its lemma is say
Text - that and its lemma is that
Text - he and its lemma is he
Text - hiked and its lemma is hike
Text - , and its lemma is ,
Text - danced and its lemma is danced
Text - , and its lemma is ,
Text - sung and its lemma is sung
Text - , and its lemma is ,
Text - swam and its lemma is swam
Text - , and its lemma is ,
Text - surfed and its lemma is surfed
Text - and and its lemma is and
Text - cooked and its lemma is cook
Text - . and its lemma is .


In [21]:
!pip install lemminflect

Collecting lemminflect
  Downloading lemminflect-0.2.3-py3-none-any.whl (769 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.7/769.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m[31m3.0 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: lemminflect
Successfully installed lemminflect-0.2.3


In [22]:
# let's evaluate an example

import lemminflect
doc = nlp('He went to a trip to later brag that he hiked, swam, danced, sang, ran and cooked.')

for token in doc:
    print("Text - {} and its lemma is {}".format(token.text, token._.lemma()))

Text - He and its lemma is He
Text - went and its lemma is go
Text - to and its lemma is to
Text - a and its lemma is a
Text - trip and its lemma is trip
Text - to and its lemma is to
Text - later and its lemma is later
Text - brag and its lemma is brag
Text - that and its lemma is that
Text - he and its lemma is he
Text - hiked and its lemma is hike
Text - , and its lemma is ,
Text - swam and its lemma is swam
Text - , and its lemma is ,
Text - danced and its lemma is danced
Text - , and its lemma is ,
Text - sang and its lemma is sang
Text - , and its lemma is ,
Text - ran and its lemma is run
Text - and and its lemma is and
Text - cooked and its lemma is cook
Text - . and its lemma is .


In [23]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/maximus1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [27]:
# let's check a demo.
stop_words = nlp.Defaults.stop_words
sentence = "An example of stopwords for those demo purposes only"



In [28]:
doc = nlp(sentence)
filtered_words =[]
for word in doc:
    if word not in stop_words:
        print(word) 

An
example
of
stopwords
for
those
demo
purposes
only


In [29]:
import spacy 
nlp = spacy.load('en_core_web_sm')
doc = nlp("This is an example to illustrate POS tagging using Spacy")

In [30]:
for token in doc:
    print("Text is {} and its POS is {}".format(token.text, token.pos_))

Text is This and its POS is PRON
Text is is and its POS is AUX
Text is an and its POS is DET
Text is example and its POS is NOUN
Text is to and its POS is PART
Text is illustrate and its POS is VERB
Text is POS and its POS is PROPN
Text is tagging and its POS is NOUN
Text is using and its POS is VERB
Text is Spacy and its POS is NOUN


In [31]:
# noun chunks are the noun plus the words describing the noun.
for chunk in doc.noun_chunks:
    print("The text is {} and its root text is {}, The dependency of root is {}".format(chunk.text,chunk.root.text, chunk.root.dep_))

The text is This and its root text is This, The dependency of root is nsubj
The text is an example and its root text is example, The dependency of root is attr
The text is POS tagging and its root text is tagging, The dependency of root is dobj
The text is Spacy and its root text is Spacy, The dependency of root is dobj


In [32]:
# we can render the relationships of various POS too
from spacy import displacy

In [33]:
displacy.render(doc,style='dep', jupyter=True, options={'distance':160})

In [34]:
text = "Hi, This is our first example about NER on VSCode, Hope we cover the concepts in detail."
doc = nlp(text)
for ent in doc.ents:
    print("The text is {} and its label is {}".format(ent.text, ent.label_))

The text is first and its label is ORDINAL
The text is NER and its label is ORG
The text is VSCode and its label is ORG


In [35]:
text2 = "Suraj is a resident of India, born on 4th April."
doc1 = nlp(text2)
for ent in doc1.ents:
        print("The text is {} and its label is {} ".format(ent.text, ent.label_))

The text is Suraj and its label is ORG 
The text is India and its label is GPE 
The text is 4th April and its label is DATE 


In [36]:
text3 = "This is Suraj and we want to show you some books on the topic- Gravitational Force"

In [37]:
doc2 = nlp(text3)
for ent in doc2.ents:
        print("The text is {} and its label is {} - It's start is {}, End is {} and It's start word index {} + end word index is {} ".format(ent.text, ent.label_, ent.start_char,ent.end_char, ent.start, ent.end))

The text is Suraj and its label is GPE - It's start is 8, End is 13 and It's start word index 2 + end word index is 3 
The text is Gravitational Force and its label is FAC - It's start is 63, End is 82 and It's start word index 14 + end word index is 16 


In [38]:
# let's have a look at an example where we are required to add a custom NER
text4 = "Suraj to build a github repository for maintenance"

In [39]:
doc3 = nlp(text4)
for ent in doc3.ents:
        print("The text is {} and its label is {} - It's start is {}, End is {} and It's start word index {} + end word index is {} ".format(ent.text, ent.label_, ent.start_char,ent.end_char, ent.start, ent.end))

The text is Suraj and its label is GPE - It's start is 0, End is 5 and It's start word index 0 + end word index is 1 


In [40]:
#let's add
#Create the EntityRuler
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns
patterns = [
                {"label": "PRODUCT", "pattern": "github repository"}
            ]

ruler.add_patterns(patterns)

In [41]:
# let's find the updated one
doc3 = nlp(text4)
for ent in doc3.ents:
        print("The text is {} and its label is {} - It's start is {}, End is {} and It's start word index {} + end word index is {} ".format(ent.text, ent.label_, ent.start_char,ent.end_char, ent.start, ent.end))

The text is Suraj and its label is GPE - It's start is 0, End is 5 and It's start word index 0 + end word index is 1 
The text is github repository and its label is PRODUCT - It's start is 17, End is 34 and It's start word index 4 + end word index is 6 


In [42]:
#let;s check if this works on multiple instances of same unknown word

text5 = "This is a flute and we are looking for an E sharp flute, Can you please check all the flutes in your inventory ?"
doc4 = nlp(text5)
for ent in doc4.ents:
        print("The text is {} and its label is {} - It's start is {}, End is {} and It's start word index {} + end word index is {} ".format(ent.text, ent.label_, ent.start_char,ent.end_char, ent.start, ent.end))

# the o/p of the cell came after re-running

In [43]:
#let's add via ruler
#Create the EntityRuler

#List of Entities and Patterns
patterns = [
                {"label": "PRODUCT", "pattern": "flute"}
            ]

ruler.add_patterns(patterns)

In [44]:
# let's check
doc4 = nlp(text5)
for ent in doc4.ents:
        print("The text is {} and its label is {} - It's start is {}, End is {} and It's start word index {} + end word index is {} ".format(ent.text, ent.label_, ent.start_char,ent.end_char, ent.start, ent.end))

The text is flute and its label is PRODUCT - It's start is 10, End is 15 and It's start word index 3 + end word index is 4 
The text is flute and its label is PRODUCT - It's start is 50, End is 55 and It's start word index 12 + end word index is 13 


In [45]:
from spacy.matcher import Matcher
m_tool = Matcher(nlp.vocab)

In [46]:
patterns = [{'LOWER':'flute'}, {'LOWER':'flutes'}]

In [47]:
m_tool.add('flute',[patterns])

In [48]:
sentence = nlp(u'This is a flute and we are looking for an E sharp flute, Can you please check all the flutes in your inventory ?')
matches = m_tool(sentence)
print(matches)
# let's match
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  
    span = sentence[start:end]                   
    print(match_id, string_id, start, end, span.text)

[]


In [49]:
doc = nlp(u'We are looking for agile developers who can fasttrack project development in our organisation')
for chunk in doc.noun_chunks:
    print(chunk.text +'-'+ chunk.root.text + '-'+ chunk.root.dep_ +'-'+ chunk.root.head.text)

We-We-nsubj-looking
agile developers-developers-pobj-for
who-who-nsubj-fasttrack
project development-development-dobj-fasttrack
our organisation-organisation-pobj-in


In [50]:
len(list(doc.noun_chunks))

5

In [51]:
from spacy import displacy

In [52]:
doc = nlp('This is visualisation of NER module that will assist software developers in the domain of NLP in their company on Earth')
displacy.render(doc, style='ent', jupyter=True)

In [53]:
# we can even specify colors and effects to displacy
colors = {'ORG':'radial-gradient(yellow,cyan)','LOC':'radial-gradient(pink,blue)'}
options = {'ents':['ORG','LOC'], 'colors':colors}
displacy.render(doc, style='ent', jupyter=True, options=options)

In [54]:
text = "This is Suraj. I am here to illustrate the first demo of sentence segmentation."
doc = nlp(text)
for sent in doc.sents:
    print(sent)

This is Suraj.
I am here to illustrate the first demo of sentence segmentation.


In [55]:
text2 = "How we do things when no one is looking at it, Often matters; It is because that's what defines our character."
doc2 = nlp(text2)
sentences2 = []
for sent in doc2.sents:
    sentences2.append(sent)
    print(sent)

How we do things when no one is looking at it, Often matters; It is because that's what defines our character.


In [56]:
from spacy.language import Language

#always add the following decorator 
@Language.component("set_rules")
def set_rules(doc):
    for token in doc[:-1]:
        if token.text == ";" or token.text == ',':
            doc[token.i+1].is_sent_start=True
    return doc

In [57]:
# add this to existing nlp pipe
nlp.add_pipe("set_rules", before='parser')

<function __main__.set_rules(doc)>

In [58]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_rules',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [59]:
# re-run the doc object creation
doc2 = nlp(text2)
sentences2_modified = []
for sent in doc2.sents:
    sentences2_modified.append(sent)
    print(sent)

How we do things when no one is looking at it,
Often matters;
It is because that's what defines our character.


In [60]:
len(sentences2_modified)

3

In [61]:
text3 = "Hi, This is Suraj. \n We are back with yet another illustration on the topic. \n"

In [62]:
nlp = spacy.load('en_core_web_sm', exclude=["parser"])
    
config = {"punct_chars": ['\n']}
nlp.add_pipe("sentencizer", config=config)

for sent in nlp(text3).sents:
    print("next sentence")
    print(sent)

next sentence
Hi, This is Suraj. 
 We are back with yet another illustration on the topic. 



In [1]:
#let's have a look at one such example
from sklearn.feature_extraction.text import CountVectorizer
text = ["Shall we start a youtube channel on it, What do you think about it?"]

In [2]:
vectorizer = CountVectorizer()
vectorizer.fit(text)
#summarise
print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)

['about', 'channel', 'do', 'it', 'on', 'shall', 'start', 'think', 'we', 'what', 'you', 'youtube']
{'shall': 5, 'we': 8, 'start': 6, 'youtube': 11, 'channel': 1, 'on': 4, 'it': 3, 'what': 9, 'do': 2, 'you': 10, 'think': 7, 'about': 0}




In [3]:
#encoding
encoded_vector = vectorizer.transform(text)
#print
array = encoded_vector.toarray()
print(array)

[[1 1 1 2 1 1 1 1 1 1 1 1]]


In [4]:
import pandas as pd
freq_matrix = pd.DataFrame(array, index = text, columns = vectorizer.get_feature_names())

In [5]:
freq_matrix

Unnamed: 0,about,channel,do,it,on,shall,start,think,we,what,you,youtube
"Shall we start a youtube channel on it, What do you think about it?",1,1,1,2,1,1,1,1,1,1,1,1


In [6]:
 # let's understand this via 2 examples.
text1 = "The food is cooked by the maid in the kitchen"
text2 = "The delicacy is cooked by the shef in the kitchen"

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidf =TfidfVectorizer()

In [9]:
out = tfidf.fit_transform([text1,text2])

In [10]:
vector = tfidf.transform([text1,text2])
print(vector.toarray())

[[0.23602594 0.23602594 0.         0.33172622 0.23602594 0.23602594
  0.23602594 0.33172622 0.         0.70807782]
 [0.23602594 0.23602594 0.33172622 0.         0.23602594 0.23602594
  0.23602594 0.         0.33172622 0.70807782]]


In [12]:
feature_names = tfidf.get_feature_names_out()
print(feature_names)

['by' 'cooked' 'delicacy' 'food' 'in' 'is' 'kitchen' 'maid' 'shef' 'the']


In [13]:
for col in out.nonzero()[1]:
    print(str(feature_names[col]) +"-"+ str(out[0,col]))

kitchen-0.23602594054740156
in-0.23602594054740156
maid-0.3317262240477849
by-0.23602594054740156
cooked-0.23602594054740156
is-0.23602594054740156
food-0.3317262240477849
the-0.7080778216422047
shef-0.0
delicacy-0.0
kitchen-0.23602594054740156
in-0.23602594054740156
by-0.23602594054740156
cooked-0.23602594054740156
is-0.23602594054740156
the-0.7080778216422047


In [14]:
# printing the inverse document frequency.
tfidf.idf_

array([1.        , 1.        , 1.40546511, 1.40546511, 1.        ,
       1.        , 1.        , 1.40546511, 1.40546511, 1.        ])