# Tokenization, Stemming, Lemmatization and Pos_Tagging using libraries like spacy and nltk


In [1]:
# install necessary libraries
%pip install numpy
%pip install nltk
%pip install spacy

Note: you may need to restart the kernel to use updated packages.
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
[?25hUsing cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.2.1 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1
Note: you m

In [2]:
# corpus
corpus_original = "Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!"
corpus = "Need to finalize the demo corpus which will be used for this notebook & should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!"

In [4]:
# lower case the corpus
corpus = corpus.lower()
corpus

'need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run 4 times !!'

In [5]:
# Removing digits using regex
import re

corpus = re.sub(r'\d+', '', corpus)
corpus

'need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run  times !!'

In [7]:
# Removing punctuations
import string

corpus = corpus.translate(str.maketrans('', '', string.punctuation))

corpus


'need to finalize the demo corpus which will be used for this notebook  should be done soon  it should be done by the ending of this month but will it this notebook has been run  times '

In [8]:
# Removing trailing whitespaces
corpus = ' '.join([token for token in corpus.split()])

corpus

'need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times'

# Tokenization

# ## What is Tokenization?

Tokenization is the process of breaking down text into smaller units called tokens. These tokens can be words, sentences, or subwords, depending on the application. Tokenization is a fundamental step in natural language processing (NLP) because it transforms raw text into a format that can be more easily analyzed and processed by algorithms.
 
For example, the sentence "Natural Language Processing is fun!" can be tokenized into the words: ["Natural", "Language", "Processing", "is", "fun", "!"].

Tokenization helps in tasks such as text analysis, information retrieval, and preparing data for machine learning models.
`

Download Necessray resources

In [22]:
# Download pre-trained english model for spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [28]:
# Download required resources
import nltk
# Download a list of common stopwords
nltk.download('stopwords')

# Download pre-trained tokenizer model
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/fahad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/fahad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/fahad/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Tokenization using NLTK

In [29]:
# Import necessary libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [32]:
# Load english stopwords and create a set to faster lookups
stop_words_nltk = set(stopwords.words('english'))
print(stop_words_nltk)

{'it', 'mustn', "that'll", "it'll", 'themselves', "they've", 'about', 'them', 'by', "mightn't", 'herself', 'me', 'while', 'under', 'further', 'and', 'not', "haven't", 'mightn', 'which', 'out', 'my', "you're", 'won', 'himself', "you've", 'now', "needn't", 'its', 'into', 'there', 'being', 'do', 'because', 'don', 'isn', 'needn', 'in', "mustn't", "they're", 'is', 'more', 'where', 'wasn', "couldn't", 'only', 'ours', 'over', 'yours', 'when', 'from', 'very', "don't", 'before', "isn't", 'to', 'him', 'above', 'ourselves', 'wouldn', 'some', "he'll", 'nor', 'the', 'haven', 'on', 'weren', 'these', 'below', 'myself', "they'd", 'does', 'until', 'we', 'most', 'through', 'hers', 'our', 'a', 'he', "we'd", 'm', 've', 'their', 'down', 'such', 'was', 'but', 'am', "it's", "we're", 're', "she'll", 'if', 'have', 'itself', 'o', 'be', "hasn't", 'whom', 'ain', 'again', "i'll", 'then', 'theirs', 'having', 't', 'should', "doesn't", 'for', 'so', 'that', 'her', 'with', "he's", 'hasn', 'aren', 'has', 'shouldn', "the

In [33]:
tokenized_corpus_nltk = word_tokenize(corpus)
print(tokenized_corpus_nltk)

['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']


In [35]:
# Filter out tokens that are in stopwords
tokenized_corpus_nltk_without_stopwords = [token for token in tokenized_corpus_nltk if token.lower() not in stop_words_nltk]

tokenized_corpus_nltk_without_stopwords

['need',
 'finalize',
 'demo',
 'corpus',
 'used',
 'notebook',
 'done',
 'soon',
 'done',
 'ending',
 'month',
 'notebook',
 'run',
 'times']

# Tokenization with Spacy

In [45]:
# Import spacy and load its english model
import spacy
spacy_model = spacy.load('en_core_web_sm')

In [46]:
# Stopwords in spacy
stopwords_spacy = spacy_model.Defaults.stop_words
print(stopwords_spacy)

{'’s', 'about', 'wherever', 'elsewhere', 'my', "'ve", 'now', 'there', 'do', 'thru', 'either', 'because', 'give', 'might', '’m', 'without', 'only', 'over', 'regarding', 'when', 'from', 'very', '’ll', 'before', 'below', 'front', 'does', 'sometimes', 'we', 'most', 'a', 'someone', 'much', '’d', 'somewhere', 'whatever', 'via', 'somehow', 'nine', 'have', 'again', 'part', 'then', 'already', 'with', 'another', 'third', 'she', "'s", 'whole', 'each', 'same', 'yourself', 'whenever', 'therefore', 'beforehand', 'during', 'one', 'almost', 'are', 'twenty', 'whether', 'alone', 'ever', 'two', 'ten', 'up', 'thereafter', 'except', 'than', 'all', 'indeed', 'once', '‘s', 'upon', 'various', 'may', 'mostly', 'herself', 'me', 'latter', 'seemed', 'out', 'besides', 'himself', 'anywhere', 'everything', 'moreover', 'nevertheless', 'together', 'would', 'per', 'three', 'noone', 'twelve', 'everywhere', 'seeming', 'always', 'myself', 'done', 'toward', 'hers', 'our', 'down', 'if', "'d", '’ve', 'n‘t', 'that', 'namely',

In [47]:
# Tokenize corpus using spacy

print("\nSpacy:")
tokenized_corpus_spacy = spacy_model(corpus)
print("Tokenized Corpus:", tokenized_corpus_spacy)



Spacy:
Tokenized Corpus: need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times


In [61]:
# tokens without stopwords
tokens_without_sw = [word for word in tokenized_corpus_spacy if not word in stopwords_spacy]
print(tokens_without_sw)


[need, to, finalize, the, demo, corpus, which, will, be, used, for, this, notebook, should, be, done, soon, it, should, be, done, by, the, ending, of, this, month, but, will, it, this, notebook, has, been, run, times]


## Difference between nltk and spacy

In [63]:
# Find the difference between the two lists (tokens without stopwords from spacy and nltk)
tokens_spacy_set = set([token.text.lower() for token in tokens_without_sw])
tokens_nltk_set = set([token.lower() for token in tokenized_corpus_nltk_without_stopwords])

diff_spacy_nltk = tokens_spacy_set - tokens_nltk_set
diff_nltk_spacy = tokens_nltk_set - tokens_spacy_set

print("Tokens in Spacy but not in NLTK:", diff_spacy_nltk)
print("Tokens in NLTK but not in Spacy:", diff_nltk_spacy)

Tokens in Spacy but not in NLTK: {'it', 'this', 'but', 'of', 'which', 'been', 'will', 'the', 'by', 'has', 'to', 'should', 'for', 'be'}
Tokens in NLTK but not in Spacy: set()


# Stemming

Stemming is a text normalization technique in Natural Language Processing (NLP) that reduces words to their root or base form. 

The root form, called a "stem," may not always be a valid word, but it represents related words with similar meanings. 

For example, "running", "runs", and "ran" can all be reduced to the stem "run". 

Stemming helps in grouping similar words together, which is useful for tasks like information retrieval and text analysis.


In [70]:
# Stem using port stemmer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

print("Before Stemming:\n", corpus)

print('After Stemming:')

for word in tokenized_corpus_nltk:
    print(stemmer.stem(word), end=" ")

Before Stemming:
 need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times
After Stemming:
need to final the demo corpu which will be use for thi notebook should be done soon it should be done by the end of thi month but will it thi notebook ha been run time 

# Lemmatization

Lemmatization is a text normalization process in Natural Language Processing (NLP) that reduces words to their base or dictionary form, known as a "lemma."

Unlike stemming, which may simply chop off word endings and can result in non-words, lemmatization uses vocabulary and morphological analysis to return valid words.

For example, the words "running", "ran", and "runs" are all reduced to the lemma "run".

Lemmatization helps in grouping together different inflected forms of a word so they can be analyzed as a single item, which is useful for tasks like information retrieval, text analysis, and machine learning.


In [72]:
# use wordnetlemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /home/fahad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [74]:
lemmetizer = WordNetLemmatizer()

for word in tokenized_corpus_nltk:
    print(lemmetizer.lemmatize(word), end=" ")

need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook ha been run time 

# POS Tagging
 
POS (Part-of-Speech) Tagging is the process of labeling each word in a sentence with its appropriate part of speech, such as noun, verb, adjective, etc.

This is a fundamental step in many NLP tasks, as it provides information about the grammatical structure of a sentence and the relationships between words.

For example, in the sentence "The quick brown fox jumps over the lazy dog", POS tagging would identify "fox" and "dog" as nouns, "jumps" as a verb, and "quick", "brown", "lazy" as adjectives.
 
POS tagging can be performed using rule-based, statistical, or machine learning approaches. In Python, the NLTK library provides tools for POS tagging.
 
# Let's see how to perform POS tagging using NLTK.

In [75]:
# POS tagging using spacy

doc = spacy_model(corpus_original)

doc

Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!

In [76]:
# Token and tag
for token in doc:
    print(token, ':', token.pos_)

Need : VERB
to : PART
finalize : VERB
the : DET
demo : NOUN
corpus : X
which : PRON
will : AUX
be : AUX
used : VERB
for : ADP
this : DET
notebook : NOUN
and : CCONJ
it : PRON
should : AUX
be : AUX
done : VERB
soon : ADV
! : PUNCT
! : PUNCT
. : PUNCT
It : PRON
should : AUX
be : AUX
done : VERB
by : ADP
the : DET
ending : NOUN
of : ADP
this : DET
month : NOUN
. : PUNCT
But : CCONJ
will : AUX
it : PRON
? : PUNCT
This : DET
notebook : NOUN
has : AUX
been : AUX
run : VERB
4 : NUM
times : NOUN
! : PUNCT
! : PUNCT


In [83]:
# POS tagging using nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/fahad/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [85]:
from pprint import pprint

pprint(nltk.pos_tag(word_tokenize(corpus_original)))

[('Need', 'NN'),
 ('to', 'TO'),
 ('finalize', 'VB'),
 ('the', 'DT'),
 ('demo', 'NN'),
 ('corpus', 'NN'),
 ('which', 'WDT'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('used', 'VBN'),
 ('for', 'IN'),
 ('this', 'DT'),
 ('notebook', 'NN'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('should', 'MD'),
 ('be', 'VB'),
 ('done', 'VBN'),
 ('soon', 'RB'),
 ('!', '.'),
 ('!', '.'),
 ('.', '.'),
 ('It', 'PRP'),
 ('should', 'MD'),
 ('be', 'VB'),
 ('done', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('ending', 'VBG'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('month', 'NN'),
 ('.', '.'),
 ('But', 'CC'),
 ('will', 'MD'),
 ('it', 'PRP'),
 ('?', '.'),
 ('This', 'DT'),
 ('notebook', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('run', 'VBN'),
 ('4', 'CD'),
 ('times', 'NNS'),
 ('!', '.'),
 ('!', '.')]
