In [None]:
!pip install nltk



**Python NLTK** | WhitespaceTokenizer

With the help of the *nltk.tokenize.WhitespaceTokenizer()* class, we can extract tokens from a string of words or sentences by splitting it on whitespace (spaces, new lines and tabs). The whitespace itself is discarded, and punctuation stays attached to the neighbouring word.



In [None]:
# Bring in NLTK's whitespace-based tokenizer class
from nltk.tokenize import WhitespaceTokenizer

# Sentence to be tokenized
text = "my name is gauri ramesh karkhile. I'm third year computer science and engineering student at VIIT Pune."

# Build the tokenizer, then split the sentence with it
whitespace_tokenizer = WhitespaceTokenizer()
output = whitespace_tokenizer.tokenize(text)

# Show the input followed by the resulting token list
print("original text: ", text, sep="")
print("split the text using whitespace", output, sep="\n")


original text: my name is gauri ramesh karkhile. I'm third year computer science and engineering student at VIIT Pune.
split the text using whitespace
['my', 'name', 'is', 'gauri', 'ramesh', 'karkhile.', "I'm", 'third', 'year', 'computer', 'science', 'and', 'engineering', 'student', 'at', 'VIIT', 'Pune.']


**Python NLTK** | WordPunctTokenizer

The WordPunctTokenizer in NLTK splits text into words and punctuation marks, treating punctuation as separate tokens. It's useful for basic tokenization tasks where punctuation carries meaning.

In [None]:
from nltk.tokenize import WordPunctTokenizer

# Create a string input containing punctuation, emoji, a hashtag and digits.
# (The unused `text` assignment was dropped: this cell only tokenizes `text3`.)
text3 = "Hello user! Check out AI advancements. 😀🤬 #EMOJI 123"

# Use tokenize method: runs of word characters and runs of punctuation
# characters come out as separate tokens
output = WordPunctTokenizer().tokenize(text3)
print("original text: " + text3)

print("split the text using WordPunctTokenizer")
print(output)


original text: Hello user! Check out AI advancements. 😀🤬 #EMOJI 123
split the text using WordPunctTokenizer
['Hello', 'user', '!', 'Check', 'out', 'AI', 'advancements', '.', '😀🤬', '#', 'EMOJI', '123']


**Python NLTK** | TreebankWordTokenizer


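The TreebankWordTokenizer is a rule-based tokenizer in NLTK that uses regular expressions to follow the tokenization conventions of the Penn Treebank corpus. It is a more sophisticated tokenizer than the WordPunctTokenizer and can handle a wider range of punctuation and contractions (for example, it splits "I'm" into "I" and "'m").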

In [None]:
from nltk.tokenize import TreebankWordTokenizer

# Sentence to be tokenized
text = "my name is gauri ramesh karkhile. I'm third year computer science and engineering student at VIIT Pune."

# Build the tokenizer first, then run it over the sentence
treebank_tokenizer = TreebankWordTokenizer()
output = treebank_tokenizer.tokenize(text)

# Show the input followed by the resulting token list
print("original text: ", text, sep="")
print("split the text using TreebankTokenzier", output, sep="\n")


original text: my name is gauri ramesh karkhile. I'm third year computer science and engineering student at VIIT Pune.
split the text using TreebankTokenzier
['my', 'name', 'is', 'gauri', 'ramesh', 'karkhile.', 'I', "'m", 'third', 'year', 'computer', 'science', 'and', 'engineering', 'student', 'at', 'VIIT', 'Pune', '.']


**Python nltk** | Tweet Tokenizer

The TweetTokenizer is specifically designed for tokenizing tweets. It handles common Twitter conventions like hashtags, mentions, and emoticons.

In [None]:
from nltk.tokenize import TweetTokenizer

# Tweet-like input: punctuation, lone digits, adjacent emoji and a hashtag
text3 = "Hello user! Check out AI advancements. 1 3 5 😀🤬 #EMOJI"

# Build the tokenizer first, then run it over the input
tweet_tokenizer = TweetTokenizer()
output = tweet_tokenizer.tokenize(text3)

# Show the input followed by the resulting token list
print("original text: ", text3, sep="")
print("split the text using Tweet Tokenizer", output, sep="\n")

original text: Hello user! Check out AI advancements. 1 3 5 😀🤬 #EMOJI
split the text using Tweet Tokenizer
['Hello', 'user', '!', 'Check', 'out', 'AI', 'advancements', '.', '1', '3', '5', '😀', '🤬', '#EMOJI']


**Python nltk** | Multi-word Expression Tokenizer

The MWETokenizer is used to tokenize multi-word expressions (MWEs) as single units. MWEs are phrases that have a specific meaning when used together, such as "kick the bucket," "out of the blue," or "by and large".

In [None]:
from nltk.tokenize import MWETokenizer

# Create a string input
text3 = "Hello user! Check out AI advancements. 1 3 5 😀🤬 #EMOJI"

# MWETokenizer.tokenize() merges expressions inside an *already tokenized*
# sequence, so it must be given a list of tokens. Handing it the raw string
# makes it walk the string character by character and return one "token"
# per character, so split the text on whitespace first.
tokens = text3.split()

# Use tokenize method (no multi-word expressions are registered on this
# instance, so the tokens come back unmerged)
output = MWETokenizer().tokenize(tokens)
print("original text: " + text3)

print("split the text using Multi-word Expression Tokeziner")
print(output)

original text: Hello user! Check out AI advancements. 1 3 5 😀🤬 #EMOJI
split the text using Multi-word Expression Tokeziner
['H', 'e', 'l', 'l', 'o', ' ', 'u', 's', 'e', 'r', '!', ' ', 'C', 'h', 'e', 'c', 'k', ' ', 'o', 'u', 't', ' ', 'A', 'I', ' ', 'a', 'd', 'v', 'a', 'n', 'c', 'e', 'm', 'e', 'n', 't', 's', '.', ' ', '1', ' ', '3', ' ', '5', ' ', '😀', '🤬', ' ', '#', 'E', 'M', 'O', 'J', 'I']


In [None]:
from nltk.tokenize import MWETokenizer

# Start with two known expressions, then register a third one afterwards.
# Each expression is passed as a single tuple of its component tokens.
mwe_tokenizer = MWETokenizer([('a', 'lot'), ('a', 'little')])
mwe_tokenizer.add_mwe(('name', 'is', 'gauri'))

# The tokenizer works on a token list, so split the sentence on whitespace first
tokens = 'my name is gauri ramesh karkhile. I have a lot of work'.split()
merged_tokens = mwe_tokenizer.tokenize(tokens)
print(merged_tokens)

['my', 'name_is_gauri', 'ramesh', 'karkhile.', 'I', 'have', 'a_lot', 'of', 'work']


In [None]:
from nltk.tokenize import MWETokenizer

# Register three expressions: two at construction time and one afterwards.
# Each expression is passed as a single tuple of its component tokens.
mwe_tokenizer = MWETokenizer([('1', '3'), ('😀', '🤬')])
mwe_tokenizer.add_mwe(('hello', 'user'))

# Only ('1', '3') lines up with the whitespace-split tokens below: the split
# yields 'Hello' (capital H) and 'user!' (punctuation attached), and the two
# emoji stay glued together as the single token '😀🤬'.
tokens = 'Hello user! Check out AI advancements. 1 3 5 😀🤬 #EMOJI'.split()
merged_tokens = mwe_tokenizer.tokenize(tokens)
print(merged_tokens)

['Hello', 'user!', 'Check', 'out', 'AI', 'advancements.', '1_3', '5', '😀🤬', '#EMOJI']


**Stemming**

**Porter’s Stemmer**
It uses a set of heuristic rules to iteratively remove suffixes. Example: EED -> EE means “if the word has at least one vowel and consonant plus EED ending, change the ending to EE” as ‘agreed’ becomes ‘agree’.

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer

In [None]:
# One instance of each NLTK stemmer class imported above, bound to a
# single-letter name.
l = LancasterStemmer()
r = RegexpStemmer("ing")  # pattern is the bare regex "ing" (no `$` anchor)
p = PorterStemmer()
s = SnowballStemmer('english')

In [None]:
# Stem one word with the Snowball stemmer bound to `s`; as the cell's last
# expression, the result is what the notebook displays.
s.stem("playing")

'play'

In [None]:
from nltk.stem import SnowballStemmer
# Display the language names listed on the SnowballStemmer class
SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [None]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

# Words to stem (two of them are spelled unusually on purpose or by accident;
# the stemmer does not validate spelling, it only applies suffix rules)
words = ["running", "jumps", "happily", "happilly", "excitment"]

# Use a distinct loop variable: the original reused the name `words` for both
# the list and the item, which only worked because a comprehension has its
# own scope and was easy to misread.
stemmed_words = [porter_stemmer.stem(word) for word in words]

print("original words:", words)
print("stemmed words:", stemmed_words)


original words: ['running', 'jumps', 'happily', 'happilly', 'excitment']
stemmed words: ['run', 'jump', 'happili', 'happilli', 'excit']


In [None]:
# NOTE(review): `sample` is not called in any visible cell — confirm whether
# this import is still needed.
from random import sample
from nltk.stem.porter import PorterStemmer

# Module-level Porter stemmer; stem_words() reads it as the global `ps`
ps = PorterStemmer()

In [None]:
def stem_words(text):
  """Stem every whitespace-separated word of `text` with the global Porter stemmer `ps`.

  Args:
    text: string to process; it is split on whitespace.

  Returns:
    A single string with the stemmed words joined by one space each.
  """
  return ' '.join([ps.stem(word) for word in text.split()])

# Demo call. These two lines used to be indented under the function, after its
# `return`, so they never ran. The variable is named `sample_text` so that it
# does not rebind the `sample` name imported from `random`.
sample_text = 'walk walks walking walked'
stem_words(sample_text)

**Snowball Stemmer**

An extension of the Porter Stemmer with more robust rules. The Snowball Stemmer, compared to the Porter Stemmer, is multi-lingual as it can handle non-English words. It supports various languages and is based on the ‘Snowball’ programming language, known for efficient processing of small strings.

In [None]:
from nltk.stem import SnowballStemmer

# Words to reduce to their stems
words_to_stem = ['running', 'jumps', 'happily', 'happilly']

# English Snowball stemmer, applied to each word in order
stemmer = SnowballStemmer(language='english')
stemmed_words = list(map(stemmer.stem, words_to_stem))

# Show the inputs next to their stems
for label, values in (("original words:", words_to_stem), ("stemmed words:", stemmed_words)):
    print(label, values)

original words: ['running', 'jumps', 'happily', 'happilly']
stemmed words: ['run', 'jump', 'happili', 'happilli']


**Lancaster Stemmer**

The Lancaster stemmer is more aggressive and dynamic than the other two stemmers. It is fast, but its results can be confusing when dealing with small words, and its output is generally considered less reliable than that of the Snowball stemmer. The Lancaster stemmer stores its rules externally and applies them with an iterative algorithm.

In [None]:
from nltk.stem import LancasterStemmer

# Words to reduce to their stems
words_to_stem = ['running', 'jumps', 'happily', 'happilly']

# Run the Lancaster stemmer over the words one at a time
stemmer = LancasterStemmer()
stemmed_words = []
for word in words_to_stem:
    stemmed_words.append(stemmer.stem(word))

# Show the inputs next to their stems
print("original words:", words_to_stem)
print("stemmed words:", stemmed_words)

original words: ['running', 'jumps', 'happily', 'happilly']
stemmed words: ['run', 'jump', 'happy', 'happil']


**Lancaster Stemmer (applied to a tokenized sentence)**

In [None]:
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

# Sentence to process, split into word and punctuation tokens
sentence = "Love looks not with the eyes but with the mind, and therefore is winged Cupid painted blind."
words = word_tokenize(sentence)

# Stem every token with the Lancaster stemmer
stemmer = LancasterStemmer()
stemmed_words = list(map(stemmer.stem, words))

# Show the tokens next to their stems
for label, values in (("Original words:", words), ("Stemmed words:", stemmed_words)):
    print(label, values)

Original words: ['Love', 'looks', 'not', 'with', 'the', 'eyes', 'but', 'with', 'the', 'mind', ',', 'and', 'therefore', 'is', 'winged', 'Cupid', 'painted', 'blind', '.']
Stemmed words: ['lov', 'look', 'not', 'with', 'the', 'ey', 'but', 'with', 'the', 'mind', ',', 'and', 'theref', 'is', 'wing', 'cupid', 'paint', 'blind', '.']


**Regexp Stemmer**

The Regexp Stemmer, or Regular Expression Stemmer, is a stemming algorithm that utilizes regular expressions to identify and remove suffixes from words. It allows users to define custom rules for stemming by specifying patterns to match and remove.

In [None]:
from nltk.stem import RegexpStemmer

# Words to reduce to their stems
words = ['running', 'jumps', 'happily', 'happilly']

# Strip a trailing "ing", "s", "e" or "able"; `min=4` is the minimum word
# length the stemmer will touch
suffix_pattern = 'ing$|s$|e$|able$'
stemmer = RegexpStemmer(suffix_pattern, min=4)

stemmed_words = []
for word in words:
    stemmed_words.append(stemmer.stem(word))

# Show the inputs next to their stems
print("Original words:", words)
print("Stemmed words:", stemmed_words)

Original words: ['running', 'jumps', 'happily', 'happilly']
Stemmed words: ['runn', 'jump', 'happily', 'happilly']


**Lemmatization**

Lemmatization is the process of reducing words to their base or dictionary form, known as the lemma. This technique considers the context and the meaning of the words, ensuring that the base form belongs to the language's dictionary. For example, the words "running," "ran," and "runs" are all lemmatized to the lemma "run."

In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# WordNet-based lemmatizer instance used by the next cell
wl = WordNetLemmatizer()

In [None]:
# Lemmatize an irregular plural; as the cell's last expression, the result
# is what the notebook displays.
wl.lemmatize("mice")

'mouse'

In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
wnl = WordNetLemmatizer()

# Mix of plural nouns and inflected verb forms to run through the lemmatizer
list1 = ['kites', 'babies', 'dogs', 'flying', 'smiling', 'driving', 'died', 'tried', 'feet']

# Print each word next to its lemma (no POS is passed to lemmatize)
for token in list1:
    lemma = wnl.lemmatize(token)
    print(f"{token} ---> {lemma}")

kites ---> kite
babies ---> baby
dogs ---> dog
flying ---> flying
smiling ---> smiling
driving ---> driving
died ---> died
tried ---> tried
feet ---> foot


**Sentence lemmatization examples**

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
string = 'the cat is sitting with the bats on the striped mat under many flying geese'

# Split the sentence into word tokens
list2 = nltk.word_tokenize(string)
print(list2)
#> ['the', 'cat', 'is', 'sitting', 'with', 'the', 'bats', 'on',
# 'the', 'striped', 'mat', 'under', 'many', 'flying', 'geese']

# Lemmatize token by token (no POS passed), then stitch the lemmas back
# together with single spaces
lemmas = []
for token in list2:
    lemmas.append(wnl.lemmatize(token))
lemmatized_string = ' '.join(lemmas)

print(lemmatized_string)
#> the cat is sitting with the bat on the striped mat under many flying goose

['the', 'cat', 'is', 'sitting', 'with', 'the', 'bats', 'on', 'the', 'striped', 'mat', 'under', 'many', 'flying', 'geese']
the cat is sitting with the bat on the striped mat under many flying goose


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


**After stopwords removal**

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# Load the English stop-word list once and keep it as a set. The original
# called stopwords.words('english') inside the loop, rebuilding the list and
# scanning it linearly for every token.
english_stopwords = set(stopwords.words('english'))

# Print "token ---> lemma" for every token of `list2` that is not a stop word
for token in list2:
  if token not in english_stopwords:
    print(token + " ---> " + wnl.lemmatize(token))

cat ---> cat
sitting ---> sitting
bats ---> bat
striped ---> striped
mat ---> mat
many ---> many
flying ---> flying
geese ---> goose


**Wordnet Lemmatizer (with POS tag)**

Words like ‘sitting’, ‘flying’ etc. remained the same after lemmatization. This is because, when no part of speech is given, the WordNet lemmatizer treats every word as a noun rather than a verb. To overcome this, we use POS (Part of Speech) tags.

In [None]:
# WORDNET LEMMATIZER (with appropriate pos tags)

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# Define function to lemmatize each word with its POS tag

# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: J -> wordnet.ADJ,
    V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV.

    Args:
        nltk_tag: POS tag string, e.g. 'NN', 'VBG', 'JJ', 'RB', 'DT'.

    Returns:
        The WordNet POS constant for the tag, or None when the tag starts
        with none of the four letters.
    """
    # Prefixes are checked in the same order as the original if/elif chain.
    # The wordnet attribute is looked up only for the prefix that matches.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

sentence = 'the cat is sitting with the bats on the striped mat under many badly flying geese'

# Tokenize the sentence, then attach a POS tag to every token
pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

print(pos_tagged)
#>[('the', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('with', 'IN'),
# ('the', 'DT'), ('bats', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('striped', 'JJ'),
# ('mat', 'NN'), ('under', 'IN'), ('many', 'JJ'), ('badly', 'RB'), ('flying', 'VBG'),
# ('geese', 'JJ')]

# The tags above are fine-grained; pos_tagger() collapses each one to the
# coarse WordNet category the lemmatizer understands (or None).
wordnet_tagged = [(token, pos_tagger(treebank_tag)) for token, treebank_tag in pos_tagged]
print(wordnet_tagged)
#>[('the', None), ('cat', 'n'), ('is', 'v'), ('sitting', 'v'), ('with', None),
# ('the', None), ('bats', 'n'), ('on', None), ('the', None), ('striped', 'a'),
# ('mat', 'n'), ('under', None), ('many', 'a'), ('badly', 'r'), ('flying', 'v'),
# ('geese', 'a')]

# Tokens without a WordNet category are kept as they are; every other token
# is lemmatized with its category.
lemmas = [
    token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)
    for token, wn_pos in wordnet_tagged
]
lemmatized_sentence = " ".join(lemmas)

print(lemmatized_sentence)
#> the cat be sit with the bat on the striped mat under many badly fly geese


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('the', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('with', 'IN'), ('the', 'DT'), ('bats', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('striped', 'JJ'), ('mat', 'NN'), ('under', 'IN'), ('many', 'JJ'), ('badly', 'RB'), ('flying', 'VBG'), ('geese', 'JJ')]
[('the', None), ('cat', 'n'), ('is', 'v'), ('sitting', 'v'), ('with', None), ('the', None), ('bats', 'n'), ('on', None), ('the', None), ('striped', 'a'), ('mat', 'n'), ('under', None), ('many', 'a'), ('badly', 'r'), ('flying', 'v'), ('geese', 'a')]
the cat be sit with the bat on the striped mat under many badly fly geese


**TextBlob**

TextBlob is a python library used for processing textual data. It provides a simple API to access its methods and perform basic NLP tasks.

Download TextBlob package : In your anaconda prompt or terminal, type: pip install textblob

In [None]:
from textblob import TextBlob, Word

# Lemmatize a single word through a Word object
my_word = 'cats'
single_word = Word(my_word)
print(single_word.lemmatize())
#> cat

# Lemmatize a whole sentence: wrap it in a TextBlob, lemmatize each of its
# words (no POS passed), and join the lemmas with single spaces
sentence = 'the bats saw the cats with stripes hanging upside down by their feet.'
blob = TextBlob(sentence)
lemmas = []
for token in blob.words:
    lemmas.append(token.lemmatize())
lemmatized_sentence = " ".join(lemmas)

print(lemmatized_sentence)
#> the bat saw the cat with stripe hanging upside down by their foot


cat
the bat saw the cat with stripe hanging upside down by their foot


**TextBlob (with POS tag)**

As with the WordNet approach, lemmatizing with TextBlob without appropriate POS tags shows the same limitations. So we use one of the more powerful features of the TextBlob module, its ‘Part of Speech’ tagging, to overcome this problem.

In [None]:
from textblob import TextBlob

# Define function to lemmatize each word with its POS tag

# POS_TAGGER_FUNCTION : TYPE 2
def pos_tagger(sentence):
    """Lemmatize every word of `sentence` using TextBlob's POS tags.

    Note: this definition reuses the name `pos_tagger`, so once this cell
    runs it replaces the tag-level `pos_tagger(nltk_tag)` defined earlier
    in the notebook.

    Args:
        sentence: the text to process, as a plain string.

    Returns:
        A list with one lemma per tagged word, in sentence order.
    """
    # First letter of the POS tag -> WordNet POS code; any other letter
    # falls back to 'n'.
    wordnet_pos_by_initial = dict(J='a', N='n', V='v', R='r')

    lemmas = []
    for word, treebank_tag in TextBlob(sentence).tags:
        wordnet_pos = wordnet_pos_by_initial.get(treebank_tag[0], 'n')
        lemmas.append(word.lemmatize(wordnet_pos))
    return lemmas

# Lemmatize the same sentence twice to compare the two approaches
sentence = "the bats saw the cats with stripes hanging upside down by their feet"

# 1) POS-aware: pos_tagger() returns one lemma per word
lemma_list = pos_tagger(sentence)
lemmatized_sentence = " ".join(lemma_list)
print(lemmatized_sentence)
#> the bat saw the cat with stripe hang upside down by their foot

# 2) Plain: lemmatize each TextBlob word without passing a POS
t_blob = TextBlob(sentence)
plain_lemmas = []
for token in t_blob.words:
    plain_lemmas.append(token.lemmatize())
lemmatized_sentence = " ".join(plain_lemmas)
print(lemmatized_sentence)
#> the bat saw the cat with stripe hanging upside down by their foot


the bat saw the cat with stripe hang upside down by their foot
the bat saw the cat with stripe hanging upside down by their foot


**Natural Language Processing with Tokenization and Lemmatization**

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Download necessary NLTK data
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text, split into word tokens
text = "The striped bats are hanging on their feet for best"
words = nltk.word_tokenize(text)

# Apply stemming to every token, in order
stemmed_words = list(map(stemmer.stem, words))

# Function to get the part of speech tag for lemmatization
def get_wordnet_pos(word):
    """Pick the WordNet POS constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-element list), so the tag is
    chosen without any surrounding sentence context.

    Args:
        word: a single token.

    Returns:
        wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV according to
        the upper-cased first letter of the tag (J, N, V, R); wordnet.NOUN
        for any other letter.
    """
    tagged_word = nltk.pos_tag([word])[0]
    initial = tagged_word[1][0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    fallback = wordnet.NOUN
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return fallback

# Apply lemmatization: each token is lemmatized with the WordNet POS that
# get_wordnet_pos() picks for it
lemmatized_words = []
for token in words:
    lemmatized_words.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))

# Print results
report = (
    ("Original Text: ", text),
    ("Tokenized Words: ", words),
    ("Stemmed Words: ", stemmed_words),
    ("Lemmatized Words: ", lemmatized_words),
)
for label, value in report:
    print(label, value)

Original Text:  The striped bats are hanging on their feet for best
Tokenized Words:  ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
Stemmed Words:  ['the', 'stripe', 'bat', 'are', 'hang', 'on', 'their', 'feet', 'for', 'best']
Lemmatized Words:  ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
