In [None]:
!pip install nltk




# Tokenization

In [None]:
import nltk
# Fetch the Punkt tokenizer data needed by the tokenization cells in this section.
nltk.download('punkt')
# The recorded output shows this unpacks tokenizers/punkt_tab.zip;
# presumably newer NLTK releases look for this package instead of 'punkt' — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

corpus = """My name is Sachin. This gem is your's to take, use and keep
How are you?"""

# Split the raw text into sentences; `sentences` is reused by the word-tokenization cell.
sentences = sent_tokenize(corpus)
print(sentences)

['My name is Sachin.', "This gem is your's to take, use and keep \nHow are you?"]


In [None]:
# Tokenize every sentence on its own and flatten the per-sentence tokens into one list.
words = [token for sentence in sentences for token in word_tokenize(sentence)]
print(words)

['My', 'name', 'is', 'Sachin', '.', 'This', 'gem', 'is', 'your', "'s", 'to', 'take', ',', 'use', 'and', 'keep', 'How', 'are', 'you', '?']


In [None]:
# Check that tokenizing the whole corpus in one call gives the same tokens
# as tokenizing it sentence by sentence (the `words` list built earlier).
word_tokenize(corpus) == words

True

In [None]:
from nltk import wordpunct_tokenize

# Every punctuation mark (. , ? ') becomes a separate token,
# so "your's" is split into 'your', "'" and 's'.
wordpunct_tokenize(corpus)

['My',
 'name',
 'is',
 'Sachin',
 '.',
 'This',
 'gem',
 'is',
 'your',
 "'",
 's',
 'to',
 'take',
 ',',
 'use',
 'and',
 'keep',
 'How',
 'are',
 'you',
 '?']

In [None]:
from nltk.tokenize import TreebankWordTokenizer

corpus2 = "I am Sachin. This your's to take, use. Don't steal it,they'll arrest you"

tokenizer = TreebankWordTokenizer()
# A period in the middle of the text stays attached to the preceding word ('Sachin.', 'use.').
# Contractions are separated into pieces: Don't -> 'Do' + "n't", they'll -> 'they' + "'ll".
tokenizer.tokenize(corpus2)

['I',
 'am',
 'Sachin.',
 'This',
 'your',
 "'s",
 'to',
 'take',
 ',',
 'use.',
 'Do',
 "n't",
 'steal',
 'it',
 ',',
 'they',
 "'ll",
 'arrest',
 'you']

In [None]:
from nltk.tokenize import RegexpTokenizer

# The pattern matches runs of word characters (letters, digits, underscore) and apostrophes,
# so contractions such as "Don't" stay whole while all other symbols are dropped.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize(corpus2)

['I',
 'am',
 'Sachin',
 'This',
 "your's",
 'to',
 'take',
 'use',
 "Don't",
 'steal',
 'it',
 "they'll",
 'arrest',
 'you']

# Stemming

A word is made up of a root plus optional affixes (prefixes and suffixes). Stemming chops off these affixes (in practice, mostly suffixes), leaving behind only an approximate root of the word, called the stem.

Stemming is like chopping off the ends of words to get to their basic form. For example, "playing," "played," and "plays" all get reduced to "play." This basic form isn't always a real word, and it's different from a "lemma," which is the accurate dictionary form of a word.

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()  # Very simple stemmer, but it makes a lot of mistakes
stemmer.stem('playing')

'play'

In [None]:
# Stem several inflections of "play" and print one stem per line.
sample_words = ["played", "playing", "plays", "player"]
print("\n".join(stemmer.stem(sample) for sample in sample_words))

play
play
play
player


In [None]:
from nltk.stem import SnowballStemmer

# SnowballStemmer must be told which language to use; it is regarded as better than PorterStemmer.
stemmer = SnowballStemmer("english")

# Print the stem of each inflection on its own line.
print(*map(stemmer.stem, ["played", "playing", "plays", "player"]), sep="\n")

play
play
play
player


In [None]:
from nltk.stem import RegexpStemmer

# Remove any of these suffixes (ing, s, e, able, y) when a word ends with one of them.
# "ed" is not in the pattern, which is why "played" comes back unchanged.
# NOTE(review): min=4 presumably leaves words shorter than four characters untouched — confirm in the NLTK docs.
stemmer = RegexpStemmer("ing$|s$|e$|able$|y$", min=4)
for word in ["played", "playing", "plays","player"]:
  print(stemmer.stem(word))


played
play
play
player


In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, RegexpStemmer

# One instance of each stemmer, so all three can be compared on the same words.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')
regexp_stemmer = RegexpStemmer("ing$|s$|e$|able$|y$", min=4)

words_to_stem = [
    "universal", "universality", "universe", "universem",
    "generous", "generously", "generosity",
    "beautiful", "beautifully", "beauty",
]

# (heading, stemmer) pairs; the leading "\n" on the later headings
# leaves a blank line between consecutive sections of the report.
report_sections = [
    ("Porter Stemmer Output:", porter_stemmer),
    ("\nSnowball Stemmer Output:", snowball_stemmer),
    ("\nRegexp Stemmer Output:", regexp_stemmer),
]

for heading, active_stemmer in report_sections:
    print(heading)
    for word in words_to_stem:
        print(f"{word} -> {active_stemmer.stem(word)}")

Porter Stemmer Output:
universal -> univers
universality -> univers
universe -> univers
universem -> universem
generous -> gener
generously -> gener
generosity -> generos
beautiful -> beauti
beautifully -> beauti
beauty -> beauti

Snowball Stemmer Output:
universal -> univers
universality -> univers
universe -> univers
universem -> universem
generous -> generous
generously -> generous
generosity -> generos
beautiful -> beauti
beautifully -> beauti
beauty -> beauti

Regexp Stemmer Output:
universal -> universal
universality -> universalit
universe -> univers
universem -> universem
generous -> generou
generously -> generousl
generosity -> generosit
beautiful -> beautiful
beautifully -> beautifull
beauty -> beaut


# Lemmatization

To solve the problem that stemming can produce incorrect (non-dictionary) words, lemmatization is used.
The root word that is obtained is called the lemma, and it is always a valid word in the dictionary. Hence lemmatization is reliable and important.

1. **WordNetLemmatizer**

WordNet is a large dictionary with a vast number of English words and their relationships. It organizes words into sets of synonyms, called **synonym sets (synsets)**. Each synset represents a distinct concept and can include multiple words that have similar meanings. WordNet also provides short definitions and examples for each synset. Such dictionaries, like WordNet, are used as a knowledge base in lemmatization to find the correct base form of a word based on its meaning and context.


This lemmatizer uses WordNet, a large dictionary of English words and their relationships. It helps find the base form (lemma) of a word, like changing "running" to "run," by looking it up in WordNet and considering if it's a noun, verb, etc.

In [None]:
# Download the WordNet corpus that the wordnet / WordNetLemmatizer cells in this section rely on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.corpus import wordnet

# List every synset (synonym set) WordNet returns for "playing";
# in the output, the ".n." / ".v." part of each name marks noun and verb senses.
wordnet.synsets("playing")

[Synset('playing.n.01'),
 Synset('playing.n.02'),
 Synset('acting.n.01'),
 Synset('play.v.01'),
 Synset('play.v.02'),
 Synset('play.v.03'),
 Synset('act.v.03'),
 Synset('play.v.05'),
 Synset('play.v.06'),
 Synset('play.v.07'),
 Synset('act.v.05'),
 Synset('play.v.09'),
 Synset('play.v.10'),
 Synset('play.v.11'),
 Synset('play.v.12'),
 Synset('play.v.13'),
 Synset('play.v.14'),
 Synset('play.v.15'),
 Synset('play.v.16'),
 Synset('play.v.17'),
 Synset('play.v.18'),
 Synset('toy.v.02'),
 Synset('play.v.20'),
 Synset('dally.v.04'),
 Synset('play.v.22'),
 Synset('dally.v.01'),
 Synset('play.v.24'),
 Synset('act.v.10'),
 Synset('play.v.26'),
 Synset('bring.v.03'),
 Synset('play.v.28'),
 Synset('play.v.29'),
 Synset('bet.v.02'),
 Synset('play.v.31'),
 Synset('play.v.32'),
 Synset('play.v.33'),
 Synset('meet.v.10'),
 Synset('play.v.35')]

**WordNet.morphy()**

In [None]:
words = ["playing", "played", "plays", "player"]

# morphy() is the WordNet lookup that WordNetLemmatizer wraps under the hood.
# Print the base form it finds for each word, one per line.
print(*map(wordnet.morphy, words), sep="\n")

playing
play
play
player


In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Without a pos argument the lemmatizer does not match morphy(): per the recorded outputs,
# "played" stays "played" here whereas morphy() reduced it to "play".
# NOTE(review): lemmatize() presumably defaults to the noun POS — confirm in the NLTK docs.
for word in words:
  print(lemmatizer.lemmatize(word))

playing
played
play
player


In [None]:
# The "pos" argument tells the lemmatizer which Part of Speech (POS) to assume for the word:
#   "v" -> give the verb lemma, "n" -> give the noun lemma, "a" -> give the adjective lemma.
for part_of_speech in ("v", "n", "a"):
    print(lemmatizer.lemmatize("playing", pos=part_of_speech))


play
playing
playing
playing


# Parts of Speech

Here are some common parts of speech with examples and their corresponding tags used in NLTK:

*   **Noun (N):** Represents a person, place, thing, or idea.
    *   Examples: `cat`, `city`, `book`, `happiness`
    *   Tags: `NN` (singular noun), `NNS` (plural noun), `NNP` (proper noun singular), `NNPS` (proper noun plural)

*   **Verb (V):** Describes an action or a state of being.
    *   Examples: `run`, `eat`, `is`, `believe`
    *   Tags: `VB` (verb base form), `VBD` (verb past tense), `VBG` (verb present participle), `VBN` (verb past participle), `VBP` (verb non-3rd person singular present), `VBZ` (verb 3rd person singular present)

*   **Adjective (A):** Describes or Modifies a noun or pronoun.
    *   Examples: `happy`, `big`, `red`, `interesting`
    *   Tags: `JJ` (adjective or numeral, ordinal), `JJR` (adjective comparative), `JJS` (adjective superlative)

*   **Adverb (R):** Describes or Modifies a verb, adjective, or other adverb.
    *   Examples: `quickly`, `very`, `happily`, `well`
    *   Tags: `RB` (adverb), `RBR` (adverb comparative), `RBS` (adverb superlative)

*   **Pronoun (PRP):** Stands in for a noun without naming it specifically.
    *   Examples: `he`, `she`, `it`, `they`, `I`
    *   Tags: `PRP` (personal pronoun), `PRP$` (possessive pronoun)

*   **Preposition (P):** Shows the relationship between a noun or pronoun and another word in the sentence.
    *   Examples: `on`, `in`, `at`, `by`, `with`, `under`
    *   Tags: `IN` (preposition or conjunction, subordinating)

*   **Conjunction (C):** Connects words, phrases, or clauses.
    *   Examples: `and`, `but`, `or`, `for`
    *   Tags: `CC` (coordinating conjunction)

*   **Determiner (D):** Comes before a noun to specify its quantity or to clarify what it refers to.
    *   Examples: `the`, `a`, `an`, `this`, `that`
    *   Tags: `DT` (determiner)

*   **Interjection (UH):** Expresses strong emotion.
    *   Examples: `Oh!`, `Wow!`, `Hey!`
    *   Tags: `UH` (interjection)

PoS Tagging in NLTK

In [None]:
# Fetch the averaged-perceptron tagger data for the POS-tagging cells that follow.
# Both package names are downloaded, presumably so pos_tag works across NLTK versions — TODO confirm.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import nltk
# pos_tag takes a list of tokens (not a bare string), because it is meant to be
# applied to the output of a word tokenizer.
tags = nltk.pos_tag(["playing"])
print(tags)

[('playing', 'VBG')]


In [None]:
words = ["play","playing", "played", "been", "red","gleefully","on","and","this"]

# Tag the whole list in a single call; the per-word alternative would be to call
# nltk.pos_tag([word]) inside a loop over `words`.
nltk.pos_tag(words)

[('play', 'NN'),
 ('playing', 'VBG'),
 ('played', 'VBD'),
 ('been', 'VBN'),
 ('red', 'JJ'),
 ('gleefully', 'RB'),
 ('on', 'IN'),
 ('and', 'CC'),
 ('this', 'DT')]

**Note**: the result of `pos_tag` is a list of `(word, tag)` tuples. Hence, to get the tag of a particular word, use double indexing: `tags[word_index][1]`.

In [None]:
# Ex: find all the words that are verbs or nouns.
# Every noun tag starts with "NN" (NN, NNS, NNP, NNPS) and every verb tag starts with
# "VB" (VB, VBD, VBG, VBN, VBP, VBZ), so match on the tag prefix. Listing only the exact
# codes "NN" and "VBG" skipped verbs such as 'played' (VBD) and 'been' (VBN).
tags = nltk.pos_tag(words)
noun_verb_prefixes = ("NN", "VB")

# pos_tag returns (word, tag) tuples, so unpack them directly instead of indexing.
for word, tag in tags:
    if tag.startswith(noun_verb_prefixes):
        print(word, "=>", tag)


play => NN
playing => VBG
