In [1]:
# Install nltk library (only once per Colab session)
!pip install nltk




In [2]:
# NLTK itself; the individual tokenizers/stemmers are imported in the cells that use them
import nltk

# Fetch the data packages this notebook asks for.
# NOTE(review): "wordnet" (with "omw-1.4") is presumably what the lemmatizer
# cells rely on; none of the tokenizers in the visible cells appear to load
# "punkt" -- confirm before dropping it.
nltk.download("punkt")
nltk.download("wordnet")
# Kept as the final bare expression so its return value is shown as the cell output
nltk.download("omw-1.4")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# Shared sample sentence that every tokenization cell below operates on.
# Written as adjacent literals (joined at compile time) instead of a
# backslash continuation inside the string; the trailing space after
# "#AI" is part of the value.
text = (
    "I don't like bad design! Natural language processing is amazing. "
    "Check this out @OpenAI #NLP #AI "
)


# **🟢 PART 1: TOKENIZATION**

In [5]:
# Whitespace tokenization: str.split() with no argument breaks on runs of
# whitespace, so punctuation stays glued to the neighbouring word.
whitespace_tokens = text.split()

# Heading and token list on separate lines
print("Whitespace Tokenization:", whitespace_tokens, sep="\n")

Whitespace Tokenization:
['I', "don't", 'like', 'bad', 'design!', 'Natural', 'language', 'processing', 'is', 'amazing.', 'Check', 'this', 'out', '@OpenAI', '#NLP', '#AI']


In [6]:
from nltk.tokenize import wordpunct_tokenize

# Punctuation tokenization: punctuation is split off into its own tokens,
# which also breaks the contraction "don't" into three pieces.
punct_tokens = wordpunct_tokenize(text)

# Leading blank line, then heading and token list on separate lines
print("\nPunctuation Tokenization:", punct_tokens, sep="\n")



Punctuation Tokenization:
['I', 'don', "'", 't', 'like', 'bad', 'design', '!', 'Natural', 'language', 'processing', 'is', 'amazing', '.', 'Check', 'this', 'out', '@', 'OpenAI', '#', 'NLP', '#', 'AI']


In [7]:
from nltk.tokenize import TreebankWordTokenizer

# Treebank tokenization: contractions are split Penn-Treebank style
# ("don't" -> "do", "n't").
# NOTE(review): this tokenizer is documented as expecting one sentence at a
# time; fed the whole multi-sentence string, the recorded output keeps
# "amazing." as a single token.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)

# Leading blank line, then heading and token list on separate lines
print("\nTreebank Tokenization:", treebank_tokens, sep="\n")



Treebank Tokenization:
['I', 'do', "n't", 'like', 'bad', 'design', '!', 'Natural', 'language', 'processing', 'is', 'amazing.', 'Check', 'this', 'out', '@', 'OpenAI', '#', 'NLP', '#', 'AI']


In [8]:
from nltk.tokenize import TweetTokenizer

# Tweet tokenization: keeps @mentions and #hashtags as single tokens
# while still separating ordinary punctuation.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)

# Leading blank line, then heading and token list on separate lines
print("\nTweet Tokenization:", tweet_tokens, sep="\n")


Tweet Tokenization:
['I', "don't", 'like', 'bad', 'design', '!', 'Natural', 'language', 'processing', 'is', 'amazing', '.', 'Check', 'this', 'out', '@OpenAI', '#NLP', '#AI']


In [9]:
from nltk.tokenize import MWETokenizer

# Multi-word expressions to merge into one token each.
# Only ("natural", "language") actually occurs in the sample text.
mwe_tokenizer = MWETokenizer(
    [
        ("natural", "language"),
        ("machine", "learning"),
    ]
)

# The tokenizer works on an already-tokenized list; the text is lower-cased
# first so that "Natural language" matches the lower-case expression above.
mwe_tokens = mwe_tokenizer.tokenize(text.lower().split())

# Leading blank line, then heading and token list on separate lines
print("\nMWE Tokenization:", mwe_tokens, sep="\n")



MWE Tokenization:
['i', "don't", 'like', 'bad', 'design!', 'natural_language', 'processing', 'is', 'amazing.', 'check', 'this', 'out', '@openai', '#nlp', '#ai']


# **🟢 PART 2: STEMMING**

In [10]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Inflected verb forms and adverbs to run through the stemmer.
# The next stemming cell reuses this list so the results can be compared.
words = ["running", "runs", "ran", "easily", "fairly"]

print("\nPorter Stemming:")
for word in words:
    # Separator is a real right arrow (U+2192); it was previously stored as
    # mis-decoded bytes and printed as garbage.
    print(word, "→", porter.stem(word))



Porter Stemming:
running → run
runs → run
ran → ran
easily → easili
fairly → fairli


In [11]:
from nltk.stem import SnowballStemmer

# Snowball stemmer configured for English
snowball = SnowballStemmer("english")

print("\nSnowball Stemming:")
# `words` is the list defined in the Porter stemming cell, so that cell must
# have been run first; reusing it lets the two stemmers be compared.
for word in words:
    # Separator is a real right arrow (U+2192); it was previously stored as
    # mis-decoded bytes and printed as garbage.
    print(word, "→", snowball.stem(word))



Snowball Stemming:
running → run
runs → run
ran → ran
easily → easili
fairly → fair


# **🟢 PART 3: LEMMATIZATION**

In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# NOTE: this rebinds `words`, replacing the list used by the stemming cells.
words = ["running", "better", "cars", "went"]

print("\nWordNet Lemmatization:")
for word in words:
    # No part of speech is passed here; in the recorded output only the plural
    # noun "cars" changes, while the verb forms come back unchanged.
    # Separator is a real right arrow (U+2192); it was previously stored as
    # mis-decoded bytes and printed as garbage.
    print(word, "→", lemmatizer.lemmatize(word))



WordNet Lemmatization:
running → running
better → better
cars → car
went → went


In [13]:
# Verb-based lemmatization: passing pos='v' makes the lemmatizer treat each
# word as a verb. Uses the `lemmatizer` created in the previous cell.
print("\nLemmatization with POS='v':")
for word in ["running", "went"]:
    # Separator is a real right arrow (U+2192); it was previously stored as
    # mis-decoded bytes and printed as garbage.
    print(word, "→", lemmatizer.lemmatize(word, pos='v'))



Lemmatization with POS='v':
running → run
went → go


# FINAL SUMMARY


Whitespace Tokenizer: Splits text on whitespace only, so punctuation stays attached to words (e.g. `design!`).

Punctuation Tokenizer: Splits text at punctuation marks.

Treebank Tokenizer: Handles contractions, e.g. don’t → do + n’t.

Tweet Tokenizer: Designed for social media text (hashtags, mentions, emojis).

MWE Tokenizer: Combines multi-word expressions into a single token.

Porter Stemmer: Fast rule-based word stemming.

Snowball Stemmer: A refinement of the Porter stemmer; in the runs above it produces the cleaner stem fairly → fair where Porter gives fairli.

WordNet Lemmatizer: Converts words to their dictionary form (lemma); the result depends on the part of speech supplied, e.g. went → go only with pos='v'.