In [38]:
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer, WhitespaceTokenizer, regexp_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Download required NLTK data (each call is a no-op when the package is
# already up to date).
#   punkt      - sentence/word tokenizer models used by older NLTK releases
#   punkt_tab  - the replacement for 'punkt' that word_tokenize loads in
#                current NLTK releases; without it word_tokenize raises
#                LookupError on a fresh environment
#   wordnet    - lexical database required by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Multi-sentence sample so every tokenizer / stemmer below has something to show.
# Each of the first two sentences is followed by a space and then a newline,
# hence the " \n" joiner.
text = " \n".join((
    "The quick brown foxes are jumping over lazy dogs in the United States of America.",
    "Meanwhile, New York City is bustling with energy and innovation.",
    "People from all over the world come to the USA to pursue their dreams and build a better future.",
))

# ---------------------- TOKENIZATION SECTION ----------------------

# WhitespaceTokenizer splits on any run of whitespace (spaces, tabs, newlines),
# so punctuation stays attached to the neighbouring word (e.g. "America.").
print("Whitespace Tokenizer:", WhitespaceTokenizer().tokenize(text))

# Regexp tokenizer: the pattern matches either a run of word characters or a
# single non-word, non-space character, so every punctuation mark becomes its
# own token. The result is kept because the MWE demo below reuses it.
punct_tokens = regexp_tokenize(text, pattern=r"\w+|[^\w\s]")
print("Punctuation Tokenizer:", punct_tokens)

# Treebank tokenizer uses rules to split contractions and punctuation.
# NOTE: it expects one sentence at a time; on multi-sentence input only the
# final period is split off and sentence-internal ones stay attached.
print("Treebank Tokenizer:", TreebankWordTokenizer().tokenize(text))

# TweetTokenizer handles social media text like hashtags, emojis, links
tweet_text = "Wow!!! This is #awesome :) http://example.com @user"
print("Tweet Tokenizer:", TweetTokenizer().tokenize(tweet_text))

# MWETokenizer merges registered multi-word expressions into single tokens
# (joined with "_"), preferring the longest match.
# It must be fed punctuation-split tokens: with text.split() the last word of
# "United States of America." arrives as "America." and the four-word
# expression can never match, so only "United_States" would be merged.
mwe = MWETokenizer([('New', 'York'), ('United', 'States'), ('United', 'States', 'of', 'America')])
print("MWE Tokenizer:", mwe.tokenize(punct_tokens))

# ---------------------- STEMMING SECTION ----------------------

# Two stemmers to compare side by side: the classic Porter algorithm and the
# English Snowball stemmer.
ps = PorterStemmer()
ss = SnowballStemmer('english')

# Token list shared by the stemming and lemmatization demos
words = word_tokenize(text)

# Porter: rule-based suffix stripping, applied token by token
print("Porter Stemmer:", list(map(ps.stem, words)))

# Snowball: same idea with a revised rule set
print("Snowball Stemmer:", list(map(ss.stem, words)))

# ---------------------- LEMMATIZATION SECTION ----------------------

# WordNet-backed lemmatizer (needs the 'wordnet' corpus downloaded)
lemmatizer = WordNetLemmatizer()

# Map every token to its WordNet lemma. lemmatize() is called without a POS
# argument, so each token is looked up as a noun; verb forms such as
# "jumping" are therefore returned unchanged.
print("Lemmatization:", list(map(lemmatizer.lemmatize, words)))


Whitespace Tokenizer: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'lazy', 'dogs', 'in', 'the', 'United', 'States', 'of', 'America.', 'Meanwhile,', 'New', 'York', 'City', 'is', 'bustling', 'with', 'energy', 'and', 'innovation.', 'People', 'from', 'all', 'over', 'the', 'world', 'come', 'to', 'the', 'USA', 'to', 'pursue', 'their', 'dreams', 'and', 'build', 'a', 'better', 'future.']
Punctuation Tokenizer: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'lazy', 'dogs', 'in', 'the', 'United', 'States', 'of', 'America', '.', 'Meanwhile', ',', 'New', 'York', 'City', 'is', 'bustling', 'with', 'energy', 'and', 'innovation', '.', 'People', 'from', 'all', 'over', 'the', 'world', 'come', 'to', 'the', 'USA', 'to', 'pursue', 'their', 'dreams', 'and', 'build', 'a', 'better', 'future', '.']
Treebank Tokenizer: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'lazy', 'dogs', 'in', 'the', 'United', 'States', 'of', 'America.', 'Meanwhile', ',', 'New', 'York', 'Cit

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
