In [1]:
import nltk

# Fetch the NLTK data packages this notebook asks for, in the original order.
# nltk.download() logs its progress and returns a success flag.
for resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Left as the cell's final bare expression so the notebook still displays
# the flag returned by this last download.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
import nltk
from nltk.tokenize import (WhitespaceTokenizer,
                           WordPunctTokenizer,
                           TreebankWordTokenizer,
                           TweetTokenizer,
                           MWETokenizer)
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# --- SAMPLE TEXTS ---
# text_std: two short sentences containing a contraction and end-of-sentence
# punctuation; used for the whitespace, punctuation-based and Treebank demos.
text_std: str = "The quick brown fox jumps over the lazy dog. It's awesome!"

# text_mwe: contains the multi-word expressions 'New York' and
# 'machine learning' that the MWE tokenizer demo merges into single tokens.
text_mwe: str = "I love New York and machine learning is fun."

# text_tweet: tweet-style text with an emoticon (':D'), hashtags and an
# @-mention; used for the tweet tokenizer demo.
text_tweet: str = "Just finished the NLP assignment! :D #NLP #Python @GoogleColab"

In [3]:
# ==========================================
# 1. TOKENIZATION
# ==========================================
print("--- 1. TOKENIZATION ---")

# Build all five tokenizers first; each demo below is then a single print.
ws_tokenizer = WhitespaceTokenizer()
wp_tokenizer = WordPunctTokenizer()
tb_tokenizer = TreebankWordTokenizer()
tw_tokenizer = TweetTokenizer()
# The MWE tokenizer only merges expressions it is explicitly given, so the
# two expressions of interest are registered at construction time.
mwe_tokenizer = MWETokenizer([('New', 'York'), ('machine', 'learning')])

# A. Whitespace: splits on whitespace only, so punctuation stays attached
#    to the neighbouring word ('dog.', 'awesome!').
print(f"Whitespace: {ws_tokenizer.tokenize(text_std)}")

# B. Punctuation-based: splits on whitespace and punctuation, which also
#    breaks the contraction "It's" into three tokens.
print(f"Punctuation-based: {wp_tokenizer.tokenize(text_std)}")

# C. Treebank: keeps the clitic together ("It's" -> "It", "'s").
#    Gotcha: on this two-sentence input only the final '!' is split off;
#    the mid-text 'dog.' stays one token because the text is not
#    sentence-split before tokenizing.
print(f"Treebank: {tb_tokenizer.tokenize(text_std)}")

# D. Tweet: keeps the emoticon, hashtags and @-mention as single tokens.
print(f"Tweet: {tw_tokenizer.tokenize(text_tweet)}")

# E. MWE: operates on a list of tokens rather than raw text, so the sentence
#    is pre-split with str.split(). That leaves trailing punctuation attached
#    ('fun.'); merged expressions are joined with '_'.
print(f"MWE: {mwe_tokenizer.tokenize(text_mwe.split())}")

--- 1. TOKENIZATION ---
Whitespace: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.', "It's", 'awesome!']
Punctuation-based: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'It', "'", 's', 'awesome', '!']
Treebank: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.', 'It', "'s", 'awesome', '!']
Tweet: ['Just', 'finished', 'the', 'NLP', 'assignment', '!', ':D', '#NLP', '#Python', '@GoogleColab']
MWE: ['I', 'love', 'New_York', 'and', 'machine_learning', 'is', 'fun.']


In [4]:
# ==========================================
# 2. STEMMING
# ==========================================
print("\n--- 2. STEMMING ---")
words_to_stem = ["running", "generous", "happily", "organization", "wolves"]
print(f"Original Words: {words_to_stem}")

# Both stemmers are applied to the same word list so their outputs can be
# compared position by position.
porter = PorterStemmer()
snowball = SnowballStemmer("english")

# A. Porter stemmer: rule-based suffix stripping; stems need not be real
#    words (e.g. 'generous' -> 'gener').
porter_results = list(map(porter.stem, words_to_stem))
print(f"Porter Stemmer: {porter_results}")

# B. Snowball stemmer (English rules): agrees with Porter on most of these
#    words but leaves 'generous' intact.
snowball_results = list(map(snowball.stem, words_to_stem))
print(f"Snowball Stemmer: {snowball_results}")


--- 2. STEMMING ---
Original Words: ['running', 'generous', 'happily', 'organization', 'wolves']
Porter Stemmer: ['run', 'gener', 'happili', 'organ', 'wolv']
Snowball Stemmer: ['run', 'generous', 'happili', 'organ', 'wolv']


In [5]:
# ==========================================
# 3. LEMMATIZATION
# ==========================================
print("\n--- 3. LEMMATIZATION ---")
# A lemmatizer maps each word to its dictionary form (lemma) rather than
# chopping suffixes the way a stemmer does.
lemmatizer = WordNetLemmatizer()

words_to_lemmatize = ["running", "corpora", "better", "rocks"]

# No part-of-speech hint is passed here, so every word is looked up as a
# noun; that is why 'running' and 'better' come back unchanged.
lemma_results = list(map(lemmatizer.lemmatize, words_to_lemmatize))
print(f"Basic Lemmatization: {lemma_results}")

# With an explicit POS hint the same words resolve to their expected lemmas.
lemma_verb = lemmatizer.lemmatize("running", pos="v")   # verb      -> 'run'
print(f"'running' as verb: {lemma_verb}")

lemma_adj = lemmatizer.lemmatize("better", pos="a")     # adjective -> 'good'
print(f"'better' as adjective: {lemma_adj}")


--- 3. LEMMATIZATION ---
Basic Lemmatization: ['running', 'corpus', 'better', 'rock']
'running' as verb: run
'better' as adjective: good
