In [1]:
# Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using NLTK library. 
# Use porter stemmer and snowball stemmer for stemming. Use any technique for lemmatization. 

In [1]:
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

# Sample text
text = "NLTK is a powerful library for natural language processing tasks. It's #awesome!"

# Tokenization
print("\n**Whitespace Tokenization:**")
tokens_ws = word_tokenize(text)
print(tokens_ws)

print("\n**Punctuation-based Tokenization:**")
tokens_punct = wordpunct_tokenize(text)
print(tokens_punct)

print("\n**Treebank Tokenization:**")
treebank_tokenizer = TreebankWordTokenizer()
tokens_treebank = treebank_tokenizer.tokenize(text)
print(tokens_treebank)

print("\n**Tweet Tokenization:**")
tweet_tokenizer = TweetTokenizer()
tokens_tweet = tweet_tokenizer.tokenize(text)
print(tokens_tweet)

# Stemming
print("\n**Porter Stemming:**")
porter_stemmer = PorterStemmer()
stems_porter = [porter_stemmer.stem(token) for token in tokens_ws]
print(stems_porter)

print("\n**Snowball Stemming:**")
snowball_stemmer = SnowballStemmer("english")  # Choose a language
stems_snowball = [snowball_stemmer.stem(token) for token in tokens_ws]
print(stems_snowball)

# Lemmatization
print("\n**Lemmatization:**")
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens_ws]
print(lemmas)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



**Whitespace Tokenization:**
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', 'tasks', '.', 'It', "'s", '#', 'awesome', '!']

**Punctuation-based Tokenization:**
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', 'tasks', '.', 'It', "'", 's', '#', 'awesome', '!']

**Treebank Tokenization:**
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', 'tasks.', 'It', "'s", '#', 'awesome', '!']

**Tweet Tokenization:**
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', 'tasks', '.', "It's", '#awesome', '!']

**Porter Stemming:**
['nltk', 'is', 'a', 'power', 'librari', 'for', 'natur', 'languag', 'process', 'task', '.', 'it', "'s", '#', 'awesom', '!']

**Snowball Stemming:**
['nltk', 'is', 'a', 'power', 'librari', 'for', 'natur', 'languag', 'process', 'task', '.', 'it', "'s", '#', 'awesom', '!']

**Lemmatization:**
['NLTK', 'is', 'a', 'powerful', 'library