In [1]:
# Perform tokenization (whitespace, punctuation-based, Treebank, tweet, MWE) using the NLTK library.
# Use the Porter stemmer and the Snowball stemmer for stemming.
# Use any technique for lemmatization.

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# Download the NLTK data packages this notebook relies on.
# The recorded output shows an already-installed package being reported as
# "already up-to-date", so re-running this cell should be harmless.
nltk.download('punkt')      # tokenizer models — presumably used by word_tokenize; TODO confirm for your NLTK version
nltk.download('wordnet')    # WordNet data — presumably what WordNetLemmatizer reads; verify
nltk.download('omw-1.4')    # NOTE(review): looks like a WordNet companion package — confirm it is still required
nltk.download('punkt_tab')  # unpacked under tokenizers/ in the recorded run; as the last expression, its return value (True) is the cell output

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ganga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ganga\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ganga\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ganga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Sample sentence that every tokenizer, stemmer and lemmatizer below is run on.
# Note the typographic (curly) apostrophe in "it’s" and the ';', ',' and '.' marks:
# they are where the tokenizers' outputs differ.
text = (
    "The Uniform Civil Code is not just a law; it’s a step towards equality, "
    "justice, and unity in a diverse nation like India."
)

In [5]:
# 1. Tokenization
# Whitespace-based tokenization with NLTK's WhitespaceTokenizer, so that this
# step uses the NLTK library as the task statement asks (instead of str.split).
# The text is split on runs of whitespace only, so punctuation stays attached
# to the neighbouring word (e.g. 'law;' and 'India.').
whitespace_tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
whitespace_tokens

['The',
 'Uniform',
 'Civil',
 'Code',
 'is',
 'not',
 'just',
 'a',
 'law;',
 'it’s',
 'a',
 'step',
 'towards',
 'equality,',
 'justice,',
 'and',
 'unity',
 'in',
 'a',
 'diverse',
 'nation',
 'like',
 'India.']

In [6]:
# Punctuation-based tokenization with NLTK's wordpunct_tokenize
# (WordPunctTokenizer): the text is split into runs of word characters and
# runs of punctuation, so each run of punctuation becomes its own token.
# word_tokenize is deliberately not used here: it is NLTK's general-purpose
# word tokenizer, not the punctuation-based one.
punctuation_tokens = nltk.tokenize.wordpunct_tokenize(text)
punctuation_tokens

['The',
 'Uniform',
 'Civil',
 'Code',
 'is',
 'not',
 'just',
 'a',
 'law',
 ';',
 'it',
 '’',
 's',
 'a',
 'step',
 'towards',
 'equality',
 ',',
 'justice',
 ',',
 'and',
 'unity',
 'in',
 'a',
 'diverse',
 'nation',
 'like',
 'India',
 '.']

In [7]:
# Treebank tokenization
# Build a TreebankWordTokenizer and run it on the raw sentence string.
# In the recorded output the typographic apostrophe in 'it’s' is not split,
# so that word stays a single token, while ';', ',' and '.' are separated.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
treebank_tokens

['The',
 'Uniform',
 'Civil',
 'Code',
 'is',
 'not',
 'just',
 'a',
 'law',
 ';',
 'it’s',
 'a',
 'step',
 'towards',
 'equality',
 ',',
 'justice',
 ',',
 'and',
 'unity',
 'in',
 'a',
 'diverse',
 'nation',
 'like',
 'India',
 '.']

In [8]:
# Tweet tokenization
# Build a TweetTokenizer with its default options and run it on the sentence.
# In the recorded output 'it’s' is split into 'it', '’', 's' and each
# punctuation mark is its own token.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
tweet_tokens

['The',
 'Uniform',
 'Civil',
 'Code',
 'is',
 'not',
 'just',
 'a',
 'law',
 ';',
 'it',
 '’',
 's',
 'a',
 'step',
 'towards',
 'equality',
 ',',
 'justice',
 ',',
 'and',
 'unity',
 'in',
 'a',
 'diverse',
 'nation',
 'like',
 'India',
 '.']

In [9]:
# Multi-Word Expression (MWE) tokenization
# Register each expression as a tuple of its component tokens.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('Uniform', 'Civil', 'Code'))
mwe_tokenizer.add_mwe(('diverse', 'nation'))

# MWETokenizer regroups an existing token list, so the sentence is split into
# words first; matched expressions come back joined with '_'
# (e.g. 'Uniform_Civil_Code').
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))

mwe_tokens

['The',
 'Uniform_Civil_Code',
 'is',
 'not',
 'just',
 'a',
 'law',
 ';',
 'it',
 '’',
 's',
 'a',
 'step',
 'towards',
 'equality',
 ',',
 'justice',
 ',',
 'and',
 'unity',
 'in',
 'a',
 'diverse_nation',
 'like',
 'India',
 '.']

In [10]:
# 2. Stemming
# Create one instance of each stemmer up front; the following cell calls
# .stem() on both of them for every token.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")  # the argument selects the stemmer's language

In [12]:
# Stem every token of punctuation_tokens with each stemmer. Punctuation
# tokens are passed through the stemmers as well, so both result lists line
# up one-to-one with punctuation_tokens.
porter_stems = list(map(porter_stemmer.stem, punctuation_tokens))
snowball_stems = list(map(snowball_stemmer.stem, punctuation_tokens))

In [13]:
# Show the Porter stems; the leading "\n" in the label puts a blank line before it.
print("\nPorter Stemmer Results:", porter_stems)


Porter Stemmer Results: ['the', 'uniform', 'civil', 'code', 'is', 'not', 'just', 'a', 'law', ';', 'it', '’', 's', 'a', 'step', 'toward', 'equal', ',', 'justic', ',', 'and', 'uniti', 'in', 'a', 'divers', 'nation', 'like', 'india', '.']


In [14]:
# Show the Snowball stems; in the recorded run they are identical to the Porter stems for this sentence.
print("Snowball Stemmer Results:", snowball_stems)

Snowball Stemmer Results: ['the', 'uniform', 'civil', 'code', 'is', 'not', 'just', 'a', 'law', ';', 'it', '’', 's', 'a', 'step', 'toward', 'equal', ',', 'justic', ',', 'and', 'uniti', 'in', 'a', 'divers', 'nation', 'like', 'india', '.']


In [15]:
# 3. Lemmatization
# Lemmatize the tokens of punctuation_tokens with the WordNet lemmatizer.
# Only the token itself is passed (no part-of-speech argument), so the
# lemmatizer's default part of speech is used for every word.
# NOTE(review): in the recorded run every token comes back unchanged;
# supplying POS tags would likely produce real lemmas for verbs such as
# 'is' — confirm before relying on this output.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, punctuation_tokens))

In [16]:
# Show the lemmatized tokens; the leading "\n" in the label puts a blank line before it.
print("\nLemmatization Results:", lemmatized_words)


Lemmatization Results: ['The', 'Uniform', 'Civil', 'Code', 'is', 'not', 'just', 'a', 'law', ';', 'it', '’', 's', 'a', 'step', 'towards', 'equality', ',', 'justice', ',', 'and', 'unity', 'in', 'a', 'diverse', 'nation', 'like', 'India', '.']
