In [None]:
import nltk
from nltk.tokenize import (word_tokenize, WordPunctTokenizer, TreebankWordTokenizer,
                           TweetTokenizer, MWETokenizer)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download the NLTK data packages this notebook relies on.
# Each package is listed exactly once ('punkt' used to be requested twice).
# nltk.download() reports packages that are already present as up-to-date,
# so re-running this cell is safe.
NLTK_RESOURCES = (
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# Input text: a short paragraph about NLP, prefixed with "don't" so that the
# tokenizers below also have a contraction to deal with.
# Adjacent string literals are concatenated by Python; every fragment except
# the last ends in a single space so the result is one continuous string.
text = (
    "don't Natural language processing (NLP) is a field of computer science, "
    "artificial intelligence and computational linguistics concerned with the "
    "interactions between computers and human (natural) languages, and, in "
    "particular, concerned with programming computers to fruitfully process "
    "large natural language corpora. Challenges in natural language processing "
    "frequently involve natural language understanding, natural language "
    "generation (frequently from formal, machine-readable logical forms), "
    "connecting language and machine perception, managing human-computer "
    "dialog systems, or some combination thereof."
)


In [None]:
# 1. Whitespace Tokenization
# split() with no separator (sep=None) cuts on runs of whitespace only, so
# punctuation stays glued to the neighbouring word (e.g. '(NLP)', 'science,').
whitespace_tokens = text.split(sep=None)
print("Whitespace Tokenization:", whitespace_tokens)

Whitespace Tokenization: ["don't", 'Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'field', 'of', 'computer', 'science,', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(natural)', 'languages,', 'and,', 'in', 'particular,', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding,', 'natural', 'language', 'generation', '(frequently', 'from', 'formal,', 'machine-readable', 'logical', 'forms),', 'connecting', 'language', 'and', 'machine', 'perception,', 'managing', 'human-computer', 'dialog', 'systems,', 'or', 'some', 'combination', 'thereof.']


In [None]:
# 2. Punctuation-based Tokenization
# Class-based form: build the tokenizer first, then apply it to the text.
# Punctuation is emitted as separate tokens (e.g. '(', 'NLP', ')').
punct_tokenizer = WordPunctTokenizer()
word_punct_tokens = punct_tokenizer.tokenize(text)
print("Punctuation-based Tokenization:", word_punct_tokens)


Punctuation-based Tokenization: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', '.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', ',', 'natural', 'language', 'generation', '(', 'frequently', 'from', 'formal', ',', 'machine', '-', 'readable', 'logical', 'forms', '),', 'connecting', 'language', 'and', 'machine', 'perception', ',', 'managing', 'human', '-', 'computer', 'dialog', 'systems', ',', 'or', 'some', 'combination', 'thereof', '.']


In [None]:
# Same punctuation-based tokenization, this time through NLTK's module-level
# convenience function instead of instantiating WordPunctTokenizer.
# The print is labelled like every other cell so the output can be told apart
# when the notebook is skimmed.
word_punct_tokens = nltk.wordpunct_tokenize(text)
print("Punctuation-based Tokenization (wordpunct_tokenize):", word_punct_tokens)

['running', 'better', 'jumping', 'ran']


In [None]:
# 3. Treebank Tokenization
# Contractions are split ("don't" -> 'do', "n't"). In the recorded output only
# the final period is separated; the mid-text 'corpora.' keeps its period.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print("Treebank Tokenization:", treebank_tokens)

Treebank Tokenization: ['do', "n't", 'Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', ',', 'natural', 'language', 'generation', '(', 'frequently', 'from', 'formal', ',', 'machine-readable', 'logical', 'forms', ')', ',', 'connecting', 'language', 'and', 'machine', 'perception', ',', 'managing', 'human-computer', 'dialog', 'systems', ',', 'or', 'some', 'combination', 'thereof', '.']


In [None]:
# 4. Tweet Tokenization
# The constructor arguments below are TweetTokenizer's defaults, spelled out so
# the available knobs are visible; "don't" is kept as a single token.
tweet_tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
)
tweet_tokens = tweet_tokenizer.tokenize(text)
print("Tweet Tokenization:", tweet_tokens)

Tweet Tokenization: ["don't", 'Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', '.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', ',', 'natural', 'language', 'generation', '(', 'frequently', 'from', 'formal', ',', 'machine-readable', 'logical', 'forms', ')', ',', 'connecting', 'language', 'and', 'machine', 'perception', ',', 'managing', 'human-computer', 'dialog', 'systems', ',', 'or', 'some', 'combination', 'thereof', '.']


In [None]:
# 5. Multi-Word Expression Tokenization
# The expressions to merge are handed to the constructor in one go.
# Matching is case-sensitive: in the recorded output the capitalised
# "Natural language processing" is merged, the lower-case occurrence is not.
tokenizer = MWETokenizer(
    mwes=[
        ("Natural", "language", "processing"),
        ("artificial", "intelligence"),
    ]
)
# MWETokenizer works on an already-tokenized sequence, hence word_tokenize first.
base_tokens = word_tokenize(text)
mwe_tokens = tokenizer.tokenize(base_tokens)
print("MWE Tokenization:", mwe_tokens)

MWE Tokenization: ['do', "n't", 'Natural_language_processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial_intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', '.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', ',', 'natural', 'language', 'generation', '(', 'frequently', 'from', 'formal', ',', 'machine-readable', 'logical', 'forms', ')', ',', 'connecting', 'language', 'and', 'machine', 'perception', ',', 'managing', 'human-computer', 'dialog', 'systems', ',', 'or', 'some', 'combination', 'thereof', '.']


In [None]:
# Porter Stemmer
# Stem every token produced by word_tokenize; punctuation tokens pass through.
porter_stemmer = PorterStemmer()
porter_stems = list(map(porter_stemmer.stem, word_tokenize(text)))
print("Porter Stemmer:", porter_stems)


Porter Stemmer: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'a', 'field', 'of', 'comput', 'scienc', ',', 'artifici', 'intellig', 'and', 'comput', 'linguist', 'concern', 'with', 'the', 'interact', 'between', 'comput', 'and', 'human', '(', 'natur', ')', 'languag', ',', 'and', ',', 'in', 'particular', ',', 'concern', 'with', 'program', 'comput', 'to', 'fruit', 'process', 'larg', 'natur', 'languag', 'corpora', '.', 'challeng', 'in', 'natur', 'languag', 'process', 'frequent', 'involv', 'natur', 'languag', 'understand', ',', 'natur', 'languag', 'gener', '(', 'frequent', 'from', 'formal', ',', 'machine-read', 'logic', 'form', ')', ',', 'connect', 'languag', 'and', 'machin', 'percept', ',', 'manag', 'human-comput', 'dialog', 'system', ',', 'or', 'some', 'combin', 'thereof', '.']


In [None]:
# Snowball Stemmer
# Same pipeline as the Porter cell, using the English Snowball stemmer.
snowball_stemmer = SnowballStemmer("english")
snowball_stems = list(map(snowball_stemmer.stem, word_tokenize(text)))
print("Snowball Stemmer:", snowball_stems)


Snowball Stemmer: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'a', 'field', 'of', 'comput', 'scienc', ',', 'artifici', 'intellig', 'and', 'comput', 'linguist', 'concern', 'with', 'the', 'interact', 'between', 'comput', 'and', 'human', '(', 'natur', ')', 'languag', ',', 'and', ',', 'in', 'particular', ',', 'concern', 'with', 'program', 'comput', 'to', 'fruit', 'process', 'larg', 'natur', 'languag', 'corpora', '.', 'challeng', 'in', 'natur', 'languag', 'process', 'frequent', 'involv', 'natur', 'languag', 'understand', ',', 'natur', 'languag', 'generat', '(', 'frequent', 'from', 'formal', ',', 'machine-read', 'logic', 'form', ')', ',', 'connect', 'languag', 'and', 'machin', 'percept', ',', 'manag', 'human-comput', 'dialog', 'system', ',', 'or', 'some', 'combin', 'thereof', '.']


In [None]:
# Lemmatization
# Single WordNetLemmatizer instance, used by the lemmatization cell further
# down together with get_wordnet_pos().
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to get POS tag for lemmatization
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged by ``nltk.pos_tag`` as a one-element list, i.e. on its
    own with no surrounding sentence context. Only the first letter of the
    resulting tag (upper-cased) decides the WordNet category.

    Args:
        word: A single token (string).

    Returns:
        ``wordnet.ADJ`` for tags starting with "J", ``wordnet.VERB`` for "V",
        ``wordnet.ADV`` for "R", and ``wordnet.NOUN`` for "N" or any other tag.
    """
    tagged_word = nltk.pos_tag([word])[0]   # (token, tag) pair
    initial = tagged_word[1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped initial fall back to noun.
    return wordnet.NOUN

In [None]:
# Sample words for the lemmatization demo.
# They live under their own name instead of rebinding `text`: overwriting `text`
# made the tokenization/stemming cells above produce different results depending
# on the order in which cells were run.
lemma_text = "running better jumping ran"

# Each token is POS-tagged individually by get_wordnet_pos() and lemmatized
# with the resulting WordNet category.
lemmatized_words = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in word_tokenize(lemma_text)
]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['run', 'well', 'jumping', 'ran']


In [None]:
# Lemmatize several inflections of "program" as verbs (pos="v") and print them
# as a two-column table; each column is padded to 20 characters.
wnl = WordNetLemmatizer()
example_words = [
    "program",
    "programming",
    "programer",
    "programs",
    "programmed",
]

row_template = "{0:20}{1:20}"
print(row_template.format("--Word--", "--Lemma--"))
for word in example_words:
    verb_lemma = wnl.lemmatize(word, pos="v")
    print(row_template.format(word, verb_lemma))

--Word--            --Lemma--           
program             program             
programming         program             
programer           programer           
programs            program             
programmed          program             
