<h2>Tokenization</h2>Tokenization is the process of splitting a text into smaller units, such as words or sentences.

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


# Sample input: a single sentence, so the sentence tokenizer is expected to
# return a one-element list.
text = (
    "Tokenization is the process of splitting a text into smaller units, "
    "such as words or sentences."
)

# Split the same text at word level and at sentence level.
tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Report both results, separated by one blank line.
print("Word Tokens:", tokens, end="\n\n")
print("Sentences:", sentences)


Word Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'a', 'text', 'into', 'smaller', 'units', ',', 'such', 'as', 'words', 'or', 'sentences', '.']

Sentences: ['Tokenization is the process of splitting a text into smaller units, such as words or sentences.']


<h2>Stopwords</h2> Stopwords are common words that often do not convey significant meaning in a text.

In [10]:
from nltk.corpus import stopwords


# Make sure the stopword corpus is available locally before it is read.
nltk.download('stopwords')


text = "Stopwords are common words that often do not convey significant meaning and are typically removed from the text."

# Tokenize the text into words
tokens = word_tokenize(text)

# Build the stopword lookup once, outside the comprehension: calling
# stopwords.words('english') in the filter condition would rebuild the whole
# word list for every token and then scan it linearly. A set is built a single
# time and gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Remove stopwords (case-insensitive match; the original token casing is kept)
filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]
print("Filtered Tokens after Stopword Removal:", filtered_tokens)


Filtered Tokens after Stopword Removal: ['Stopwords', 'common', 'words', 'often', 'convey', 'significant', 'meaning', 'typically', 'removed', 'text', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h2>Stemming and Lemmatization</h2>Stemming usually removes suffixes to produce a root form, while lemmatization considers the context and converts words to their meaningful base form.

In [11]:
from nltk.stem import PorterStemmer, WordNetLemmatizer


text = (
    "Stemming and Lemmatization both aim to reduce words to their base or root form, "
    "but they achieve this in slightly different ways."
)

# Both normalizers work on the same word-level tokens.
tokens = word_tokenize(text)

# Stemming: apply the Porter stemmer to each token in turn.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
print("Stemmed Words:", stemmed_words, end="\n\n")

# Lemmatization: look each token up with the lemmatizer's default part of
# speech (no POS argument is passed).
# NOTE(review): no nltk.download('wordnet') is visible in this notebook --
# confirm the WordNet corpus is installed before a fresh run.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized Words:", lemmatized_words)


Stemmed Words: ['stem', 'and', 'lemmat', 'both', 'aim', 'to', 'reduc', 'word', 'to', 'their', 'base', 'or', 'root', 'form', ',', 'but', 'they', 'achiev', 'thi', 'in', 'slightli', 'differ', 'way', '.']

Lemmatized Words: ['Stemming', 'and', 'Lemmatization', 'both', 'aim', 'to', 'reduce', 'word', 'to', 'their', 'base', 'or', 'root', 'form', ',', 'but', 'they', 'achieve', 'this', 'in', 'slightly', 'different', 'way', '.']


<h2>Part-of-Speech (POS) Tagging</h2> POS tagging assigns a part of speech to each word in a sentence, such as noun, verb, adjective, etc.

In [12]:

text = (
    "POS tagging assigns a part of speech to each word in a sentence, "
    "such as noun, verb, adjective, etc."
)

# The tagger takes a list of tokens, so split the text into words first.
tokens = word_tokenize(text)

# Each entry in the result is a (token, tag) pair.
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('POS', 'NNP'), ('tagging', 'VBG'), ('assigns', 'RP'), ('a', 'DT'), ('part', 'NN'), ('of', 'IN'), ('speech', 'NN'), ('to', 'TO'), ('each', 'DT'), ('word', 'NN'), ('in', 'IN'), ('a', 'DT'), ('sentence', 'NN'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('noun', 'NNS'), (',', ','), ('verb', 'NN'), (',', ','), ('adjective', 'JJ'), (',', ','), ('etc', 'FW'), ('.', '.')]
