In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [2]:
# Fetch the NLTK data packages this notebook relies on.
# Recent NLTK releases load the tokenizer and tagger models from the
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' packages; the legacy
# 'punkt' / 'averaged_perceptron_tagger' packages are still requested so the
# notebook also runs on older NLTK installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Sample text processed by every step below. Adjacent string literals are
# concatenated by Python, so this is one single-line paragraph (no newlines).
document = (
    "Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. "
    "It involves the analysis, understanding, and generation of human language, enabling machines to process and comprehend text in a meaningful way. "
    "NLP techniques are widely used in various applications such as sentiment analysis, machine translation, chatbots, and information retrieval. "
    "Preprocessing is an essential step in NLP, which involves tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization."
)

In [3]:
# Tokenization
#
# Tokenization splits a larger body of text into smaller units such as
# sentences or words. Here the sample document is split into word-level
# tokens, which every later step consumes.

tokens = word_tokenize(document)

In [4]:
# POS Tagging
#
# Part-of-speech (POS) tagging reads the tokenized text and assigns a
# grammatical category tag (noun, verb, adjective, ...) to each token.
# The result pairs every entry of `tokens` with its tag.

pos_tags = pos_tag(tokens)

In [5]:
# Stop words removal
#
# Removing stop words is a common preprocessing step in Natural Language
# Processing (NLP) applications. Stop words are very frequent words that add
# little meaning to a sentence; NLTK ships a pre-defined list per language,
# and tokens matching that list are dropped here.

# A set gives constant-time membership checks; tokens are lower-cased only for
# the comparison, so the surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

In [6]:
# Stemming: run each surviving token through the Porter stemmer, keeping
# the order of `filtered_tokens`.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

In [7]:
# Lemmatization
#
# WordNetLemmatizer.lemmatize() treats every word as a noun unless told
# otherwise, which leaves verbs and adjectives inflected. The POS tags computed
# earlier (on the full, unfiltered sentence context) are therefore mapped to
# WordNet POS codes and passed along with each token.
lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet POS code.
WORDNET_POS_BY_TAG_INITIAL = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

# Apply the same stop-word filter used for `filtered_tokens`, so the result
# lines up one-to-one with that list. Tags with no WordNet counterpart
# (punctuation, determiners, ...) fall back to the noun default.
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=WORDNET_POS_BY_TAG_INITIAL.get(tag[:1], 'n'))
    for token, tag in pos_tags
    if token.lower() not in stop_words
]

In [8]:
# Report the output of every preprocessing stage, in pipeline order.
# Each entry is (heading, value); print() joins the two with a single space.
stage_results = [
    ("Original Document:\n", document),
    ("\nTokens:\n", tokens),
    ("\nPOS Tags:\n", pos_tags),
    ("\nFiltered Tokens (after stop words removal):\n", filtered_tokens),
    ("\nStemmed Tokens:\n", stemmed_tokens),
    ("\nLemmatized Tokens:\n", lemmatized_tokens),
]
for heading, value in stage_results:
    print(heading, value)