In [None]:
import nltk

# Download all required resources.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are fetched alongside the
# older 'punkt' / 'averaged_perceptron_tagger' packages: recent NLTK releases
# look up the newer names, and without them word_tokenize / pos_tag raise
# LookupError. Keeping both sets lets this cell work across NLTK versions.
nltk.download('punkt')                          # for tokenization (older NLTK)
nltk.download('punkt_tab')                      # for tokenization (newer NLTK)
nltk.download('stopwords')                      # for stopword removal
nltk.download('averaged_perceptron_tagger')     # for POS tagging (older NLTK)
nltk.download('averaged_perceptron_tagger_eng') # for POS tagging (newer NLTK)
nltk.download('wordnet')                        # for lemmatization
nltk.download('omw-1.4')                        # wordnet lemmatizer support


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
#  STEP 1: Fix LookupError by downloading NLTK resources
import nltk

# Fetch every package the steps below rely on, in a fixed order.
for _resource in (
    'punkt_tab',
    'punkt',                            # Tokenizer
    'stopwords',                        # Stop words
    'averaged_perceptron_tagger',       # POS Tagging
    'averaged_perceptron_tagger_eng',
    'wordnet',                          # Lemmatization
    'omw-1.4',                          # WordNet Lemma support
):
    nltk.download(_resource)

#  STEP 2: Imports
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

#  STEP 3: Sample document
# Written as adjacent string literals with explicit "\n" so the trailing space
# before each line break is visible and cannot be stripped by an editor.
document = (
    "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence \n"
    "that helps computers understand human language. It includes tasks like machine translation, \n"
    "sentiment analysis, and speech recognition."
)

print("\n--- Part 1: Preprocessing ---\n")

# 1) Split the raw text into word and punctuation tokens
tokens = word_tokenize(document)
print("1. Tokens:\n", tokens)

# 2) Tag every token (punctuation included) with its part of speech
pos_tags = pos_tag(tokens)
print("\n2. POS Tags:\n", pos_tags)

# 3) Keep purely alphabetic tokens whose lowercase form is not an English stopword
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]
print("\n3. Tokens after Stopword Removal:\n", filtered_tokens)

# 4) Reduce each remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("\n4. Stemmed Tokens:\n", stemmed_tokens)

# Helper for POS Tag mapping
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is examined (tags such as 'JJ',
    'VBZ', 'NNS' as returned by ``pos_tag``):

    * 'J' -> ``wordnet.ADJ``
    * 'V' -> ``wordnet.VERB``
    * 'N' -> ``wordnet.NOUN``
    * 'R' -> ``wordnet.ADV``

    Any other tag, including an empty string, falls back to ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which misses the table and yields NOUN.
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

# Lemmatization
# Reuse pos_tags, which were computed on the complete token sequence, instead
# of re-tagging filtered_tokens: the tagger takes neighbouring tokens into
# account, so tagging after stopwords and punctuation have been stripped gives
# it a distorted context and can mislabel words (and thus pick the wrong
# lemma). The filter below is the same one used to build filtered_tokens, so
# the words and their order are unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, get_wordnet_pos(tag))
    for word, tag in pos_tags
    if word.lower() not in stop_words and word.isalpha()
]
print("\n5. Lemmatized Tokens:\n", lemmatized_tokens)

print("\n--- Part 2: TF-IDF Vectorization ---\n")

# Three short sentences act as the document collection
corpus = [
    "Natural Language Processing is fun and useful.",
    "Machine learning and NLP can be used for sentiment analysis.",
    "TF-IDF is a common technique for text representation.",
]

# Build the vectorizer with the built-in English stop-word list, then learn
# the vocabulary and encode the corpus in a single call
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Report the learned feature names and the matrix converted with toarray()
feature_names = vectorizer.get_feature_names_out()
tfidf_dense = X.toarray()
print("TF-IDF Feature Names:\n", feature_names)
print("\nTF-IDF Matrix:\n", tfidf_dense)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhans\AppData\Roaming\nltk


--- Part 1: Preprocessing ---

1. Tokens:
 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', 'that', 'helps', 'computers', 'understand', 'human', 'language', '.', 'It', 'includes', 'tasks', 'like', 'machine', 'translation', ',', 'sentiment', 'analysis', ',', 'and', 'speech', 'recognition', '.']

2. POS Tags:
 [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('fascinating', 'JJ'), ('field', 'NN'), ('of', 'IN'), ('Artificial', 'NNP'), ('Intelligence', 'NNP'), ('that', 'WDT'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('includes', 'VBZ'), ('tasks', 'NNS'), ('like', 'IN'), ('machine', 'NN'), ('translation', 'NN'), (',', ','), ('sentiment', 'NN'), ('analysis', 'NN'), (',', ','), ('and', 'CC'), ('speech', 'JJ'), ('recognition', 'NN'), ('.', '.')]

3. 