In [1]:
import nltk
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize


In [2]:
# Download required resources
# Fetch every NLTK data package the later cells rely on:
#   - punkt / punkt_tab: sentence and word tokenizer models used by word_tokenize
#   - stopwords: the English stop-word list
#   - averaged_perceptron_tagger / averaged_perceptron_tagger_eng: model used by nltk.pos_tag
#   - wordnet: dictionary used by WordNetLemmatizer
# Newer NLTK releases look up the "punkt_tab" and "averaged_perceptron_tagger_eng"
# resources, while older releases use the un-suffixed names, so both variants are
# requested to keep the notebook runnable from a fresh kernel on either.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)

# A list (not a generator) is built on purpose so that every download is attempted
# even if an earlier one fails; the cell's value is True only when all succeeded.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to C:\Users\HARSH/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HARSH/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HARSH/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\HARSH/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sample Document
# Single example sentence that every later preprocessing step operates on.
doc = "Hi I am Eating a Mango which is very tasty"

# Heading and sentence are emitted on separate lines.
print("Original Document:", doc, sep="\n")

Original Document:
Hi I am Eating a Mango which is very tasty


In [4]:
# Step 1: Tokenization
# Split the sample sentence into a list of word tokens; original casing is kept.
tokens = word_tokenize(doc)

# Blank line, heading, then the token list on its own line.
print("\nTokenized Words:", tokens, sep="\n")


Tokenized Words:
['Hi', 'I', 'am', 'Eating', 'a', 'Mango', 'which', 'is', 'very', 'tasty']


In [5]:
# Step 2: POS Tagging
# Tag the full, unfiltered token sequence; the result is a list of
# (token, tag) pairs with Penn Treebank style tags such as 'NNP' or 'VBG'.
pos_tags = nltk.pos_tag(tokens)

print("\nPOS Tagging:", pos_tags, sep="\n")


POS Tagging:
[('Hi', 'NNP'), ('I', 'PRP'), ('am', 'VBP'), ('Eating', 'VBG'), ('a', 'DT'), ('Mango', 'NNP'), ('which', 'WDT'), ('is', 'VBZ'), ('very', 'RB'), ('tasty', 'JJ')]


In [7]:
# Step 3: Stop Words Removal
# English stop words as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# Individual punctuation characters. A token is treated as punctuation when it
# consists only of these characters. A plain `w not in string.punctuation` would
# be a substring test against the punctuation string, which lets multi-character
# tokens such as "...", "''" or "--" slip through.
punctuation_chars = set(string.punctuation)

# Keep a token when its lowercase form is not a stop word and it is not made up
# entirely of punctuation. The original casing of kept tokens is preserved.
filtered_tokens = [
    w for w in tokens
    if w.lower() not in stop_words and not (set(w) <= punctuation_chars)
]
print("\nAfter Stop Words Removal:")
print(filtered_tokens)


After Stop Words Removal:
['Hi', 'Eating', 'Mango', 'tasty']


In [8]:
 # Step 4: Stemming
# Reduce each token to its Porter stem. This runs on the raw `tokens` list
# (stop words included), not on `filtered_tokens`; the stems come back lowercased.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))

print("\nAfter Stemming:", stemmed_tokens, sep="\n")


After Stemming:
['hi', 'i', 'am', 'eat', 'a', 'mango', 'which', 'is', 'veri', 'tasti']


In [9]:
# Step 5: Lemmatization
# Lowercase the stop-word-filtered tokens and look each one up in WordNet.
# NOTE(review): lemmatize() is called without a part of speech, so the lookup
# uses its default POS and verb forms such as "eating" come back unchanged.
# Passing tags derived from `pos_tags` would reduce them - confirm whether
# that is wanted here.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(lowered) for lowered in map(str.lower, filtered_tokens)
]

print("\nAfter Lemmatization:", lemmatized_tokens, sep="\n")


After Lemmatization:
['hi', 'eating', 'mango', 'tasty']


In [10]:
# Step 6: TF-IDF Representation
# The corpus holds just the sample sentence; append more documents to compare them.
corpus = [doc] # Add more documents to this list if needed

# Fit the vectorizer on the corpus and produce the document-term weight matrix.
# The vectorizer does its own tokenization of the raw text (lowercased, and
# single-character words such as "I" and "a" do not become features).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# One row per document, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("\nTF-IDF Representation:", tfidf_df, sep="\n")


TF-IDF Representation:
         am    eating        hi        is     mango     tasty      very  \
0  0.353553  0.353553  0.353553  0.353553  0.353553  0.353553  0.353553   

      which  
0  0.353553  


In [11]:
 # Step 7: CountVectorizer Representation
# Raw term counts for the same `corpus` used in the TF-IDF step.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(corpus)

# One row per document, one column per vocabulary term, cells hold occurrence counts.
count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

print("\nCountVectorizer Representation:", count_df, sep="\n")


CountVectorizer Representation:
   am  eating  hi  is  mango  tasty  very  which
0   1       1   1   1      1      1     1      1
