In [1]:
# Required Imports

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string


In [2]:
# Download necessary NLTK data
#
# word_tokenize needs the Punkt sentence models and pos_tag needs the
# averaged-perceptron tagger.  Newer NLTK releases look these up under the
# "punkt_tab" / "averaged_perceptron_tagger_eng" names, older ones under
# "punkt" / "averaged_perceptron_tagger"; fetching both spellings keeps the
# notebook runnable from a fresh environment whichever version is installed.

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample document: one sentence that the tokenization, POS-tagging,
# stop-word removal and stemming steps operate on.

corpus = (
    "I am Siddhi Horane from sinhgad institute "
    "of technology and science lonavala ."
)

In [4]:
# Step 1: Tokenization

# Split the sample sentence into tokens (punctuation becomes its own token).
tokens = word_tokenize(corpus, language="english")
print("Tokens:", tokens)

Tokens: ['I', 'am', 'Siddhi', 'Horane', 'from', 'sinhgad', 'institute', 'of', 'technology', 'and', 'science', 'lonavala', '.']


In [5]:
# Step 2: POS Tagging

from nltk import pos_tag

In [6]:
# Tag every token with its part of speech.  `tokens` already holds the result
# of word_tokenize(corpus) from the tokenization step, so it is reused here
# instead of tokenizing the same sentence a second time.  The (token, tag)
# pairs are kept in a variable so they stay available after printing.
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)

[('I', 'PRP'), ('am', 'VBP'), ('Siddhi', 'NNP'), ('Horane', 'NNP'), ('from', 'IN'), ('sinhgad', 'JJ'), ('institute', 'NN'), ('of', 'IN'), ('technology', 'NN'), ('and', 'CC'), ('science', 'NN'), ('lonavala', 'NN'), ('.', '.')]


In [7]:
# Step 3: Remove stop words and punctuation
#
# A token is treated as punctuation when every character in it is a
# punctuation mark.  A plain `word not in string.punctuation` is a substring
# search on the punctuation string, so multi-character tokens such as "``",
# "''", "..." or "--" would not match and would leak into the cleaned list.

stop_words = set(stopwords.words("english"))
punctuation_chars = set(string.punctuation)
cleaned = [
    word.lower()
    for word in tokens
    if word.lower() not in stop_words
    and not all(char in punctuation_chars for char in word)
]
print("Cleaned Words:", cleaned)


Cleaned Words: ['siddhi', 'horane', 'sinhgad', 'institute', 'technology', 'science', 'lonavala']


In [8]:
# Step 4: Stemming
#
# Reduce each cleaned word to its Porter stem, preserving word order.

stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, cleaned))
print("Stemmed Words:", stemmed)

Stemmed Words: ['siddhi', 'horan', 'sinhgad', 'institut', 'technolog', 'scienc', 'lonavala']


In [9]:
# Step 5: TF-IDF on two simple sentences

from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Sample documents for TF-IDF: two short sentences, one string per document.
# NOTE: this rebinds `corpus`, which held a single sentence (a str) in the
# earlier steps, to a list of strings.

corpus = [
    "Siddhi Horane is a computer engineering student.",
    "she is from sinhgad institute of technology lonavala.",
]

In [11]:
# Create the TF-IDF vectorizer with its default settings; it is not fitted
# to any documents yet.

vectorizer = TfidfVectorizer()

In [13]:
# Fit the vectorizer on the documents and transform them in one call, then
# show the learned vocabulary as a {term: column index} mapping.

matrix = vectorizer.fit_transform(corpus)
term_to_column = vectorizer.vocabulary_
print(term_to_column)

{'siddhi': 9, 'horane': 3, 'is': 5, 'computer': 0, 'engineering': 1, 'student': 11, 'she': 8, 'from': 2, 'sinhgad': 10, 'institute': 4, 'of': 7, 'technology': 12, 'lonavala': 6}


In [14]:
# Step 6: Show TF-IDF values
#
# Transform the documents with the already-fitted vectorizer and print the
# result.  Each printed line reads "(document index, term column index)
# weight"; only non-zero entries are listed.

tfidf_matrix = vectorizer.transform(corpus)
print(tfidf_matrix)

  (0, 11)	0.42615959880289433
  (0, 9)	0.42615959880289433
  (0, 5)	0.3032160644503863
  (0, 3)	0.42615959880289433
  (0, 1)	0.42615959880289433
  (0, 0)	0.42615959880289433
  (1, 12)	0.3649964681447582
  (1, 10)	0.3649964681447582
  (1, 8)	0.3649964681447582
  (1, 7)	0.3649964681447582
  (1, 6)	0.3649964681447582
  (1, 5)	0.25969799324016246
  (1, 4)	0.3649964681447582
  (1, 2)	0.3649964681447582
