In [2]:
import nltk
import shutil
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer


# OPTIONAL: Reset NLTK data (uncomment only if you want to redownload)
# shutil.rmtree("C:/Users/nanda/AppData/Roaming/nltk_data", ignore_errors=True)

# Fetch every NLTK resource the pipeline below relies on (tokenizer models,
# POS tagger models, the stop-word list and WordNet).
for nltk_package in (
    'punkt',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_package)
# Preprocessing functions
def preprocess_text(text):
    """Lowercase *text* and replace every punctuation character with a space.

    Each punctuation/special character is replaced by a single space rather
    than deleted, so words separated only by punctuation do not get fused
    together. Runs of whitespace are left as-is.

    Args:
        text: The raw input string.

    Returns:
        The lowercased string containing only letters, digits and whitespace.
    """
    text = text.lower()
    # ``\w`` also matches the underscore, so ``[^\w\s]`` alone would leave
    # ``_`` in place; match it explicitly so it is treated as punctuation too.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return text

def tokenize_text(text):
    """Split *text* into word tokens with NLTK's ``word_tokenize``."""
    word_tokens = word_tokenize(text)
    return word_tokens

def pos_tag_tokens(tokens):
    """Label each token with its part of speech using NLTK's ``pos_tag``."""
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens

def remove_stop_words(tokens):
    """Return the tokens that are not English stop words.

    The comparison is done on the lowercased token, while the token itself
    is returned unchanged and in its original order.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept

def stem_tokens(tokens):
    """Return the Porter stem of every token, preserving order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatize_tokens(tokens):
    """Lemmatize tokens, using each token's POS tag to guide WordNet.

    The tokens are POS-tagged with ``pos_tag``; the first letter of each tag
    selects the WordNet part of speech handed to the lemmatizer. Tags that
    start with anything other than J, V, N or R are treated as nouns.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    wordnet_pos_by_prefix = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        wordnet_pos = wordnet_pos_by_prefix.get(treebank_tag[:1], wn.NOUN)
        lemmas.append(wordnet_lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

def get_tfidf_representation(documents):
    """Fit a default ``TfidfVectorizer`` on *documents*.

    Returns:
        A ``(matrix, feature_names)`` tuple: the result of ``fit_transform``
        on the documents and the vectorizer's output feature names.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    return weights, feature_names

# Sample document
text = "I am a student. Hello!! There is a session going on."

# Run the pipeline one stage at a time, keeping every intermediate result.
preprocessed_text = preprocess_text(text)
tokens = tokenize_text(preprocessed_text)
pos_tags = pos_tag_tokens(tokens)
filtered_tokens = remove_stop_words(tokens)
# Stemming and lemmatization are both applied to the stop-word-filtered
# tokens, independently of each other.
stemmed_tokens = stem_tokens(filtered_tokens)
lemmatized_tokens = lemmatize_tokens(filtered_tokens)

# TF-IDF is computed on the preprocessed text as a one-document corpus.
tfidf_matrix, tfidf_features = get_tfidf_representation([preprocessed_text])

# Report each stage under its own heading.
for heading, stage_result in (
    ("Original Text:\n", text),
    ("\nPreprocessed Text:\n", preprocessed_text),
    ("\nTokens:\n", tokens),
    ("\nPOS Tags:\n", pos_tags),
    ("\nFiltered Tokens (Stop Words Removed):\n", filtered_tokens),
    ("\nStemmed Tokens:\n", stemmed_tokens),
    ("\nLemmatized Tokens:\n", lemmatized_tokens),
    ("\nTF-IDF Features:\n", tfidf_features),
    ("\nTF-IDF Matrix:\n", tfidf_matrix.toarray()),
):
    print(heading, stage_result)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\a

Original Text:
 I am a student. Hello!! There is a session going on.

Preprocessed Text:
 i am a student  hello   there is a session going on 

Tokens:
 ['i', 'am', 'a', 'student', 'hello', 'there', 'is', 'a', 'session', 'going', 'on']

POS Tags:
 [('i', 'NN'), ('am', 'VBP'), ('a', 'DT'), ('student', 'NN'), ('hello', 'NN'), ('there', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('session', 'NN'), ('going', 'VBG'), ('on', 'IN')]

Filtered Tokens (Stop Words Removed):
 ['student', 'hello', 'session', 'going']

Stemmed Tokens:
 ['student', 'hello', 'session', 'go']

Lemmatized Tokens:
 ['student', 'hello', 'session', 'go']

TF-IDF Features:
 ['am' 'going' 'hello' 'is' 'on' 'session' 'student' 'there']

TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]
