In [8]:
import nltk
import string
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to C:\Users\Gaurang
[nltk_data]     Vaghela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gaurang
[nltk_data]     Vaghela\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gaurang
[nltk_data]     Vaghela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Gaurang Vaghela\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [17]:
file = open("C:/Users/Gaurang Vaghela/OneDrive/Desktop/Dataset/Text.txt",'r')
text = """Natural Language Processing (NLP) is a field of AI that enables computers to understand human language.
It involves text processing techniques like tokenization, stemming, and lemmatization."""

print(text)
print('\n')

tokens_sent = nltk.sent_tokenize(text)
print(tokens_sent)
print('\n')

tokens_words = nltk.word_tokenize(text)
print(tokens_words)

Natural Language Processing (NLP) is a field of AI that enables computers to understand human language.
It involves text processing techniques like tokenization, stemming, and lemmatization.


['Natural Language Processing (NLP) is a field of AI that enables computers to understand human language.', 'It involves text processing techniques like tokenization, stemming, and lemmatization.']


['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'AI', 'that', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.', 'It', 'involves', 'text', 'processing', 'techniques', 'like', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [18]:
tagged = pos_tag(tokens_words)
print(tagged)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('AI', 'NNP'), ('that', 'WDT'), ('enables', 'VBZ'), ('computers', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('text', 'JJ'), ('processing', 'VBG'), ('techniques', 'NNS'), ('like', 'IN'), ('tokenization', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


In [11]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens_words if word.lower() not in stop_words and word not in string.punctuation]
print("Filtered Tokens (Stopwords Removed):", filtered_tokens)

Filtered Tokens (Stopwords Removed): ['Natural', 'Language', 'Processing', 'NLP', 'field', 'AI', 'enables', 'computers', 'understand', 'human', 'language', 'involves', 'text', 'processing', 'techniques', 'like', 'tokenization', 'stemming', 'lemmatization']


In [12]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'field', 'ai', 'enabl', 'comput', 'understand', 'human', 'languag', 'involv', 'text', 'process', 'techniqu', 'like', 'token', 'stem', 'lemmat']


In [13]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'field', 'AI', 'enables', 'computer', 'understand', 'human', 'language', 'involves', 'text', 'processing', 'technique', 'like', 'tokenization', 'stemming', 'lemmatization']


In [14]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokens_sent)

# Convert TF-IDF matrix to DataFrame for better visualization
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Representation:\n", df_tfidf)


TF-IDF Representation:
          ai       and  computers   enables     field     human  involves  \
0  0.246136  0.000000   0.246136  0.246136  0.246136  0.246136  0.000000   
1  0.000000  0.324336   0.000000  0.000000  0.000000  0.000000  0.324336   

         is        it  language  ...       nlp        of  processing  \
0  0.246136  0.000000  0.492273  ...  0.246136  0.246136    0.175128   
1  0.000000  0.324336  0.000000  ...  0.000000  0.000000    0.230768   

   stemming  techniques      text      that        to  tokenization  \
0  0.000000    0.000000  0.000000  0.246136  0.246136      0.000000   
1  0.324336    0.324336  0.324336  0.000000  0.000000      0.324336   

   understand  
0    0.246136  
1    0.000000  

[2 rows x 23 columns]
