In [6]:
!pip install nltk



In [16]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
import string

# Downloading required datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng') # Added to resolve LookupError for POS tagging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [25]:
text = """
Elon Musk is the Richest person on the planet earth.
He is going to IPO his most valued company SpaceX in 2026.
SpaceX is a pvt ltd Company now.
Larry page is the second richest guy.
His company is Google.
That is already in stock markets
"""

print("Original Text:\n")
print(text)


Original Text:


Elon Musk is the Richest person on the planet earth.
He is going to IPO his most valued company SpaceX in 2026.
SpaceX is a pvt ltd Company now.
Larry page is the second richest guy.
His company is Google.
That is already in stock markets



In [26]:
sentences = sent_tokenize(text)

print("Sentence Tokenization:\n")
for s in sentences:
    print(s)


Sentence Tokenization:


Elon Musk is the Richest person on the planet earth.
He is going to IPO his most valued company SpaceX in 2026.
SpaceX is a pvt ltd Company now.
Larry page is the second richest guy.
His company is Google.
That is already in stock markets


In [27]:
words = word_tokenize(text)

print("Word Tokenization:\n")
print(words)


Word Tokenization:

['Elon', 'Musk', 'is', 'the', 'Richest', 'person', 'on', 'the', 'planet', 'earth', '.', 'He', 'is', 'going', 'to', 'IPO', 'his', 'most', 'valued', 'company', 'SpaceX', 'in', '2026', '.', 'SpaceX', 'is', 'a', 'pvt', 'ltd', 'Company', 'now', '.', 'Larry', 'page', 'is', 'the', 'second', 'richest', 'guy', '.', 'His', 'company', 'is', 'Google', '.', 'That', 'is', 'already', 'in', 'stock', 'markets']


In [28]:
stop_words = set(stopwords.words("english"))

filtered_words = [
    word for word in words
    if word.lower() not in stop_words and word not in string.punctuation
]

print("After Stopword Removal:\n")
print(filtered_words)


After Stopword Removal:

['Elon', 'Musk', 'Richest', 'person', 'planet', 'earth', 'going', 'IPO', 'valued', 'company', 'SpaceX', '2026', 'SpaceX', 'pvt', 'ltd', 'Company', 'Larry', 'page', 'second', 'richest', 'guy', 'company', 'Google', 'already', 'stock', 'markets']


In [29]:
pos_tags = nltk.pos_tag(words)

print("POS Tagging:\n")
print(pos_tags)


POS Tagging:

[('Elon', 'NNP'), ('Musk', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('Richest', 'NNP'), ('person', 'NN'), ('on', 'IN'), ('the', 'DT'), ('planet', 'NN'), ('earth', 'NN'), ('.', '.'), ('He', 'PRP'), ('is', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('IPO', 'VB'), ('his', 'PRP$'), ('most', 'JJS'), ('valued', 'VBN'), ('company', 'NN'), ('SpaceX', 'NNP'), ('in', 'IN'), ('2026', 'CD'), ('.', '.'), ('SpaceX', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('pvt', 'NN'), ('ltd', 'JJ'), ('Company', 'NNP'), ('now', 'RB'), ('.', '.'), ('Larry', 'NNP'), ('page', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('second', 'JJ'), ('richest', 'NN'), ('guy', 'NN'), ('.', '.'), ('His', 'PRP$'), ('company', 'NN'), ('is', 'VBZ'), ('Google', 'NNP'), ('.', '.'), ('That', 'DT'), ('is', 'VBZ'), ('already', 'RB'), ('in', 'IN'), ('stock', 'NN'), ('markets', 'NNS')]


In [30]:
fdist = FreqDist(filtered_words)

print("Most Common Words:\n")
print(fdist.most_common(5))


Most Common Words:

[('company', 2), ('SpaceX', 2), ('Elon', 1), ('Musk', 1), ('Richest', 1)]
