In [1]:
# !pip install nltk scikit-learn pandas

import nltk
import re
import pandas as pd

print("Downloading NLTK resources...")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [3]:
raw_text = """
Text analytics is the process of deriving high-quality information from text. 
It involves structuring the input text, deriving patterns within the structured data, 
and finally, the interpretation and evaluation of the output. 
High-quality information is typically derived through the devising of patterns and trends.
"""

print("--- Raw Text ---")
print(raw_text)

--- Raw Text ---

Text analytics is the process of deriving high-quality information from text. 
It involves structuring the input text, deriving patterns within the structured data, 
and finally, the interpretation and evaluation of the output. 
High-quality information is typically derived through the devising of patterns and trends.



In [4]:
tokens = word_tokenize(raw_text.lower())
print(f"1. Tokens (lowercase): \n{tokens}\n")

words = [word for word in tokens if word.isalpha()]
print(f"2. Words (no punctuation): \n{words}\n")

words_no_stopwords = [word for word in words if word not in stop_words]
print(f"3. Words (no stop words): \n{words_no_stopwords}\n")

lemmatized_words = [lemmatizer.lemmatize(word) for word in words_no_stopwords]
print(f"4. Lemmatized Words (final clean text): \n{lemmatized_words}\n")

processed_text = " ".join(lemmatized_words)

1. Tokens (lowercase): 
['text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'high-quality', 'information', 'from', 'text', '.', 'it', 'involves', 'structuring', 'the', 'input', 'text', ',', 'deriving', 'patterns', 'within', 'the', 'structured', 'data', ',', 'and', 'finally', ',', 'the', 'interpretation', 'and', 'evaluation', 'of', 'the', 'output', '.', 'high-quality', 'information', 'is', 'typically', 'derived', 'through', 'the', 'devising', 'of', 'patterns', 'and', 'trends', '.']

2. Words (no punctuation): 
['text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'information', 'from', 'text', 'it', 'involves', 'structuring', 'the', 'input', 'text', 'deriving', 'patterns', 'within', 'the', 'structured', 'data', 'and', 'finally', 'the', 'interpretation', 'and', 'evaluation', 'of', 'the', 'output', 'information', 'is', 'typically', 'derived', 'through', 'the', 'devising', 'of', 'patterns', 'and', 'trends']

3. Words (no stop words): 
['text', 'analytics', 'process', 

In [5]:
corpus = [processed_text]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

df_bow = pd.DataFrame(X.toarray(), columns=feature_names)

print("--- Bag-of-Words Feature Vector ---")
df_bow

--- Bag-of-Words Feature Vector ---


Unnamed: 0,analytics,data,derived,deriving,devising,evaluation,finally,information,input,interpretation,involves,output,pattern,process,structured,structuring,text,trend,typically,within
0,1,1,1,2,1,1,1,2,1,1,1,1,2,1,1,1,3,1,1,1


In [6]:
freq_dist = nltk.FreqDist(lemmatized_words)

print("--- Word Frequency Analysis ---")
print("Most common words in the text:")
freq_dist.most_common(5)

--- Word Frequency Analysis ---
Most common words in the text:


[('text', 3),
 ('deriving', 2),
 ('information', 2),
 ('pattern', 2),
 ('analytics', 1)]