In [18]:
import numpy as np
import pandas as pd
import nltk

In [19]:
with open("document.txt", "r", encoding="utf-8-sig") as file:
    document = file.read()
print(document)

Text analytics is the process of extracting meaningful insights from textual data. It involves various preprocessing techniques such as tokenization, stop-word removal, stemming, and lemmatization. These techniques help in structuring unstructured text, making it easier to analyze.

Natural Language Processing (NLP) plays a significant role in text analytics by enabling machines to understand and process human language. Applications of text analytics include sentiment analysis, document classification, and information retrieval.

By computing Term Frequency-Inverse Document Frequency (TF-IDF), we can identify important words in a document relative to a collection of documents.



In [20]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [21]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/shreenath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/shreenath/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shreenath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/shreenath/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/shreenath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/shreenath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [22]:
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(document)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nTokens after Stop Word Removal:\n\n", filtered_tokens)


Tokens after Stop Word Removal:

 ['Text', 'analytics', 'process', 'extracting', 'meaningful', 'insights', 'textual', 'data', '.', 'involves', 'various', 'preprocessing', 'techniques', 'tokenization', ',', 'stop-word', 'removal', ',', 'stemming', ',', 'lemmatization', '.', 'techniques', 'help', 'structuring', 'unstructured', 'text', ',', 'making', 'easier', 'analyze', '.', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'plays', 'significant', 'role', 'text', 'analytics', 'enabling', 'machines', 'understand', 'process', 'human', 'language', '.', 'Applications', 'text', 'analytics', 'include', 'sentiment', 'analysis', ',', 'document', 'classification', ',', 'information', 'retrieval', '.', 'computing', 'Term', 'Frequency-Inverse', 'Document', 'Frequency', '(', 'TF-IDF', ')', ',', 'identify', 'important', 'words', 'document', 'relative', 'collection', 'documents', '.']


In [23]:
from nltk.stem import PorterStemmer
from tabulate import tabulate

sentences = sent_tokenize(document)

df = pd.DataFrame({'Original Sentence': sentences})

stemmer = PorterStemmer()

def stem_words(sentence):
    words = word_tokenize(sentence)
    return ' '.join([stemmer.stem(word) for word in words])

df['Stemmed Sentence'] = df['Original Sentence'].apply(stem_words)

pd.set_option('display.max_colwidth', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)

print("\nProcessed DataFrame (Stemmed Sentences):\n")
print(tabulate(df, headers='keys', tablefmt='grid'))


Processed DataFrame (Stemmed Sentences):

+----+-------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|    | Original Sentence                                                                                                                                     | Stemmed Sentence                                                                                                                        |
|  0 | Text analytics is the process of extracting meaningful insights from textual data.                                                                    | text analyt is the process of extract meaning insight from textual data .                                                               |
+----+------------------------------------------------------------

In [24]:
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate

sentences = sent_tokenize(document)

df = pd.DataFrame({'Original Sentence': sentences})

lemmatizer = WordNetLemmatizer()

def lemmatize_words(sentence):
    words = word_tokenize(sentence)
    return ' '.join([lemmatizer.lemmatize(word) for word in words])

df['Lemmatized Sentence'] = df['Original Sentence'].apply(lemmatize_words)

pd.set_option('display.max_colwidth', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)

print("\nProcessed DataFrame (Lemmatized Sentences):\n")
print(tabulate(df, headers='keys', tablefmt='grid'))


Processed DataFrame (Lemmatized Sentences):

+----+-------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|    | Original Sentence                                                                                                                                     | Lemmatized Sentence                                                                                                                                     |
|  0 | Text analytics is the process of extracting meaningful insights from textual data.                                                                    | Text analytics is the process of extracting meaningful insight from textual data .                                                                      |
+----+---------

In [25]:
import string

tokens = word_tokenize(document)
tokens = [word for word in tokens if word not in string.punctuation]

df = pd.DataFrame({'tokenized_word:': tokens})

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

print("\nProcessed DataFrame:\n")
print(df.to_string(index=True))


Processed DataFrame:

      tokenized_word:
0                Text
1           analytics
2                  is
3                 the
4             process
5                  of
6          extracting
7          meaningful
8            insights
9                from
10            textual
11               data
12                 It
13           involves
14            various
15      preprocessing
16         techniques
17               such
18                 as
19       tokenization
20          stop-word
21            removal
22           stemming
23                and
24      lemmatization
25              These
26         techniques
27               help
28                 in
29        structuring
30       unstructured
31               text
32             making
33                 it
34             easier
35                 to
36            analyze
37            Natural
38           Language
39         Processing
40                NLP
41              plays
42                  a
43       

In [26]:
from nltk import pos_tag

tokens = word_tokenize(document)
tokens = [word for word in tokens if word not in string.punctuation]

pos_tags = pos_tag(tokens)

df = pd.DataFrame(pos_tags, columns=['Tokenized Word', 'POS Tag'])

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

print("\nProcessed DataFrame (POS Tagging):\n")
print(df.to_string(index=True))


Processed DataFrame (POS Tagging):

       Tokenized Word POS Tag
0                Text      NN
1           analytics     NNS
2                  is     VBZ
3                 the      DT
4             process      NN
5                  of      IN
6          extracting     VBG
7          meaningful      JJ
8            insights     NNS
9                from      IN
10            textual      JJ
11               data     NNS
12                 It     PRP
13           involves     VBZ
14            various      JJ
15      preprocessing     VBG
16         techniques     NNS
17               such      JJ
18                 as      IN
19       tokenization      NN
20          stop-word      NN
21            removal      NN
22           stemming      NN
23                and      CC
24      lemmatization      NN
25              These      DT
26         techniques     NNS
27               help     VBP
28                 in      IN
29        structuring     VBG
30       unstructured      JJ
31 

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = sent_tokenize(document)

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(sentences)

df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.3f}'.format)

print("\nTF-IDF Representation of the Uploaded Document:\n")
print(df_tfidf.round(3).T.to_string(index=True))


TF-IDF Representation of the Uploaded Document:

                   0     1     2     3     4     5
analysis       0.000 0.000 0.000 0.000 0.325 0.000
analytics      0.219 0.000 0.000 0.165 0.225 0.000
analyze        0.000 0.000 0.322 0.000 0.000 0.000
and            0.000 0.193 0.000 0.165 0.225 0.000
applications   0.000 0.000 0.000 0.000 0.325 0.000
as             0.000 0.279 0.000 0.000 0.000 0.000
by             0.000 0.000 0.000 0.195 0.000 0.176
can            0.000 0.000 0.000 0.000 0.000 0.214
classification 0.000 0.000 0.000 0.000 0.325 0.000
collection     0.000 0.000 0.000 0.000 0.000 0.214
computing      0.000 0.000 0.000 0.000 0.000 0.214
data           0.316 0.000 0.000 0.000 0.000 0.000
document       0.000 0.000 0.000 0.000 0.267 0.351
documents      0.000 0.000 0.000 0.000 0.000 0.214
easier         0.000 0.000 0.322 0.000 0.000 0.000
enabling       0.000 0.000 0.000 0.238 0.000 0.000
extracting     0.316 0.000 0.000 0.000 0.000 0.000
frequency      0.000 0.000 0.000