In [1]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
text= '''Tokenization is the first step in text analytics. The
process of breaking down a text paragraph into smaller chunks
such as words or sentences is called Tokenization.'''

Tokenization

In [5]:
#Sentence Tokenization
tokenized_text= sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'The\nprocess of breaking down a text paragraph into smaller chunks\nsuch as words or sentences is called Tokenization.']


In [6]:
#Word Tokenization
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


Removing Punctuations and Stop Word

In [7]:
# print stop words of English
stop_words=set(stopwords.words("english"))
print(stop_words)

{'didn', 'do', 'who', "shan't", 'while', 'both', 'then', 'why', 't', 'yours', 'where', "it's", 'off', 'their', 'o', 'other', "we'll", "you're", 'having', "she'd", "they'd", "should've", 'you', 'no', 'i', 'her', 'such', 'between', "couldn't", 'each', 'these', 'are', 'same', "aren't", 'does', 'or', "mightn't", 'before', 'most', 'once', 'under', 'but', 'any', 'so', 'what', 'some', 'below', 'was', 'his', 'll', 'won', 'over', 'wouldn', 'here', 'out', 'yourselves', 'aren', "i'd", "she's", "weren't", 'if', "didn't", "that'll", "he's", 'until', "hasn't", 'mightn', 'up', 'itself', "she'll", "you'd", 'because', 'again', "we'd", "he'd", 'have', 'they', 'am', 'theirs', 'how', 'than', 've', 'your', 'it', 'own', "it'd", 'which', 'ours', 'against', 'down', 'only', 'ourselves', "we've", 'ma', 'she', "you've", "they've", 'our', "they're", 'more', 'a', 'when', 'with', 'as', 'wasn', 'shan', 'will', 'during', "it'll", 'should', 'shouldn', 'this', 'be', 'being', 'ain', 'doesn', 'few', 'further', "we're", "

In [9]:
text= "How to remove stop words with NLTK library in Python?"
text= re.sub('[^a-zA-Z]', ' ',text)
tokens = word_tokenize(text.lower())
filtered_text=[]
for w in tokens:
  if w not in stop_words:
    filtered_text.append(w)
print("Tokenized Sentence:",tokens)
print("Filterd Sentence:",filtered_text)

Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filterd Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


Perform Stemming

In [11]:
e_words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
for w in e_words:
  rootWord=ps.stem(w)
  print(rootWord)

wait
wait
wait
wait


Perform Lemmatization

In [12]:
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
  print("Lemma for {} is {}".format(w,wordnet_lemmatizer.lemmatize(w)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


Apply POS Tagging to text

In [13]:
data="The pink sweater fit her perfectly"
words=word_tokenize(data)
for word in words:
  print(nltk.pos_tag([word]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


TF-IDF Representation of Documents

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Multiple sample documents (for IDF to make sense)
documents = [
    "Natural Language Processing is an interesting field.",
    "Machine learning and AI are transforming the world.",
    "Python is widely used for data science and AI.",
    "Natural Language Processing and machine learning are core parts of AI."
]

In [17]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform documents
tfidf_matrix = vectorizer.fit_transform(documents)

In [18]:
# Show feature names and TF-IDF matrix
print("\nFeature Names:", vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())


Feature Names: ['ai' 'core' 'data' 'field' 'interesting' 'language' 'learning' 'machine'
 'natural' 'parts' 'processing' 'python' 'science' 'transforming' 'used'
 'widely' 'world']

TF-IDF Matrix:
 [[0.         0.         0.         0.50867187 0.50867187 0.40104275
  0.         0.         0.40104275 0.         0.40104275 0.
  0.         0.         0.         0.         0.        ]
 [0.33406745 0.         0.         0.         0.         0.
  0.41263976 0.41263976 0.         0.         0.         0.
  0.         0.52338122 0.         0.         0.52338122]
 [0.27448674 0.         0.43003652 0.         0.         0.
  0.         0.         0.         0.         0.         0.43003652
  0.43003652 0.         0.43003652 0.43003652 0.        ]
 [0.27178692 0.42580674 0.         0.         0.         0.33571092
  0.33571092 0.33571092 0.33571092 0.42580674 0.33571092 0.
  0.         0.         0.         0.         0.        ]]
