In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

text = """Natural Language Toolkit (NLTK) is one of the largest Python 
libraries for performing various Natural Language Processing tasks. 
From rudimentary tasks such as text pre-processing to tasks like 
vectorized representation of text – NLTK’s API has covered everything."""

words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Words:", words)
print("\nSentences:", sentences)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Words: ['Natural', 'Language', 'Toolkit', '(', 'NLTK', ')', 'is', 'one', 'of', 'the', 'largest', 'Python', 'libraries', 'for', 'performing', 'various', 'Natural', 'Language', 'Processing', 'tasks', '.', 'From', 'rudimentary', 'tasks', 'such', 'as', 'text', 'pre-processing', 'to', 'tasks', 'like', 'vectorized', 'representation', 'of', 'text', '–', 'NLTK', '’', 's', 'API', 'has', 'covered', 'everything', '.']

Sentences: ['Natural Language Toolkit (NLTK) is one of the largest Python \nlibraries for performing various Natural Language Processing tasks.', 'From rudimentary tasks such as text pre-processing to tasks like \nvectorized representation of text – NLTK’s API has covered everything.']


In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = """Natural Language Toolkit (NLTK) works as a powerful Python 
library that a wide range of tools for Natural Language Processing 
(NLP). From fundamental tasks like text pre-processing to more 
advanced operations such as semantic reasoning, NLTK provides a 
versatile API that caters to the diverse needs of language-related 
tasks."""

stop_words = set(stopwords.words('english'))
words = word_tokenize(text)
filtered_text = [word for word in words if word.lower() not in stop_words]

print("Filtered Text:", filtered_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Filtered Text: ['Natural', 'Language', 'Toolkit', '(', 'NLTK', ')', 'works', 'powerful', 'Python', 'library', 'wide', 'range', 'tools', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', '.', 'fundamental', 'tasks', 'like', 'text', 'pre-processing', 'advanced', 'operations', 'semantic', 'reasoning', ',', 'NLTK', 'provides', 'versatile', 'API', 'caters', 'diverse', 'needs', 'language-related', 'tasks', '.']


In [5]:
import string

text = """Let’s eat, Grandma! 
Grandma, Let’s eat! 
Silvia, Are you free tomorrow? 
Yes, I’m free on Saturday."""

cleaned_text = text.lower().translate(str.maketrans('', '', string.punctuation))

print("Cleaned Text:", cleaned_text)

Cleaned Text: let’s eat grandma 
grandma let’s eat 
silvia are you free tomorrow 
yes i’m free on saturday


In [7]:
import re

text = """ @@Natural   Language Processing (NLP)!!!  is a    field of AI that  
focuses on  ...     
   enabling computers to understand,    interpret, & generate   human  
language.   
 NLP   includes  tasks like *tokenization, lemmatization,*  && 
sentiment analysis.     
 It  helps in   applications such as chatbots,   machine translation,  
and   voice assistants!!!   
 However,   cleaning text—removing   extra spaces, punctuations, && 
special $$$ characters—is crucial.     
 Without   preprocessing,  NLP models may not    perform    
accurately !!!     
 So,   can you clean this   messy text & make  it    structured???   """

cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra spaces

print("Cleaned Text:", cleaned_text)

Cleaned Text: Natural Language Processing NLP is a field of AI that focuses on enabling computers to understand interpret generate human language NLP includes tasks like tokenization lemmatization sentiment analysis It helps in applications such as chatbots machine translation and voice assistants However cleaning textremoving extra spaces punctuations special charactersis crucial Without preprocessing NLP models may not perform accurately So can you clean this messy text make it structured


In [9]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

text = """The researchers are analyzing various datasets to study the effects 
of automation. They observed that automated systems perform tasks more 
efficiently than humans. Many industries have been adopting AI-driven 
solutions to improve productivity. Running complex algorithms helps in 
predicting future trends accurately. Several companies are investing in 
developing smarter and more adaptive models. Data scientists continuously 
refine their models to achieve better performance. The advancements in 
technology have transformed the way businesses operate."""

words = word_tokenize(text)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_words = [stemmer.stem(word) for word in words]
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Stemmed Words:", stemmed_words)
print("\nLemmatized Words:", lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fahee\AppData\Roaming\nltk_data...


Stemmed Words: ['the', 'research', 'are', 'analyz', 'variou', 'dataset', 'to', 'studi', 'the', 'effect', 'of', 'autom', '.', 'they', 'observ', 'that', 'autom', 'system', 'perform', 'task', 'more', 'effici', 'than', 'human', '.', 'mani', 'industri', 'have', 'been', 'adopt', 'ai-driven', 'solut', 'to', 'improv', 'product', '.', 'run', 'complex', 'algorithm', 'help', 'in', 'predict', 'futur', 'trend', 'accur', '.', 'sever', 'compani', 'are', 'invest', 'in', 'develop', 'smarter', 'and', 'more', 'adapt', 'model', '.', 'data', 'scientist', 'continu', 'refin', 'their', 'model', 'to', 'achiev', 'better', 'perform', '.', 'the', 'advanc', 'in', 'technolog', 'have', 'transform', 'the', 'way', 'busi', 'oper', '.']

Lemmatized Words: ['The', 'researcher', 'are', 'analyzing', 'various', 'datasets', 'to', 'study', 'the', 'effect', 'of', 'automation', '.', 'They', 'observed', 'that', 'automated', 'system', 'perform', 'task', 'more', 'efficiently', 'than', 'human', '.', 'Many', 'industry', 'have', 'bee