In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

text = "Visit our NLP toolkit page at https://example.com/nlp-toolkit?utm_source=chat to download the installer and sample data (see the quick-start guide at https://example.com/docs/quick-start.pdf); if you need help, email support at support@example.com or tweet us @NLP_Toolkit with the hashtag #NLP — we also keep short video tutorials on youtube.com/channel/EXAMPLE and a changelog at https://example.com/changelog so you’ll always know what’s new."

# 1) Lowercasing
text = text.lower()

# 2) Remove URLs
text = re.sub(r"http\S+|www\S+|https\S+", "", text)

# 3) Remove HTML tags
text = re.sub(r"<.*?>", "", text)

# 4) Remove numbers
text = re.sub(r"\d+", "", text)

# 5) Remove punctuation (except apostrophes)
text = re.sub(r"[^\w\s']", "", text)

# 6) Remove extra spaces
text = re.sub(r"\s+", " ", text).strip()
print("Cleaned:", text)

# 7) Tokenization
tokens = word_tokenize(text)

# 8) Stopword removal
stop_words = set(stopwords.words("english"))
filtered = [w for w in tokens if w not in stop_words]

# 9) Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]

# 10) Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

print("Tokens:", tokens)
print("Filtered:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)

Cleaned: visit our nlp toolkit page at to download the installer and sample data see the quickstart guide at if you need help email support at supportexamplecom or tweet us nlp_toolkit with the hashtag nlp we also keep short video tutorials on youtubecomchannelexample and a changelog at so youll always know whats new
Tokens: ['visit', 'our', 'nlp', 'toolkit', 'page', 'at', 'to', 'download', 'the', 'installer', 'and', 'sample', 'data', 'see', 'the', 'quickstart', 'guide', 'at', 'if', 'you', 'need', 'help', 'email', 'support', 'at', 'supportexamplecom', 'or', 'tweet', 'us', 'nlp_toolkit', 'with', 'the', 'hashtag', 'nlp', 'we', 'also', 'keep', 'short', 'video', 'tutorials', 'on', 'youtubecomchannelexample', 'and', 'a', 'changelog', 'at', 'so', 'youll', 'always', 'know', 'whats', 'new']
Filtered: ['visit', 'nlp', 'toolkit', 'page', 'download', 'installer', 'sample', 'data', 'see', 'quickstart', 'guide', 'need', 'help', 'email', 'support', 'supportexamplecom', 'tweet', 'us', 'nlp_toolkit', 

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
text

'visit our nlp toolkit page at to download the installer and sample data see the quickstart guide at if you need help email support at supportexamplecom or tweet us nlp_toolkit with the hashtag nlp we also keep short video tutorials on youtubecomchannelexample and a changelog at so youll always know whats new'