In [None]:
import re
import spacy

# Load spaCy's small English pipeline by its package name. The model package
# must already be installed in the kernel's environment
# (e.g. `python -m spacy download en_core_web_sm`).
# `nlp` is called on the cleaned text further down in this cell.
nlp = spacy.load("en_core_web_sm")

# Raw sample kept under its own name so the cleaning step can be re-run or
# compared against its input.
raw_text = "NLP is amazing! Visit https://nlp.ai 🚀 It helps computers understand human language. <br> Running, runs, and ran are forms of 'run'. 12345"


def clean_text(raw: str) -> str:
    """Normalise a raw string for downstream tokenisation.

    Steps, in order:
      1. lowercase;
      2. drop URLs (``http://``, ``https://`` or ``www.`` prefixes);
      3. drop HTML tags;
      4. drop digits;
      5. drop punctuation and symbols, keeping apostrophes (and underscores,
         because ``\\w`` matches them);
      6. collapse whitespace runs to a single space and trim the ends.

    URLs, tags and punctuation are replaced by a space rather than the empty
    string so the words on either side are not fused together
    (``"one<br>two"`` -> ``"one two"``, not ``"onetwo"``).

    Args:
        raw: Text to clean. May be empty.

    Returns:
        The cleaned, lowercase, single-spaced string ("" for empty input).
    """
    cleaned = raw.lower()

    # Require "://" or "www." so ordinary words that merely start with
    # "http"/"www" (e.g. "httpd") are not deleted.
    cleaned = re.sub(r"https?://\S+|www\.\S+", " ", cleaned)

    # [^>]* instead of .*? so a tag broken across lines is still matched.
    cleaned = re.sub(r"<[^>]*>", " ", cleaned)

    # Digits are removed outright (no space inserted).
    cleaned = re.sub(r"\d+", "", cleaned)

    # Everything that is not a word character, whitespace or apostrophe
    # (punctuation, emoji, symbols) becomes a word boundary.
    cleaned = re.sub(r"[^\w\s']", " ", cleaned)

    # The substitutions above leave runs of spaces behind; collapse them last.
    return re.sub(r"\s+", " ", cleaned).strip()


text = clean_text(raw_text)

print("Cleaned:", text)

# 7) Run the spaCy pipeline over the cleaned string
doc = nlp(text)

# 8) Tokenization: the surface text of every token in the Doc
tokens = [tok.text for tok in doc]

# 9) + 10) Stopword removal and lemmatization share one selection:
# keep tokens that are purely alphabetic and not flagged as stopwords,
# then read the surface form and the lemma from those same tokens.
content_tokens = [tok for tok in doc if tok.is_alpha and not tok.is_stop]
filtered = [tok.text for tok in content_tokens]
lemmatized = [tok.lemma_ for tok in content_tokens]

print("Tokens:", tokens)
print("Filtered:", filtered)
print("Lemmatized:", lemmatized)

Cleaned: nlp is amazing visit it helps computers understand human language running runs and ran are forms of 'run'
Tokens: ['nlp', 'is', 'amazing', 'visit', 'it', 'helps', 'computers', 'understand', 'human', 'language', 'running', 'runs', 'and', 'ran', 'are', 'forms', 'of', "'", 'run', "'"]
Filtered: ['nlp', 'amazing', 'visit', 'helps', 'computers', 'understand', 'human', 'language', 'running', 'runs', 'ran', 'forms', 'run']
Lemmatized: ['nlp', 'amazing', 'visit', 'help', 'computer', 'understand', 'human', 'language', 'running', 'run', 'run', 'form', 'run']


In [None]:
# Inspect the cleaned string produced by the preprocessing cell.
text

"nlp is amazing visit it helps computers understand human language running runs and ran are forms of 'run'"

In [None]:
# Inspect the spaCy Doc built from the cleaned string.
doc

nlp is amazing visit it helps computers understand human language running runs and ran are forms of 'run'

In [None]:
# Inspect the lemmas of the alphabetic, non-stopword tokens.
lemmatized

['nlp',
 'amazing',
 'visit',
 'help',
 'computer',
 'understand',
 'human',
 'language',
 'running',
 'run',
 'run',
 'form',
 'run']

In [None]:
# Inspect the surface forms of the alphabetic, non-stopword tokens.
filtered

['nlp',
 'amazing',
 'visit',
 'helps',
 'computers',
 'understand',
 'human',
 'language',
 'running',
 'runs',
 'ran',
 'forms',
 'run']