Project: Simple text preprocessing pipeline in Python

This notebook demonstrates the basic steps of preprocessing English text:
- converting text to lowercase,
- removing punctuation,
- tokenization,
- removing stopwords (with "no" and "not" kept as exceptions),
- stemming.

The pipeline prepares the text for further analysis, such as classification or sentiment analysis.

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [5]:
# Sample input: a short hotel review, written one sentence per source line.
# Adjacent string literals are concatenated into a single string.
text = (
    "I can't believe this hotel! "
    "The staff was RUDE, and the room was dirty. "
    "Will never stay here again. "
    "Worst experience ever. "
    "Also, there was no WiFi, no hot water, and the TV was broken. "
    "On the bright side, the breakfast was okay."
)

In [6]:
# Lowercase
# Normalise case so that, for example, "RUDE" and "rude" become the same token.
lower_text = str.lower(text)
print(lower_text)

i can't believe this hotel! the staff was rude, and the room was dirty. will never stay here again. worst experience ever. also, there was no wifi, no hot water, and the tv was broken. on the bright side, the breakfast was okay.


In [7]:
# Remove punctuation
# Delete every character that is neither a word character nor whitespace.
# "\w" also matches "_", so the underscore is listed explicitly to remove it too.
# Apostrophes are stripped as well, so a contraction such as "can't" becomes "cant".
no_punct_text = re.sub(r"[^\w\s]|_", "", lower_text)
print(no_punct_text)

i cant believe this hotel the staff was rude and the room was dirty will never stay here again worst experience ever also there was no wifi no hot water and the tv was broken on the bright side the breakfast was okay


In [8]:
# Tokenization
# word_tokenize relies on NLTK's Punkt tokenizer data, which is downloaded
# separately from the library. If the data is missing NLTK raises LookupError,
# so fetch it once and retry; when the data is already installed nothing is downloaded.
try:
    tokens = word_tokenize(no_punct_text)
except LookupError:
    # The resource name depends on the NLTK release ("punkt" in older ones,
    # "punkt_tab" in newer ones), so request both.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
    tokens = word_tokenize(no_punct_text)
print(tokens)

['i', 'cant', 'believe', 'this', 'hotel', 'the', 'staff', 'was', 'rude', 'and', 'the', 'room', 'was', 'dirty', 'will', 'never', 'stay', 'here', 'again', 'worst', 'experience', 'ever', 'also', 'there', 'was', 'no', 'wifi', 'no', 'hot', 'water', 'and', 'the', 'tv', 'was', 'broken', 'on', 'the', 'bright', 'side', 'the', 'breakfast', 'was', 'okay']


In [11]:
# Stopwords

# "no" and "not" express negation, so they are kept in the text instead of
# being filtered out with the other stopwords.
negation_words = {"no", "not"}

# The stopword corpus is downloaded separately from the library; fetch it on
# first use if it is missing (NLTK signals that with LookupError).
try:
    all_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    all_stopwords = stopwords.words('english')

# en_stopwords stays a list (original order, minus the negation words);
# filtering instead of list.remove() avoids a ValueError if a word is absent.
en_stopwords = [word for word in all_stopwords if word not in negation_words]

# A set gives constant-time membership tests while filtering the tokens.
stopword_lookup = set(en_stopwords)

# Joined back into one space-separated string for the following step.
text_stopwords = ' '.join(word for word in tokens if word not in stopword_lookup)
print(text_stopwords)

cant believe hotel staff rude room dirty never stay worst experience ever also no wifi no hot water tv broken bright side breakfast okay


In [15]:
# Stemming
# Reduce each remaining word to its Porter stem (e.g. "believe" -> "believ").

ps = PorterStemmer()
remaining_words = text_stopwords.split()
text_stemmed = list(map(ps.stem, remaining_words))
print(text_stemmed)

['cant', 'believ', 'hotel', 'staff', 'rude', 'room', 'dirti', 'never', 'stay', 'worst', 'experi', 'ever', 'also', 'no', 'wifi', 'no', 'hot', 'water', 'tv', 'broken', 'bright', 'side', 'breakfast', 'okay']
