In [7]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources this script relies on: the 'punkt_tab' tokenizer
# data (requested here instead of the older 'punkt' package), the stopword
# lists, and the WordNet corpus used by the lemmatizer.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Example text
text = "NLTK is a leading platform for building Python programs to work with human language data."

# 1. Tokenization
tokens = word_tokenize(text)

# 2. Lowercasing (done before stopword removal so the comparison in step 4
#    is made against lowercase tokens)
tokens = [token.lower() for token in tokens]

# 3. Removing punctuation
# Drop every token made up solely of punctuation characters. A plain
# `token in string.punctuation` check is a substring test against one long
# string, so multi-character punctuation tokens (for example "``", "''",
# "..." or "--") would not match and would be kept as if they were words.
punctuation_chars = set(string.punctuation)
tokens = [
    token
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]

# 4. Removing stopwords
# A set gives constant-time membership tests inside the comprehension.
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

# 5. Stemming
# Stems are not guaranteed to be dictionary words.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

# 6. Lemmatization
# No part-of-speech tag is passed, so the lemmatizer's default is used.
# Stemming and lemmatization are both applied to the same filtered token
# list; they are alternatives shown side by side, not chained.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# 7. Rebuild the text: join the lemmatized tokens with single spaces
preprocessed_text = ' '.join(lemmatized_tokens)

# Results
print("Original Text: ", text)
print("Tokens: ", tokens)
print("Stemmed Tokens: ", stemmed_tokens)
print("Lemmatized Tokens: ", lemmatized_tokens)
print("Preprocessed Text: ", preprocessed_text)

Original Text:  NLTK is a leading platform for building Python programs to work with human language data.
Tokens:  ['nltk', 'leading', 'platform', 'building', 'python', 'programs', 'work', 'human', 'language', 'data']
Stemmed Tokens:  ['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data']
Lemmatized Tokens:  ['nltk', 'leading', 'platform', 'building', 'python', 'program', 'work', 'human', 'language', 'data']
Preprocessed Text:  nltk leading platform building python program work human language data


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
