In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Download necessary NLTK data
# 'punkt_tab' holds the tokenizer tables that word_tokenize() loads on
# NLTK >= 3.8.2; the legacy 'punkt' package is still fetched so the script
# also runs on older NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Tokenization
# Break one sample sentence into word and punctuation tokens.
text = "Natural Language Processing is fascinating and I love working with NLP."
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Step 2: Removing Stopwords
# Membership is tested on the lowercased token, but each kept token retains
# its original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print("Filtered Tokens:", filtered_tokens)

# Step 3: Stemming
# Apply the Porter stemmer to every token that survived the stopword filter.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))
print("Stemmed Words:", stemmed_words)

# Step 4: Lemmatization
# lemmatize() receives only the word, so its default part of speech applies.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))
print("Lemmatized Words:", lemmatized_words)

# Step 5: Creating a Corpus
# Two short documents; each one becomes a row of the count matrix below.
corpus = [
    "Natural Language Processing is amazing.",
    "I love NLP and its applications.",
]

# Step 6: Bag of Words Model
# Learn the vocabulary from the corpus and count term occurrences per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
dense_counts = X.toarray()  # sparse result expanded to a dense array for display
feature_names = vectorizer.get_feature_names_out()
print("Bag of Words Matrix:\n", dense_counts)
print("Feature Names (Vocabulary):", feature_names)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', 'and', 'I', 'love', 'working', 'with', 'NLP', '.']
Filtered Tokens: ['Natural', 'Language', 'Processing', 'fascinating', 'love', 'working', 'NLP', '.']
Stemmed Words: ['natur', 'languag', 'process', 'fascin', 'love', 'work', 'nlp', '.']
Lemmatized Words: ['Natural', 'Language', 'Processing', 'fascinating', 'love', 'working', 'NLP', '.']
Bag of Words Matrix:
 [[1 0 0 1 0 1 0 1 0 1]
 [0 1 1 0 1 0 1 0 1 0]]
Feature Names (Vocabulary): ['amazing' 'and' 'applications' 'is' 'its' 'language' 'love' 'natural'
 'nlp' 'processing']
