In [21]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Fetch the NLTK data packages this notebook relies on: the stopword lists,
# WordNet, the Punkt tokenizer models and the averaged-perceptron POS tagger.
# Packages that are already present are reported as up to date by NLTK.
for resource in ("stopwords", "wordnet", "punkt", "averaged_perceptron_tagger"):
    nltk.download(resource)

# Kept as the cell's trailing expression so its return value is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [23]:
# Sample paragraph used by every tokenizing / tagging / stemming cell below.
# Written one sentence per line; adjacent literals are concatenated by Python,
# so the resulting string is a single line with one space between sentences.
text = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
    "Fusce commodo mauris id justo condimentum dignissim. "
    "Nullam placerat semper dapibus. "
    "Pellentesque ac risus nulla. "
    "Phasellus ut dapibus nunc, id aliquam dolor."
)

In [24]:
# Split the sample text into word-level tokens; punctuation marks come out as
# separate tokens.
print(word_tokenize(text))

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


In [25]:
# Split the sample text into a list of sentences.
print(sent_tokenize(text))

['Lorem ipsum dolor sit amet, consectetur adipiscing elit.', 'Fusce commodo mauris id justo condimentum dignissim.', 'Nullam placerat semper dapibus.', 'Pellentesque ac risus nulla.', 'Phasellus ut dapibus nunc, id aliquam dolor.']


In [26]:

# Word-level tokens of the sample text; this list is the input to the POS tagger.
to_tag = word_tokenize(text)

In [27]:
# Tag each token with a part-of-speech label and print the (token, tag) pairs.
# NOTE(review): the sample text is Latin placeholder text, so the tags are
# presumably not linguistically meaningful -- confirm which language the
# default tagger expects before relying on them.
print(pos_tag(to_tag))

[('Lorem', 'NNP'), ('ipsum', 'NN'), ('dolor', 'NN'), ('sit', 'NN'), ('amet', 'NN'), (',', ','), ('consectetur', 'NN'), ('adipiscing', 'VBG'), ('elit', 'NN'), ('.', '.'), ('Fusce', 'NNP'), ('commodo', 'JJ'), ('mauris', 'NN'), ('id', 'NN'), ('justo', 'NN'), ('condimentum', 'NN'), ('dignissim', 'NN'), ('.', '.'), ('Nullam', 'NNP'), ('placerat', 'VBZ'), ('semper', 'JJR'), ('dapibus', 'NN'), ('.', '.'), ('Pellentesque', 'NNP'), ('ac', 'JJ'), ('risus', 'NN'), ('nulla', 'NN'), ('.', '.'), ('Phasellus', 'CC'), ('ut', 'JJ'), ('dapibus', 'NN'), ('nunc', 'NN'), (',', ','), ('id', 'JJ'), ('aliquam', 'NN'), ('dolor', 'NN'), ('.', '.')]


In [28]:
# Load the English stopword list and keep it as a set so the membership test
# in the filtering cell is a constant-time lookup.
stop_words = set(stopwords.words("english"))
# Printing a set: the order of the words shown is not stable between runs.
print(stop_words)

{'you', 'doing', 'with', "aren't", 'what', 'ma', 'but', 'itself', 'that', 'here', 'a', 'hadn', 'do', "shan't", 'my', 'them', 'couldn', 'isn', "wouldn't", 'myself', 'or', 've', 'hasn', 'its', "haven't", 'ourselves', "shouldn't", "didn't", 'haven', 'only', 'up', 'it', 'few', 'yourselves', 'between', 'mightn', "hasn't", 'had', 'all', 'theirs', 'through', "wasn't", 'no', 'an', 'were', 'their', 'having', 'because', 'his', 'am', "mustn't", 'other', 'how', 'himself', 'ain', 'doesn', 'where', 'ours', 'which', 'who', 'out', 'herself', 'while', 'i', 'will', 'did', 'weren', "it's", 'such', 'these', 'under', 'both', 'd', 'this', 'when', 'now', "hadn't", 'in', 'each', 's', 'o', 'just', 'more', 'wasn', 're', 'yours', 'being', 'very', 'mustn', "you'd", 'they', 'down', 'once', "mightn't", "needn't", 'whom', "you'll", 'to', 't', 'shan', 'he', 'shouldn', 'are', 'some', "that'll", 'has', 'should', 'be', 'didn', "won't", 'during', 'we', 'why', "she's", 'nor', 'yourself', 'at', 'll', "doesn't", 'themselves

In [29]:
# Word-level tokens of the sample text; this list is the input to the stopword filter.
to_clean = word_tokenize(text)

In [30]:
# Drop English stopwords from the token list.
# The stopword list is all lowercase, so each token is lowercased for the
# membership test only; the token that is kept retains its original casing.
# A case-sensitive test would let capitalised stopwords such as "The" or "It"
# (typical at the start of a sentence) slip through unfiltered.
no_stopwords_text = [token for token in to_clean if token.lower() not in stop_words]

print(no_stopwords_text)

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


In [31]:
# Porter stemmer instance used by the stemming cell.
stemmer = PorterStemmer()

In [32]:
# Reduce every token that survived stopword removal to its Porter stem,
# preserving the original token order.
stemmed_words = [stemmer.stem(word) for word in no_stopwords_text]

In [33]:
# Show the stems. The stemmer lowercases tokens and can produce non-words
# (e.g. 'dapibus' becomes 'dapibu').
print(stemmed_words)

['lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipisc', 'elit', '.', 'fusc', 'commodo', 'mauri', 'id', 'justo', 'condimentum', 'dignissim', '.', 'nullam', 'placerat', 'semper', 'dapibu', '.', 'pellentesqu', 'ac', 'risu', 'nulla', '.', 'phasellu', 'ut', 'dapibu', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


In [34]:
# WordNet-based lemmatizer instance used by the lemmatizing cell.
lemmatizer = WordNetLemmatizer()

In [35]:
# Lemmatize every token that survived stopword removal.
# lemmatize() treats a word as a noun unless told otherwise, so the noun POS is
# spelled out here to make that explicit. Pass pos="v" (verb), "a" (adjective),
# "r" (adverb) or "s" (satellite adjective) to lemmatize other word classes.
lemmatized_words = [lemmatizer.lemmatize(token, pos="n") for token in no_stopwords_text]

In [36]:
# Show the lemmas. For this sample text the tokens come back unchanged.
print(lemmatized_words)

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


In [37]:
# TF-IDF vectorizer with default settings.
# NOTE(review): `vectorizer` is assigned an identical TfidfVectorizer() again
# in a later cell before it is ever fitted, so this cell looks redundant.
vectorizer = TfidfVectorizer()

In [38]:
# Toy corpus for the TF-IDF example: five short sentences, each one a document.
# Every sentence contains the word "pizza".
corpus = [
    "I love to eat pizza",
    "Pizza is my favorite food",
    "I enjoy eating pizza with friends",
    "I like to have pizza for dinner",
    "Pizza toppings include cheese, pepperoni, and mushrooms"
]

In [39]:
# Fresh TF-IDF vectorizer with default settings; this is the instance that gets fitted.
vectorizer = TfidfVectorizer()

In [40]:
# Fit the vectorizer on the corpus and transform it in one step; the result has
# one row per document and one column per learned term.
tfidf_matrix = vectorizer.fit_transform(corpus)

# Terms corresponding to the matrix columns, in column order.
feature_names = vectorizer.get_feature_names_out()

In [41]:
# Convert the matrix to a dense array for printing (one row per document).
print(tfidf_matrix.toarray())

# Column labels for the array printed above.
print(feature_names)

[[0.         0.         0.         0.58946308 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.58946308 0.         0.         0.
  0.28088232 0.4755751  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.48638585 0.48638585 0.         0.         0.         0.
  0.48638585 0.         0.         0.         0.48638585 0.
  0.23176546 0.         0.         0.        ]
 [0.         0.         0.         0.         0.48638585 0.48638585
  0.         0.         0.         0.48638585 0.         0.
  0.         0.         0.         0.         0.         0.
  0.23176546 0.         0.         0.48638585]
 [0.         0.         0.45277275 0.         0.         0.
  0.         0.         0.45277275 0.         0.45277275 0.
  0.         0.45277275 0.         0.         0.         0.
  0.21574864 0.36529421 0.         0.        ]
 [0.40073619 0.40073619 0.         0.         0.         0.
  0.         0.         