In [5]:
# Install libraries if needed
# !pip install nltk scikit-learn

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

# Sample document
doc = "Natural Language Processing is a fascinating field of Artificial Intelligence that focuses on the interaction between computers and humans through language."

# Preprocessing
tokens = doc.split()  # just simple split
tokens = [w for w in tokens if w.isalpha()]  # Keep only alphabet words
tokens = [w.lower() for w in tokens if w.lower() not in stopwords.words('english')]

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stems = [stemmer.stem(w) for w in tokens]
lemmas = [lemmatizer.lemmatize(w) for w in tokens]

print("Tokens:", tokens)
print("Stems:", stems)
print("Lemmas:", lemmas)

# TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([doc])

# Print TF-IDF
for word, score in zip(vectorizer.get_feature_names_out(), X.toarray()[0]):
    if score > 0:
        print(word, ":", round(score, 4))


[nltk_data] Downloading package stopwords to /home/om/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/om/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokens: ['natural', 'language', 'processing', 'fascinating', 'field', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans']
Stems: ['natur', 'languag', 'process', 'fascin', 'field', 'artifici', 'intellig', 'focus', 'interact', 'comput', 'human']
Lemmas: ['natural', 'language', 'processing', 'fascinating', 'field', 'artificial', 'intelligence', 'focus', 'interaction', 'computer', 'human']
and : 0.2132
artificial : 0.2132
between : 0.2132
computers : 0.2132
fascinating : 0.2132
field : 0.2132
focuses : 0.2132
humans : 0.2132
intelligence : 0.2132
interaction : 0.2132
is : 0.2132
language : 0.4264
natural : 0.2132
of : 0.2132
on : 0.2132
processing : 0.2132
that : 0.2132
the : 0.2132
through : 0.2132
