Part A

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [24]:
# download below package in not installed

"""
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
"""

"\nnltk.download('punkt')\nnltk.download('averaged_perceptron_tagger')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [25]:
document = """Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It involves the analysis, understanding, and generation of human language, enabling machines to process and comprehend text in a meaningful way. NLP techniques are widely used in various applications such as sentiment analysis, machine translation, chatbots, and information retrieval. Preprocessing is an essential step in NLP, which involves tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization."""

In [26]:
# Tokenization

"""
In Python tokenization basically refers to splitting up a larger body of text into smaller lines, words or even creating words for a non-English language.
"""

tokens = word_tokenize(document)

In [27]:
# POS Tagging

"""
POS Tagging Parts of speech Tagging is responsible for reading the text in a language and assigning some specific token (Parts of Speech) to each word.
"""

pos_tags = pos_tag(tokens)

In [28]:
# Stop words removal

"""
Stop words removal in Python is a common preprocessing step in Natural Language Processing (NLP) applications.
Stop words are words that do not add much meaning to a sentence and are pre-defined and cannot be removed
"""

stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

In [29]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

In [30]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - 'C:\\Users\\UNIQUE/nltk_data'
    - 'D:\\Python\\nltk_data'
    - 'D:\\Python\\share\\nltk_data'
    - 'D:\\Python\\lib\\nltk_data'
    - 'C:\\Users\\UNIQUE\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [31]:
# Print the results
print("Original Document:\n", document)
print("\nTokens:\n", tokens)
print("\nPOS Tags:\n", pos_tags)
print("\nFiltered Tokens (after stop words removal):\n", filtered_tokens)
print("\nStemmed Tokens:\n", stemmed_tokens)
print("\nLemmatized Tokens:\n", lemmatized_tokens)

Original Document:
 Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It involves the analysis, understanding, and generation of human language, enabling machines to process and comprehend text in a meaningful way. NLP techniques are widely used in various applications such as sentiment analysis, machine translation, chatbots, and information retrieval. Preprocessing is an essential step in NLP, which involves tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization.

Tokens:
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'It', 'involves', 'the', 'analysis', ',', 'understanding', ',', 'and', 'generation', 'of', 'human', 'language', ',', 'e

NameError: name 'lemmatized_tokens' is not defined

Part B

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# List of documents
documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "It focuses on the interaction between computers and humans using natural language.",
    "NLP techniques are widely used in various applications such as sentiment analysis and machine translation.",
    "Preprocessing is an essential step in NLP.",
]

In [34]:
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

In [35]:
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

In [36]:
# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()

In [37]:
# Print the TF-IDF representation
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")
    for j, term in enumerate(feature_names):
        tfidf_value = tfidf_matrix[i, j]
        if tfidf_value > 0:
            print(f"{term}: {tfidf_value:.4f}")
    print()

Document 1:
artificial: 0.3817
intelligence: 0.3817
is: 0.3009
language: 0.3009
natural: 0.3009
of: 0.3817
processing: 0.3817
subfield: 0.3817

Document 2:
and: 0.2392
between: 0.3034
computers: 0.3034
focuses: 0.3034
humans: 0.3034
interaction: 0.3034
it: 0.3034
language: 0.2392
natural: 0.2392
on: 0.3034
the: 0.3034
using: 0.3034

Document 3:
analysis: 0.2686
and: 0.2117
applications: 0.2686
are: 0.2686
as: 0.2686
in: 0.2117
machine: 0.2686
nlp: 0.2117
sentiment: 0.2686
such: 0.2686
techniques: 0.2686
translation: 0.2686
used: 0.2686
various: 0.2686
widely: 0.2686

Document 4:
an: 0.4129
essential: 0.4129
in: 0.3256
is: 0.3256
nlp: 0.3256
preprocessing: 0.4129
step: 0.4129

