In [1]:
# Overview of the document preprocessing and TF-IDF steps covered in this notebook:

# ### 1. Document Preprocessing:

# #### a. Tokenization:
# Tokenization involves breaking down the text into individual words or tokens.

# #### b. POS Tagging:
# POS (Part-of-Speech) tagging involves labeling each word in the text with its corresponding part of speech, such as noun, verb, adjective, etc.

# #### c. Stop Words Removal:
# Stop words are common words like "the", "is", "and", etc., which often do not contribute much to the meaning of the text. Removing them helps in focusing on the more important words.

# #### d. Stemming:
# Stemming involves reducing words to their root or base form, by removing suffixes. For example, "running" becomes "run".

# #### e. Lemmatization:
# Lemmatization is similar to stemming, but it reduces words to their dictionary form (lemma) using vocabulary and part-of-speech information, which makes it more accurate but also more computationally expensive.

# ### 2. Calculating TF-IDF (Term Frequency-Inverse Document Frequency):

# #### Term Frequency (TF):
# TF measures how frequently a term occurs in a document. It's calculated as the number of times a term appears in a document divided by the total number of terms in the document.

# #### Inverse Document Frequency (IDF):
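# IDF measures how rare, and therefore how informative, a term is across a collection of documents. It's calculated as the logarithm of the total number of documents divided by the number of documents containing the term, so it is only meaningful when the collection contains more than one document.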

# #### TF-IDF:
# TF-IDF is the product of TF and IDF, and it indicates the importance of a term in a specific document relative to its importance across all documents.

# Here's a step-by-step approach to implement these tasks:

# 1. Load or extract a sample document.
# 2. Tokenize the document.
# 3. Perform POS tagging on the tokens.
# 4. Remove stop words.
# 5. Apply stemming or lemmatization.
# 6. Calculate TF for each term in the document.
# 7. Calculate IDF for each term across the document collection.
# 8. Calculate TF-IDF for each term in the document.
# 9. Optionally, create a representation of the document using the TF-IDF values.

# The cells below install NLTK, download the required NLTK resources, and then apply these steps to a sample document.

In [3]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.28-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/41.9 kB ? eta -:--:--
     --------- ------------------------------ 10.2/41.9 kB ? eta -:--:--
     ------------------ ------------------- 20.5/41.9 kB 330.3 kB/s eta 0:00:01
     -------------------------------------  41.0/41.9 kB 281.8 kB/s eta 0:00:01
     -------------------------------------- 41.9/41.9 kB 253.4 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     -------------------- ----------------- 30.7/57.6 kB 640.0 kB/s eta 0:00:01
     --------------------


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import nltk
# Punkt tokenizer models: required by word_tokenize in the preprocessing cell.
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
import nltk
# Perceptron tagger model: required by pos_tag in the preprocessing cell.
nltk.download(info_or_id='averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [12]:
import nltk
# Stop word lists: required by stopwords.words('english') in the preprocessing cell.
nltk.download(info_or_id='stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
import nltk
# WordNet corpus: required by WordNetLemmatizer in the preprocessing cell.
nltk.download(info_or_id='wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...


True

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample Document
sample_document = """
Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human languages in a way that is both meaningful and useful. NLP encompasses a variety of tasks, including text analysis, sentiment analysis, machine translation, and more. In this tutorial, we will explore some common preprocessing techniques used in NLP, such as tokenization, POS tagging, stop words removal, stemming, and lemmatization.
"""

# Tokenization: split the raw text into word and punctuation tokens.
tokens = word_tokenize(sample_document)

# POS Tagging: attach a Penn Treebank tag to every token.
pos_tags = pos_tag(tokens)

# Stop words removal.
# The (token, tag) pairs are filtered instead of the bare tokens so that each
# surviving word stays aligned with its tag for the lemmatization step.
stop_words = set(stopwords.words('english'))
filtered_tagged = [(word, tag) for word, tag in pos_tags if word.lower() not in stop_words]
filtered_tokens = [word for word, _ in filtered_tagged]

# Stemming: strip suffixes with the Porter algorithm.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]


# Lemmatization
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the WordNet POS code used by lemmatize().

    Tags starting with J/V/R map to adjective/verb/adverb; everything else
    (including punctuation tags) falls back to noun, which is also the
    lemmatizer's own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Without a POS hint the lemmatizer treats every word as a noun, so verbs such
# as "focuses" or "used" would not be reduced to their lemma.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, _wordnet_pos(tag)) for word, tag in filtered_tagged
]

# Print Results
print("Original Tokens:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nFiltered Tokens after Stop words removal:")
print(filtered_tokens)
print("\nStemmed Tokens:")
print(stemmed_tokens)
print("\nLemmatized Tokens:")
print(lemmatized_tokens)

# TF-IDF Representation
# IDF compares documents with each other; fitting on a single document gives
# every term the same IDF, leaving only normalized term frequencies. Each
# sentence of the sample is therefore treated as one document of the corpus.
tfidf_corpus = nltk.sent_tokenize(sample_document.strip())
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_corpus)

# Print TF-IDF Matrix (rows = sentences, columns = terms listed just above it)
print("\nTF-IDF Terms:")
print(tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


Original Tokens:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.', 'The', 'ultimate', 'objective', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'languages', 'in', 'a', 'way', 'that', 'is', 'both', 'meaningful', 'and', 'useful', '.', 'NLP', 'encompasses', 'a', 'variety', 'of', 'tasks', ',', 'including', 'text', 'analysis', ',', 'sentiment', 'analysis', ',', 'machine', 'translation', ',', 'and', 'more', '.', 'In', 'this', 'tutorial', ',', 'we', 'will', 'explore', 'some', 'common', 'preprocessing', 'techniques', 'used', 'in', 'NLP', ',', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']

POS Tagging:
[('Natural', 'JJ'), ('language

In [5]:
!pip install jupyter_contrib_nbextensions




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
