# NLP

## Text Processing

In [21]:

import nltk
import os
import string

# Directory where the NLTK resources used by this notebook are stored.
# Defaults to the original local path; set the NLTK_DATA_DIR environment
# variable to use a different location without editing this cell.
NLTK_DATA_DIR = os.environ.get('NLTK_DATA_DIR', r'D:\py_prac\langchain-prac\nltk_data')

# Set NLTK data path first: clear the existing search locations so that
# resources are looked up only in NLTK_DATA_DIR.
nltk.data.path.clear()
nltk.data.path.append(NLTK_DATA_DIR)

# Download required resources into the same directory that is searched above.
# Note: uses punkt_tab instead of punkt
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Test the setup
print("NLTK data path:", nltk.data.path)

NLTK data path: ['D:\\py_prac\\langchain-prac\\nltk_data']


[nltk_data] Downloading package punkt_tab to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Sample sentence used throughout the text-processing steps
raw_text = "Horses are beautiful animals. They run fast and are very strong."

# STEP 1: Lowercase the text
text = raw_text.lower()
text

'horses are beautiful animals. they run fast and are very strong.'

In [23]:
## STEP 2: Tokenize the text
# word_tokenize splits the lowercased sentence into word tokens; as the output
# below shows, each '.' comes back as its own token.
tokens = word_tokenize(text)
tokens

['horses',
 'are',
 'beautiful',
 'animals',
 '.',
 'they',
 'run',
 'fast',
 'and',
 'are',
 'very',
 'strong',
 '.']

In [24]:
# Inspect the punctuation characters that the punctuation-removal step filters on
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
# STEP 3: Remove punctuation
# string.punctuation is a str, so `token not in string.punctuation` is a
# substring test: multi-character punctuation tokens such as '...' or '--'
# are not substrings of it and would be kept. Instead, drop every token that
# consists entirely of punctuation characters (empty tokens are dropped too,
# as before).
punctuation_chars = set(string.punctuation)
tokens_no_punct = [
    token for token in tokens
    if not all(char in punctuation_chars for char in token)
]
print("Tokens without punctuation:", tokens_no_punct)

Tokens without punctuation: ['horses', 'are', 'beautiful', 'animals', 'they', 'run', 'fast', 'and', 'are', 'very', 'strong']


In [26]:
# Build the English stop-word set in this cell so that displaying it also works
# on a fresh kernel (Restart & Run All); the stopword-removal step below
# rebuilds the same set.
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [27]:
# STEP 4: Remove stopwords
# Keep the English stop words in a set for fast membership checks
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Retain only the tokens that are not stop words, preserving their order
filtered_tokens = list(filter(lambda word: word not in stop_words, tokens_no_punct))
print("Filtered tokens (no stopwords):", filtered_tokens)

Filtered tokens (no stopwords): ['horses', 'beautiful', 'animals', 'run', 'fast', 'strong']


In [28]:
# STEP 5: Stemming
# Reduce every remaining token to its Porter stem (e.g. 'horses' -> 'hors')
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['hors', 'beauti', 'anim', 'run', 'fast', 'strong']


## Bag of Words

In [36]:
# Bag of Words with scikit-learn's CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: the first two sentences use the same words in a different order
docs = ["Cats chase mice.", "Mice chase cats.", "Dogs bark loudly."]

# Lowercase the text, drop English stop words and keep at most 100 terms,
# then learn the vocabulary and count the terms in one pass
vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=100)
X = vectorizer.fit_transform(docs)

# Vocabulary learned from the corpus (one entry per matrix column)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Dense view of the count matrix: one row per document
print("Bag of Words Matrix:\n", X.toarray())

Vocabulary: ['bark' 'cats' 'chase' 'dogs' 'loudly' 'mice']
Bag of Words Matrix:
 [[0 1 1 0 0 1]
 [0 1 1 0 0 1]
 [1 0 0 1 1 0]]


In [38]:
# Only the last expression of a cell is echoed, so the result of type(X) is
# evaluated but not displayed; the output below is X.shape, i.e.
# (number of documents, number of vocabulary terms).
type(X)
X.shape

(3, 6)

## TF-IDF: Term Frequency – Inverse Document Frequency

In [31]:
# TF-IDF weighting with scikit-learn's TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: "cats" occurs in every document
docs = ["Cats chase mice.", "Dogs chase cats.", "Cats and dogs are pets."]

# Default settings (no stop-word list), so "and" / "are" stay in the vocabulary
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Learned vocabulary followed by the dense TF-IDF weights, one row per document
print("Vocabulary:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X.toarray())

Vocabulary: ['and' 'are' 'cats' 'chase' 'dogs' 'mice' 'pets']
TF-IDF Matrix:
 [[0.         0.         0.42544054 0.54783215 0.         0.72033345
  0.        ]
 [0.         0.         0.48133417 0.61980538 0.61980538 0.
  0.        ]
 [0.50461134 0.50461134 0.29803159 0.         0.38376993 0.
  0.50461134]]
