# NLP

## Text Processing

In [13]:

import nltk
import os
import string

# Set NLTK data path first
nltk.data.path.clear()
nltk.data.path.append(r'D:\py_prac\langchain-prac\nltk_data')

# Download required resources to the correct directory
# Note: Updated to use punkt_tab instead of punkt
nltk.download('punkt_tab', download_dir=r'D:\py_prac\langchain-prac\nltk_data')
nltk.download('stopwords', download_dir=r'D:\py_prac\langchain-prac\nltk_data')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Test the setup
print("NLTK data path:", nltk.data.path)

NLTK data path: ['D:\\py_prac\\langchain-prac\\nltk_data']


[nltk_data] Downloading package punkt_tab to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
text = "Horses are beautiful animals. They run fast and are very strong."


## STEP 1: Lowercase the text
text = text.lower()
text

'horses are beautiful animals. they run fast and are very strong.'

In [15]:
## STEEP 2: Tokenize the text
tokens = word_tokenize(text)
tokens

['horses',
 'are',
 'beautiful',
 'animals',
 '.',
 'they',
 'run',
 'fast',
 'and',
 'are',
 'very',
 'strong',
 '.']

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# STEP 3: Remove punctuation
tokens_no_punct = [token for token in tokens if token not in string.punctuation]
print("Tokens without punctuation:", tokens_no_punct)

Tokens without punctuation: ['horses', 'are', 'beautiful', 'animals', 'they', 'run', 'fast', 'and', 'are', 'very', 'strong']


In [18]:
stop_words

NameError: name 'stop_words' is not defined

In [None]:
# STEP 4: Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens_no_punct if token not in stop_words]
print("Filtered tokens (no stopwords):", filtered_tokens)

Filtered tokens (no stopwords): ['horses', 'beautiful', 'animals', 'run', 'fast', 'strong']


In [None]:
# STEP 5: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['hors', 'beauti', 'anim', 'run', 'fast', 'strong']


## Bag of words

In [None]:
# Example: Bag of Words with scikit-learn (latest version)
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
docs = [
    "Cats chase mice.",
    "Mice chase cats.",
    "Dogs bark loudly."
]

# Create the vectorizer and fit_transform the documents
vectorizer = CountVectorizer(max_features=100, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(docs)

# Show the feature names (vocabulary)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Show the Bag of Words matrix
print("Bag of Words Matrix:\n", X.toarray())

Vocabulary: ['bark' 'cats' 'chase' 'dogs' 'loudly' 'mice']
Bag of Words Matrix:
 [[0 1 1 0 0 1]
 [0 1 1 0 0 1]
 [1 0 0 1 1 0]]


In [None]:
type(X)
X.shape

(3, 6)

## TF-IDF  - Term frequency - inv. document frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "Cats chase mice.",
    "Dogs chase cats.",
    "Cats and dogs are pets."
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X.toarray())

Vocabulary: ['and' 'are' 'cats' 'chase' 'dogs' 'mice' 'pets']
TF-IDF Matrix:
 [[0.         0.         0.42544054 0.54783215 0.         0.72033345
  0.        ]
 [0.         0.         0.48133417 0.61980538 0.61980538 0.
  0.        ]
 [0.50461134 0.50461134 0.29803159 0.         0.38376993 0.
  0.50461134]]


In [None]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


## Spacy

In [52]:
import spacy

# Creating blank language object then
# tokenizing words of the sentence
nlp = spacy.blank("en")

doc = nlp("Dynstat github is a one stop\
learning destination for geeks.")

type(doc)

spacy.tokens.doc.Doc

In [49]:
nlp.pipe_names

[]

In [35]:
doc_iter = iter(doc)

In [36]:
next(doc_iter)

GeeksforGeeks

In [25]:
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment'

In [39]:
for token in doc:
    print(token)

GeeksforGeeks
is
a
one
stoplearning
destination
for
geeks
.


In [44]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 5.6 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 4.6 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 4.3 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 4.2 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 4.1 MB/s eta 0:00:03
     --------------- ------------------------ 5.0/12.8 MB 4.1 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 4.1 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.0 MB/s eta 0:00:02
     ------------------------- ----------

In [53]:
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [51]:
nlp.pipe_labels

{'tok2vec': [],
 'tagger': ['$',
  "''",
  ',',
  '-LRB-',
  '-RRB-',
  '.',
  ':',
  'ADD',
  'AFX',
  'CC',
  'CD',
  'DT',
  'EX',
  'FW',
  'HYPH',
  'IN',
  'JJ',
  'JJR',
  'JJS',
  'LS',
  'MD',
  'NFP',
  'NN',
  'NNP',
  'NNPS',
  'NNS',
  'PDT',
  'POS',
  'PRP',
  'PRP$',
  'RB',
  'RBR',
  'RBS',
  'RP',
  'SYM',
  'TO',
  'UH',
  'VB',
  'VBD',
  'VBG',
  'VBN',
  'VBP',
  'VBZ',
  'WDT',
  'WP',
  'WP$',
  'WRB',
  'XX',
  '_SP',
  '``'],
 'parser': ['ROOT',
  'acl',
  'acomp',
  'advcl',
  'advmod',
  'agent',
  'amod',
  'appos',
  'attr',
  'aux',
  'auxpass',
  'case',
  'cc',
  'ccomp',
  'compound',
  'conj',
  'csubj',
  'csubjpass',
  'dative',
  'dep',
  'det',
  'dobj',
  'expl',
  'intj',
  'mark',
  'meta',
  'neg',
  'nmod',
  'npadvmod',
  'nsubj',
  'nsubjpass',
  'nummod',
  'oprd',
  'parataxis',
  'pcomp',
  'pobj',
  'poss',
  'preconj',
  'predet',
  'prep',
  'prt',
  'punct',
  'quantmod',
  'relcl',
  'xcomp'],
 'attribute_ruler': [],
 'lemmatizer':

In [62]:
# Initialising doc with a sentence.
doc = nlp("If you want to be an excellent programmer \
, be consistent practicing daily on leetcode.")
type(doc)

spacy.tokens.doc.Doc

In [None]:
for token in doc:
    print(f"{token} | {spacy.explain(token.pos_)} | {token.lemma_}")

# notice that the lemma of the word "practicing" is "practice"

If | subordinating conjunction | if
you | pronoun | you
want | verb | want
to | particle | to
be | auxiliary | be
an | determiner | an
excellent | adjective | excellent
programmer | noun | programmer
, | punctuation | ,
be | auxiliary | be
consistent | adjective | consistent
practicing | verb | practice
daily | adverb | daily
on | adposition | on
leetcode | proper noun | leetcode
. | punctuation | .


In [64]:
for token in doc:
    print(token.pos_)

SCONJ
PRON
VERB
PART
AUX
DET
ADJ
NOUN
PUNCT
AUX
ADJ
VERB
ADV
ADP
PROPN
PUNCT


In [65]:
for token in doc:
    print(token.lemma_)

if
you
want
to
be
an
excellent
programmer
,
be
consistent
practice
daily
on
leetcode
.
