# NLP

## Text Processing

In [1]:

import nltk
import os
import string

# Set NLTK data path first
nltk.data.path.clear()
nltk.data.path.append(r'D:\py_prac\langchain-prac\nltk_data')

# Download required resources to the correct directory
# Note: Updated to use punkt_tab instead of punkt
nltk.download('punkt_tab', download_dir=r'D:\py_prac\langchain-prac\nltk_data')
nltk.download('stopwords', download_dir=r'D:\py_prac\langchain-prac\nltk_data')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Test the setup
print("NLTK data path:", nltk.data.path)

NLTK data path: ['D:\\py_prac\\langchain-prac\\nltk_data']


[nltk_data] Downloading package punkt_tab to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to D:\py_prac\langchain-
[nltk_data]     prac\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
text = "Horses are beautiful animals. They run fast and are very strong."


## STEP 1: Lowercase the text
text = text.lower()
text

'horses are beautiful animals. they run fast and are very strong.'

In [3]:
## STEEP 2: Tokenize the text
tokens = word_tokenize(text)
tokens

['horses',
 'are',
 'beautiful',
 'animals',
 '.',
 'they',
 'run',
 'fast',
 'and',
 'are',
 'very',
 'strong',
 '.']

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
# STEP 3: Remove punctuation
tokens_no_punct = [token for token in tokens if token not in string.punctuation]
print("Tokens without punctuation:", tokens_no_punct)

Tokens without punctuation: ['horses', 'are', 'beautiful', 'animals', 'they', 'run', 'fast', 'and', 'are', 'very', 'strong']


In [6]:
stop_words

NameError: name 'stop_words' is not defined

In [None]:
# STEP 4: Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens_no_punct if token not in stop_words]
print("Filtered tokens (no stopwords):", filtered_tokens)

Filtered tokens (no stopwords): ['horses', 'beautiful', 'animals', 'run', 'fast', 'strong']


In [None]:
# STEP 5: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['hors', 'beauti', 'anim', 'run', 'fast', 'strong']


## Bag of words

In [None]:
# Example: Bag of Words with scikit-learn (latest version)
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
docs = [
    "Cats chase mice.",
    "Mice chase cats.",
    "Dogs bark loudly."
]

# Create the vectorizer and fit_transform the documents
vectorizer = CountVectorizer(max_features=100, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(docs)

# Show the feature names (vocabulary)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Show the Bag of Words matrix
print("Bag of Words Matrix:\n", X.toarray())

Vocabulary: ['bark' 'cats' 'chase' 'dogs' 'loudly' 'mice']
Bag of Words Matrix:
 [[0 1 1 0 0 1]
 [0 1 1 0 0 1]
 [1 0 0 1 1 0]]


In [None]:
type(X)
X.shape

(3, 6)

## TF-IDF  - Term frequency - inv. document frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "Cats chase mice.",
    "Dogs chase cats.",
    "Cats and dogs are pets."
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X.toarray())

Vocabulary: ['and' 'are' 'cats' 'chase' 'dogs' 'mice' 'pets']
TF-IDF Matrix:
 [[0.         0.         0.42544054 0.54783215 0.         0.72033345
  0.        ]
 [0.         0.         0.48133417 0.61980538 0.61980538 0.
  0.        ]
 [0.50461134 0.50461134 0.29803159 0.         0.38376993 0.
  0.50461134]]


In [None]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


## Spacy

In [None]:
import spacy

# Creating blank language object then
# tokenizing words of the sentence
nlp = spacy.blank("en")

doc = nlp("Dynstat github is a one stop\
learning destination for geeks.")

type(doc)

spacy.tokens.doc.Doc

In [None]:
nlp.pipe_names

[]

In [None]:
doc_iter = iter(doc)

In [None]:
next(doc_iter)

GeeksforGeeks

In [None]:
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment'

In [None]:
for token in doc:
    print(token)

GeeksforGeeks
is
a
one
stoplearning
destination
for
geeks
.


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 5.6 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 4.6 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 4.3 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 4.2 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 4.1 MB/s eta 0:00:03
     --------------- ------------------------ 5.0/12.8 MB 4.1 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 4.1 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.0 MB/s eta 0:00:02
     ------------------------- ----------

In [None]:
nlp = spacy.load("en_core_web_sm")

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
nlp.pipe_labels

{'tok2vec': [],
 'tagger': ['$',
  "''",
  ',',
  '-LRB-',
  '-RRB-',
  '.',
  ':',
  'ADD',
  'AFX',
  'CC',
  'CD',
  'DT',
  'EX',
  'FW',
  'HYPH',
  'IN',
  'JJ',
  'JJR',
  'JJS',
  'LS',
  'MD',
  'NFP',
  'NN',
  'NNP',
  'NNPS',
  'NNS',
  'PDT',
  'POS',
  'PRP',
  'PRP$',
  'RB',
  'RBR',
  'RBS',
  'RP',
  'SYM',
  'TO',
  'UH',
  'VB',
  'VBD',
  'VBG',
  'VBN',
  'VBP',
  'VBZ',
  'WDT',
  'WP',
  'WP$',
  'WRB',
  'XX',
  '_SP',
  '``'],
 'parser': ['ROOT',
  'acl',
  'acomp',
  'advcl',
  'advmod',
  'agent',
  'amod',
  'appos',
  'attr',
  'aux',
  'auxpass',
  'case',
  'cc',
  'ccomp',
  'compound',
  'conj',
  'csubj',
  'csubjpass',
  'dative',
  'dep',
  'det',
  'dobj',
  'expl',
  'intj',
  'mark',
  'meta',
  'neg',
  'nmod',
  'npadvmod',
  'nsubj',
  'nsubjpass',
  'nummod',
  'oprd',
  'parataxis',
  'pcomp',
  'pobj',
  'poss',
  'preconj',
  'predet',
  'prep',
  'prt',
  'punct',
  'quantmod',
  'relcl',
  'xcomp'],
 'attribute_ruler': [],
 'lemmatizer':

In [None]:
# Initialising doc with a sentence.
doc = nlp("If you want to be an excellent programmer \
, be consistent practicing daily on leetcode.")
type(doc)

spacy.tokens.doc.Doc

In [None]:
for token in doc:
    print(f"{token} | {spacy.explain(token.pos_)} | {token.lemma_}")

# notice that the lemma of the word "practicing" is "practice"

If | subordinating conjunction | if
you | pronoun | you
want | verb | want
to | particle | to
be | auxiliary | be
an | determiner | an
excellent | adjective | excellent
programmer | noun | programmer
, | punctuation | ,
be | auxiliary | be
consistent | adjective | consistent
practicing | verb | practice
daily | adverb | daily
on | adposition | on
leetcode | proper noun | leetcode
. | punctuation | .


In [None]:
for token in doc:
    print(token.pos_)

SCONJ
PRON
VERB
PART
AUX
DET
ADJ
NOUN
PUNCT
AUX
ADJ
VERB
ADV
ADP
PROPN
PUNCT


In [None]:
for token in doc:
    print(token.lemma_)

if
you
want
to
be
an
excellent
programmer
,
be
consistent
practice
daily
on
leetcode
.


## Simple Text preprocessing project

In [9]:
# using the dataset of imdb reviews from kaggle
import pandas as pd

In [10]:
data_path = r"D:\py_prac\langchain-prac\datasets\IMDB Dataset.csv"

In [14]:
df = pd.read_csv(data_path)
df.shape
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
type(df["review"])
df["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [21]:
df["review"] = df["review"].str.lower()
df["review"][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [23]:
### remove the html tags
import re
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [29]:
# testing the function
remove_html_tags(df["review"][0])

# APPLYING THE FUNCTION TO THE DATAFRAME
df["review"] = df["review"].apply(remove_html_tags)
df["review"][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [30]:
### Punctuation removal
import string
# removing punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
# testing the function
remove_punctuation(df["review"][0])
# APPLYING THE FUNCTION TO THE DATAFRAME
df["review"] = df["review"].apply(remove_punctuation)
df["review"][1]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

### incorrect_text handling

In [31]:
%pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 1.4 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from textblob import TextBlob
# textblob is a library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.
# Example usage of TextBlob, to correct the text and analyze sentiment
text = 'ceertain conditionas duriing seveal ggenerations aree mooddified innnnn the saame maner.'
blob = TextBlob(text)
text_corrected = blob.correct()
text_corrected

# it is still  not perfect, but it is better than the original text

TextBlob("certain conditions during several generations are modified innnnn the same manner.")

In [43]:
### Removing stopwords
from nltk.corpus import stopwords
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xspc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on