# Libraries

In [2]:
import nltk

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [51]:
from nltk.corpus import stopwords

In [52]:
from nltk import ne_chunk

In [24]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer

In [44]:
from nltk import pos_tag

In [7]:
# Fetch the 'punkt' package through NLTK's downloader. The call is the cell's
# last expression, so its return value is shown as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Print where NLTK locates the 'tokenizers/punkt' resource on this machine.
print(nltk.data.find('tokenizers/punkt'))  

C:\Users\keert\nltk_data\tokenizers\punkt


In [11]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 10.1 MB/s eta 0:00:02
     ----------- ---------------------------- 3.7/12.8 MB 9.5 MB/s eta 0:00:01
     ---------------- ----------------------- 5.2/12.8 MB 8.9 MB/s eta 0:00:01
     --------------------- ------------------ 6.8/12.8 MB 8.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 8.4 MB/s eta 0:00:01
     --------------------------------- ------ 10.7/12.8 MB 8.4 MB/s eta 0:00:01
     -------------------------------------- - 12.3/12.8 MB 8.4 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 7.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and 

# Data

In [6]:
# First stanza of the rhyme, one literal per line. The first three lines end
# with two trailing spaces before the newline; the string ends with a newline.
rhymes = (
    "Twinkle, twinkle, little star,  \n"
    "How I wonder what you are!  \n"
    "Up above the world so high,  \n"
    "Like a diamond in the sky.\n"
)

# Tokenization

In [7]:
# Split the stanza into word and punctuation tokens with NLTK's word_tokenize.
tokens = word_tokenize(rhymes)

In [8]:
# Bare expression: the notebook displays the token list as this cell's output.
tokens

['Twinkle',
 ',',
 'twinkle',
 ',',
 'little',
 'star',
 ',',
 'How',
 'I',
 'wonder',
 'what',
 'you',
 'are',
 '!',
 'Up',
 'above',
 'the',
 'world',
 'so',
 'high',
 ',',
 'Like',
 'a',
 'diamond',
 'in',
 'the',
 'sky',
 '.']

In [12]:
import spacy

# Small English pipeline used for the spaCy examples.
nlp = spacy.load("en_core_web_sm")

# First stanza, assembled from its four lines (newline-separated, no trailing newline).
text = "\n".join([
    "Twinkle, twinkle, little star,",
    "How I wonder what you are!",
    "Up above the world so high,",
    "Like a diamond in the sky.",
])

# Run the pipeline over the stanza.
doc = nlp(text)

# Heading first, then one token per output line.
print("Tokenized words:\n")
for token in doc:
    print(token.text)

Tokenized words:

Twinkle
,
twinkle
,
little
star
,


How
I
wonder
what
you
are
!


Up
above
the
world
so
high
,


Like
a
diamond
in
the
sky
.


# Stemming

In [25]:
# Initialize stemmers
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
# Strip a trailing 'ing', 's' or 'e'. min=4 is the minimum word length that
# gets stemmed (words of at least 4 characters); shorter words are returned unchanged.
regex_stemmer = RegexpStemmer('ing$|s$|e$', min=4)

In [18]:
# Run every token through each of the four stemmers, in turn.
porter_stems = list(map(porter.stem, tokens))
snowball_stems = list(map(snowball.stem, tokens))
lancaster_stems = list(map(lancaster.stem, tokens))
regex_stems = list(map(regex_stemmer.stem, tokens))

# Input first, then each stemmer's output, for side-by-side comparison.
print(f"Original: {tokens}")
print(f"Porter stems: {porter_stems}")
print(f"Snowball stems: {snowball_stems}")
print(f"Lancaster stems: {lancaster_stems}")
print(f"Regex stems: {regex_stems}")

Original: ['Twinkle', ',', 'twinkle', ',', 'little', 'star', ',', 'How', 'I', 'wonder', 'what', 'you', 'are', '!', 'Up', 'above', 'the', 'world', 'so', 'high', ',', 'Like', 'a', 'diamond', 'in', 'the', 'sky', '.']
Porter stems: ['twinkl', ',', 'twinkl', ',', 'littl', 'star', ',', 'how', 'i', 'wonder', 'what', 'you', 'are', '!', 'up', 'abov', 'the', 'world', 'so', 'high', ',', 'like', 'a', 'diamond', 'in', 'the', 'sky', '.']
Snowball stems: ['twinkl', ',', 'twinkl', ',', 'littl', 'star', ',', 'how', 'i', 'wonder', 'what', 'you', 'are', '!', 'up', 'abov', 'the', 'world', 'so', 'high', ',', 'like', 'a', 'diamond', 'in', 'the', 'sky', '.']
Lancaster stems: ['twinkl', ',', 'twinkl', ',', 'littl', 'star', ',', 'how', 'i', 'wond', 'what', 'you', 'ar', '!', 'up', 'abov', 'the', 'world', 'so', 'high', ',', 'lik', 'a', 'diamond', 'in', 'the', 'sky', '.']
Regex stems: ['Twinkl', ',', 'twinkl', ',', 'littl', 'star', ',', 'How', 'I', 'wonder', 'what', 'you', 'are', '!', 'Up', 'abov', 'the', 'world'

# Stop words removal

In [23]:
# The stopword lists ship as a separate NLTK corpus. Fetch it first so this
# cell also runs on a machine where it has never been downloaded (otherwise
# stopwords.words raises LookupError).
nltk.download('stopwords')

# Keep the English stopwords in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [25]:
# Keep the tokens whose lowercase form is not an English stopword.
filtered_tokens = [
    tok
    for tok in tokens
    if tok.lower() not in stop_words
]

In [26]:
# Bare expression: the notebook displays the filtered list as this cell's output.
filtered_tokens

['Twinkle',
 ',',
 'twinkle',
 ',',
 'little',
 'star',
 ',',
 'wonder',
 '!',
 'world',
 'high',
 ',',
 'Like',
 'diamond',
 'sky',
 '.']

In [27]:
# Before/after view of the NLTK stopword filtering.
print(f"Original Tokens: {tokens}")
print(f"After Stopword Removal: {filtered_tokens}")

Original Tokens: ['Twinkle', ',', 'twinkle', ',', 'little', 'star', ',', 'How', 'I', 'wonder', 'what', 'you', 'are', '!', 'Up', 'above', 'the', 'world', 'so', 'high', ',', 'Like', 'a', 'diamond', 'in', 'the', 'sky', '.']
After Stopword Removal: ['Twinkle', ',', 'twinkle', ',', 'little', 'star', ',', 'wonder', '!', 'world', 'high', ',', 'Like', 'diamond', 'sky', '.']


In [13]:
# Keep only purely alphabetic tokens that spaCy does not flag as stop words.
filtered_tokens = [tok.text for tok in doc if tok.is_alpha and not tok.is_stop]

# Before/after view of the spaCy filtering.
print(f"Original Tokens: {[tok.text for tok in doc]}")
print(f"After Stopword Removal: {filtered_tokens}")


Original Tokens: ['Twinkle', ',', 'twinkle', ',', 'little', 'star', ',', '\n', 'How', 'I', 'wonder', 'what', 'you', 'are', '!', '\n', 'Up', 'above', 'the', 'world', 'so', 'high', ',', '\n', 'Like', 'a', 'diamond', 'in', 'the', 'sky', '.']
After Stopword Removal: ['Twinkle', 'twinkle', 'little', 'star', 'wonder', 'world', 'high', 'Like', 'diamond', 'sky']


In [15]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Sample text (Twinkle stanza or any text)
text = "Twinkle twinkle little star, how I wonder what you are."

# Process with spaCy
doc = nlp(text)

# Filter out stopwords only: there is no punctuation check here, so ',' and
# '.' remain in the result
filtered_tokens = [token.text for token in doc 
                   if not token.is_stop]

# Show the token list before and after filtering
print("Original Tokens:", [token.text for token in doc])
print("After Stopword Removal:", filtered_tokens)


Original Tokens: ['Twinkle', 'twinkle', 'little', 'star', ',', 'how', 'I', 'wonder', 'what', 'you', 'are', '.']
After Stopword Removal: ['Twinkle', 'twinkle', 'little', 'star', ',', 'wonder', '.']


# Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
# NLTK download. Fetch it here so lemmatize() does not raise LookupError on a
# machine where the corpus has never been installed.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

In [19]:
# Lemmatize every token with the lemmatizer's defaults (no POS hint is passed).
simple_lemmas = list(map(lemmatizer.lemmatize, tokens))

In [20]:
# Bare expression: the notebook displays the lemma list as this cell's output.
simple_lemmas

['Twinkle',
 ',',
 'twinkle',
 ',',
 'little',
 'star',
 ',',
 'How',
 'I',
 'wonder',
 'what',
 'you',
 'are',
 '!',
 'Up',
 'above',
 'the',
 'world',
 'so',
 'high',
 ',',
 'Like',
 'a',
 'diamond',
 'in',
 'the',
 'sky',
 '.']

In [31]:
# Single-word check with no POS hint; the recorded result is 'best', i.e. the
# word comes back unchanged.
lemmatizer.lemmatize('best')

'best'

In [28]:
# Porter-stem a single (misspelled) word; the recorded result is 'congragul',
# so the suffix is stripped even though the input is not a correctly spelled word.
porter.stem('congragulate')

'congragul'

In [21]:
# Small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Stanza to lemmatize, assembled from its four lines.
text = "\n".join([
    "Twinkle twinkle little star,",
    "How I wonder what you are.",
    "Up above the world so high,",
    "Like a diamond in the sky.",
])

# Run the pipeline over the stanza.
doc = nlp(text)

# Surface forms and lemmas, each space-joined under its own heading.
print("Original Text:", " ".join(tok.text for tok in doc), sep="\n")
print("\nLemmatized Text:", " ".join(tok.lemma_ for tok in doc), sep="\n")

Original Text:
Twinkle twinkle little star , 
 How I wonder what you are . 
 Up above the world so high , 
 Like a diamond in the sky .

Lemmatized Text:
twinkle twinkle little star , 
 how I wonder what you be . 
 up above the world so high , 
 like a diamond in the sky .


# POS Tagging

In [32]:
import nltk
from nltk import word_tokenize, pos_tag

# Make sure the tokenizer and tagger data are available locally.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Stanza to tag, assembled from its four lines.
text = "\n".join([
    "Twinkle twinkle little star,",
    "How I wonder what you are.",
    "Up above the world so high,",
    "Like a diamond in the sky.",
])

# Tokenize, then attach a part-of-speech tag to every token.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# One "word --> tag" row per token, with the word padded to 10 columns.
for word, tag in pos_tags:
    print(word.ljust(10), "-->", tag)


[nltk_data] Downloading package punkt to C:\Users\keert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\keert/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Twinkle    --> NNP
twinkle    --> VBD
little     --> JJ
star       --> NN
,          --> ,
How        --> WRB
I          --> PRP
wonder     --> VBP
what       --> WP
you        --> PRP
are        --> VBP
.          --> .
Up         --> IN
above      --> IN
the        --> DT
world      --> NN
so         --> RB
high       --> JJ
,          --> ,
Like       --> IN
a          --> DT
diamond    --> NN
in         --> IN
the        --> DT
sky        --> NN
.          --> .


In [33]:
import spacy

# Small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Stanza to tag, assembled from its four lines.
text = "\n".join([
    "Twinkle twinkle little star,",
    "How I wonder what you are.",
    "Up above the world so high,",
    "Like a diamond in the sky.",
])

# Run the pipeline over the stanza.
doc = nlp(text)

# One row per token: text, coarse POS and fine-grained tag in fixed-width columns.
for token in doc:
    print("{:10} --> {:10} ({})".format(token.text, token.pos_, token.tag_))

Twinkle    --> NOUN       (NN)
twinkle    --> NOUN       (NN)
little     --> ADJ        (JJ)
star       --> NOUN       (NN)
,          --> PUNCT      (,)

          --> SPACE      (_SP)
How        --> SCONJ      (WRB)
I          --> PRON       (PRP)
wonder     --> VERB       (VBP)
what       --> PRON       (WP)
you        --> PRON       (PRP)
are        --> AUX        (VBP)
.          --> PUNCT      (.)

          --> SPACE      (_SP)
Up         --> ADP        (RP)
above      --> ADP        (IN)
the        --> DET        (DT)
world      --> NOUN       (NN)
so         --> ADV        (RB)
high       --> ADJ        (JJ)
,          --> PUNCT      (,)

          --> SPACE      (_SP)
Like       --> ADP        (IN)
a          --> DET        (DT)
diamond    --> NOUN       (NN)
in         --> ADP        (IN)
the        --> DET        (DT)
sky        --> NOUN       (NN)
.          --> PUNCT      (.)


# Named Entity Recognition

In [36]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Download if not already done: ne_chunk relies on the pre-trained
# 'maxent_ne_chunker' model and the 'words' corpus, neither of which is fetched
# anywhere else in this notebook. Without them the call raises LookupError on
# a machine that has never installed these resources.
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Input text
text = """
Pussy cat, pussy cat, where have you been?
I've been to London to visit the Queen.
Pussy cat, pussy cat, what did you there?
I frightened a little mouse under the chair.
"""

# Tokenize & POS tag
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Named Entity Recognition
ne_tree = ne_chunk(pos_tags)

# Print Named Entities
for subtree in ne_tree:
    # Entity chunks are subtrees that carry a label; other children of the
    # tree are skipped.
    if hasattr(subtree, 'label'):
        print(f"{' '.join(c[0] for c in subtree)} --> {subtree.label()}")

Pussy --> GPE
London --> GPE
Pussy --> PERSON


In [37]:
import spacy

# Small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Rhyme text: four lines wrapped in a leading and a trailing newline.
text = "\n" + "\n".join([
    "Pussy cat, pussy cat, where have you been?",
    "I've been to London to visit the Queen.",
    "Pussy cat, pussy cat, what did you there?",
    "I frightened a little mouse under the chair.",
]) + "\n"

# Run the pipeline over the rhyme.
doc = nlp(text)

# One row per detected entity: its text padded to 20 columns, then its label.
for ent in doc.ents:
    print("{:20} --> {}".format(ent.text, ent.label_))

Pussy                --> PERSON
London               --> GPE
Pussy                --> PERSON


# 