In [1]:


import nltk

# NLTK data packages requested by this notebook, fetched in this order.
# If a package is already installed, NLTK just reports "already up-to-date".
# NOTE(review): newer NLTK releases may additionally look for resources such as
# 'punkt_tab' or 'averaged_perceptron_tagger_eng' - confirm against the
# installed NLTK version if a later cell raises LookupError.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
)

# nltk.download() returns True on success and False on failure (it does not
# raise by default), so collect the failures instead of ignoring the result.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]

if failed_resources:
    # Report the problem here rather than printing a misleading success message.
    print("Could not download NLTK resources:", ", ".join(failed_resources))
else:
    print("NLTK resources downloaded (or already available).")

NLTK resources downloaded (or already available).


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/giridharsripathi/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package v

## 1) Tokenization (Word Tokenization)

In [3]:
# Use-case: Split a sentence into word-level tokens (words and punctuation marks).
# Why it matters: Tokens, not raw strings, are the input to most NLP steps (classification, NER, etc.).

from nltk.tokenize import word_tokenize

text = "IBM Watson is transforming enterprise AI."

# Punctuation becomes its own token, so the final "." is split off from "AI".
tokens = word_tokenize(text)

print("Original text:", text)
print("Tokens:", tokens)


Original text: IBM Watson is transforming enterprise AI.
Tokens: ['IBM', 'Watson', 'is', 'transforming', 'enterprise', 'AI', '.']


## 2) Sentence Segmentation (Sentence Tokenization)

In [4]:
# Use-case: Break a paragraph into a list of sentences.
# Why it matters: Sentence-level units feed summarization, per-sentence sentiment, sentence embeddings, etc.

from nltk.tokenize import sent_tokenize

text = "IBM builds AI. It works on Watson. It helps companies."

# Each returned sentence is a string that keeps its closing punctuation.
sentences = sent_tokenize(text)

print("Original text:", text)
print("Sentences:", sentences)


Original text: IBM builds AI. It works on Watson. It helps companies.
Sentences: ['IBM builds AI.', 'It works on Watson.', 'It helps companies.']


## 3) Stopword Removal

In [5]:
# Use-case: Drop very common words ("the", "is", "a", ...) that usually carry little meaning.
# Why it matters: Less noise for keyword extraction and classical ML models.

from nltk.corpus import stopwords

words = ["this", "is", "a", "cloud", "platform"]

# English stopword list, held as a set for membership tests.
stop_words = set(stopwords.words("english"))

# The comparison is done on the lowercased word, but the surviving words
# keep their original casing.
filtered = list(filter(lambda w: w.lower() not in stop_words, words))

print("Original:", words)
print("Filtered:", filtered)


Original: ['this', 'is', 'a', 'cloud', 'platform']
Filtered: ['cloud', 'platform']


## 4) Stemming (Porter Stemmer)

In [6]:
# Use-case: Cut words down to a crude base form (the stem), e.g. "running" -> "run".
# Why it matters: Lets search / retrieval / classical models match variants of the same word.

from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["running", "runs", "ran", "runner"]

# Stemming is rule-based: the result is not guaranteed to be a dictionary word,
# and forms such as "ran" or "runner" are not reduced to "run".
stems = list(map(ps.stem, words))

print("Words:", words)
print("Stems:", stems)


Words: ['running', 'runs', 'ran', 'runner']
Stems: ['run', 'run', 'ran', 'runner']


## 5) Lemmatization (WordNet Lemmatizer)

In [7]:
# Use-case: Map a word to its dictionary base form (the lemma).
# Why it matters: Tends to be cleaner than stemming because the result is meant to be a valid word.

from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()
words = ["better", "running", "cars"]

# No POS is passed here, so every word is treated as a noun (the default).
# That is why only "cars" changes; "better" and "running" come back as-is.
lemmas = list(map(lemm.lemmatize, words))

print("Words:", words)
print("Lemmas:", lemmas)


Words: ['better', 'running', 'cars']
Lemmas: ['better', 'running', 'car']


## 6) Part-of-Speech (POS) Tagging

In [8]:
# Use-case: Label every token with its grammatical role (noun, verb, adjective, ...).
# Why it matters: POS tags drive chunking, parsing, and keyword extraction (e.g. keep only nouns).

from nltk import pos_tag
from nltk.tokenize import word_tokenize

text = "IBM Watson builds smart systems"

# The tagger works on a token list, not on the raw string.
tokens = word_tokenize(text)

# Result is a list of (token, tag) pairs, e.g. ('IBM', 'NNP').
tagged = pos_tag(tokens)

print("Text:", text)
print("POS tagged:", tagged)


Text: IBM Watson builds smart systems
POS tagged: [('IBM', 'NNP'), ('Watson', 'NNP'), ('builds', 'VBZ'), ('smart', 'JJ'), ('systems', 'NNS')]


## 7) Named Entity Recognition (NER) with Chunker

In [9]:
# Use-case: Find named entities such as organizations, locations, and people.
# Why it matters: Feeds incident analysis, ticket classification, compliance checks, and entity indexing.

from nltk import ne_chunk, pos_tag
from nltk.tokenize import word_tokenize

sentence = "IBM is located in New York"

# Pipeline: raw sentence -> tokens -> (token, POS tag) pairs.
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)

# ne_chunk groups the tagged tokens into a tree; entity spans become labelled
# subtrees (e.g. ORGANIZATION, GPE) and the remaining tokens stay as leaves.
entities_tree = ne_chunk(tagged)

print("Sentence:", sentence)
print("NER tree:", entities_tree)


Sentence: IBM is located in New York
NER tree: (S
  (ORGANIZATION IBM/NNP)
  is/VBZ
  located/VBN
  in/IN
  (GPE New/NNP York/NNP))


## 8) Frequency Distribution (Most Common Words)

In [10]:
# Use-case: Tally how many times each token appears.
# Why it matters: A basic building block for keyword discovery and vocabulary construction.

from nltk.tokenize import word_tokenize
from nltk import FreqDist

text = "AI AI ML ML ML NLP"
tokens = word_tokenize(text)

# FreqDist maps each distinct token to its occurrence count.
fd = FreqDist(tokens)

print("Tokens:", tokens)
# most_common() with no argument lists every token, highest count first.
print("Most common:", fd.most_common())


Tokens: ['AI', 'AI', 'ML', 'ML', 'ML', 'NLP']
Most common: [('ML', 3), ('AI', 2), ('NLP', 1)]


## 9) Bigram Analysis

In [11]:
# Use-case: Produce every pair of adjacent tokens (bigrams).
# Why it matters: Helps mine two-word phrases such as "machine learning" or "data science".

from nltk.tokenize import word_tokenize
from nltk import bigrams

text = "machine learning models are powerful"
tokens = word_tokenize(text)

# bigrams() yields consecutive pairs lazily; materialize them into a list
# so they can be printed and reused.
bg = [pair for pair in bigrams(tokens)]

print("Tokens:", tokens)
print("Bigrams:", bg)


Tokens: ['machine', 'learning', 'models', 'are', 'powerful']
Bigrams: [('machine', 'learning'), ('learning', 'models'), ('models', 'are'), ('are', 'powerful')]


## 10) Trigram Analysis

In [12]:
# Use-case: Produce every run of three adjacent tokens (trigrams).
# Why it matters: Three-word windows capture more context than bigrams for phrase patterns.

from nltk.tokenize import word_tokenize
from nltk import trigrams

text = "AI models learn from data"
tokens = word_tokenize(text)

# Materialize the consecutive triples into a list so they can be printed.
tg = [triple for triple in trigrams(tokens)]

print("Tokens:", tokens)
print("Trigrams:", tg)


Tokens: ['AI', 'models', 'learn', 'from', 'data']
Trigrams: [('AI', 'models', 'learn'), ('models', 'learn', 'from'), ('learn', 'from', 'data')]


## 11) Text Cleaning (Regex-based)

In [14]:
# Use-case: Strip special characters and digits, keeping only letters and spaces.
# Why it matters: A common preprocessing step before classical NLP models or keyword extraction.

import re

text = "IBM!! Watson?? 2025"

# Signature reminder: re.sub(pattern, replacement, string).
# The character class matches anything that is NOT an ASCII letter or a space,
# and every match is replaced with the empty string. Spaces are left alone, so
# the space that preceded "2025" remains as trailing whitespace in the result.
non_letter_or_space = re.compile(r"[^a-zA-Z ]")
clean = non_letter_or_space.sub("", text)

print("Original:", text)
print("Cleaned:", clean)


Original: IBM!! Watson?? 2025
Cleaned: IBM Watson 


## 12) Bag of Words (CountVectorizer)

In [15]:
# Use-case: Turn a small collection of documents into a bag-of-words count matrix.
# Why it matters: Classical ML models (LogReg, NB, SVM) commonly consume this representation.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["AI is powerful", "AI is smart"]

# Step 1: learn the vocabulary from the documents.
vectorizer = CountVectorizer()
vectorizer.fit(docs)

# Step 2: encode the same documents as rows of per-term counts.
X = vectorizer.transform(docs)

print("Vocabulary:", vectorizer.get_feature_names_out())
# toarray() expands the sparse result into a dense array for display.
print("BoW matrix:\n", X.toarray())


Vocabulary: ['ai' 'is' 'powerful' 'smart']
BoW matrix:
 [[1 1 1 0]
 [1 1 0 1]]


## 13) TF-IDF (Term importance)

In [16]:
# Use-case: Encode text as TF-IDF features (common terms are down-weighted, distinctive terms up-weighted).
# Why it matters: A strong baseline for search, similarity, and classical ML.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["AI builds systems", "AI builds intelligence"]

# Learn the vocabulary and IDF weights first, then encode the same documents.
tfidf = TfidfVectorizer()
tfidf.fit(docs)
X = tfidf.transform(docs)

print("Features:", tfidf.get_feature_names_out())
# Terms shared by both documents ('ai', 'builds') end up with a lower weight
# than the term unique to each document.
print("TF-IDF matrix:\n", X.toarray())


Features: ['ai' 'builds' 'intelligence' 'systems']
TF-IDF matrix:
 [[0.50154891 0.50154891 0.         0.70490949]
 [0.50154891 0.50154891 0.70490949 0.        ]]


## 14) Simple Text Similarity (Jaccard over token sets)

In [18]:
# Use-case: Estimate how similar two short texts are from their shared tokens.
# Why it matters: A quick baseline for deduplication / clustering heuristics.

from nltk.tokenize import word_tokenize

t1 = "AI builds models"
t2 = "AI builds systems"

# Lowercase before tokenizing so the comparison is case-insensitive;
# sets discard duplicate tokens and ordering.
set1 = set(word_tokenize(t1.lower()))
set2 = set(word_tokenize(t2.lower()))

# Jaccard similarity = |intersection| / |union|
# NOTE(review): if both texts produce no tokens the union is empty and this
# division raises ZeroDivisionError - not reachable with the literals above.
shared_tokens = set1.intersection(set2)
all_tokens = set1.union(set2)
similarity = len(shared_tokens) / len(all_tokens)

print("Text1:", t1)
print("Text2:", t2)
print("Jaccard similarity:", similarity)


Text1: AI builds models
Text2: AI builds systems
Jaccard similarity: 0.5


## 15) Spam Detection (Toy Naive Bayes with NLTK)

In [20]:
# Use-case: Label messages as spam or ham with NLTK's NaiveBayesClassifier.
# Why it matters: A fast demonstration of a classic NLP classification approach.

from nltk.classify import NaiveBayesClassifier

# Toy dataset, for demonstration only. Each message is a feature dict in which
# a key set to True marks that word as present.
feature_sets = [
    {"free": True, "win": True},
    {"hello": True, "meeting": True},
    {"free": True, "prize": True},
    {"project": True, "update": True},
]
# Labels line up position-by-position with feature_sets.
labels = ["spam", "ham", "spam", "ham"]

# The classifier expects a list of (feature_dict, label) pairs.
train = list(zip(feature_sets, labels))

model = NaiveBayesClassifier.train(train)

# Classify an unseen message expressed in the same feature format.
test_features = {"free": True, "offer": True}
prediction = model.classify(test_features)

print("Test features:", test_features)
print("Prediction:", prediction)


Test features: {'free': True, 'offer': True}
Prediction: spam


## 16) Sentiment Analysis (VADER)

In [22]:
# Use-case: Score a sentence's sentiment (positive / negative / neutral).
# Why it matters: Applies to feedback analysis, survey analytics, and social media monitoring.

from nltk.sentiment import SentimentIntensityAnalyzer

text = "IBM Watson is amazing and very helpful."

sia = SentimentIntensityAnalyzer()

# polarity_scores returns a dict with the keys 'neg', 'neu', 'pos' and 'compound'.
scores = sia.polarity_scores(text)

print("Text:", text)
print("Scores:", scores)


Text: IBM Watson is amazing and very helpful.
Scores: {'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.7841}


## 17) Chunking (Phrase Extraction using POS patterns)

In [None]:
# Use-case: Pull out noun phrases with a small chunk grammar over POS tags.
# Why it matters: A quick route to "key phrases" such as 'smart system' or 'cloud platform'.

import nltk
from nltk import RegexpParser, pos_tag
from nltk.tokenize import word_tokenize

# NP (noun phrase) rule: an optional determiner (DT), then any number of
# adjectives (JJ), then one noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"
parser = RegexpParser(grammar)

text = "The smart system"

# The parser operates on (token, POS tag) pairs, so tokenize and tag first.
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

# Tag sequences matching the rule are grouped under an NP node in the tree.
tree = parser.parse(tagged)

print("Text:", text)
print("Chunk tree:", tree)


## 18) Keyword Extraction (FreqDist + Stopword filtering)

In [None]:
# Use-case: Rank keywords by frequency after basic text cleaning.
# Why it matters: A quick baseline keyword extractor for tickets / logs / documents.

import re

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "AI models build predictive systems. Predictive systems help enterprises."

# Steps 1-2: lowercase, then turn every character that is not a-z or a space
# into a space (so punctuation separates words instead of gluing them together).
text = re.sub(r"[^a-z ]", " ", text.lower())

# Step 3: tokenize the cleaned text.
tokens = word_tokenize(text)

# Step 4: keep only tokens longer than two characters that are not stopwords.
stop_words = set(stopwords.words("english"))
tokens = [t for t in tokens if len(t) > 2 and t not in stop_words]

# Step 5: count the remaining tokens and show the five most frequent.
fd = FreqDist(tokens)
print("Top keywords:", fd.most_common(5))
