In [2]:
# =============================
# ASSIGNMENT 9 - NLP with Python
# =============================

import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# ----------------------------------------
# Q1. Text Preprocessing
# ----------------------------------------

text = """I love exploring the intersection of technology and human behavior.
Artificial Intelligence is transforming industries, especially healthcare and education.
My fascination lies in how machines learn to think and make decisions.
Books and podcasts keep me updated on cutting-edge advancements.
I'm particularly intrigued by ethical implications of autonomous systems."""

text_lower = text.lower()
text_clean = text_lower.translate(str.maketrans('', '', string.punctuation))

word_tokens = word_tokenize(text_clean)
sent_tokens = sent_tokenize(text_clean)

stop_words = set(stopwords.words("english"))
filtered_words = [word for word in word_tokens if word not in stop_words]


print("\nQ1 - Word Frequency Distribution (filtered):")
fdist = FreqDist(filtered_words)
for word, freq in fdist.most_common():
    print(f"{word}: {freq}")







[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...



Q1 - Word Frequency Distribution (filtered):
love: 1
exploring: 1
intersection: 1
technology: 1
human: 1
behavior: 1
artificial: 1
intelligence: 1
transforming: 1
industries: 1
especially: 1
healthcare: 1
education: 1
fascination: 1
lies: 1
machines: 1
learn: 1
think: 1
make: 1
decisions: 1
books: 1
podcasts: 1
keep: 1
updated: 1
cuttingedge: 1
advancements: 1
im: 1
particularly: 1
intrigued: 1
ethical: 1
implications: 1
autonomous: 1
systems: 1


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# ----------------------------------------
# Q2. Stemming and Lemmatization
# ----------------------------------------

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

print("\nQ2 - Stemming Comparison:")
print("{:<15} {:<15} {:<15}".format("Word", "Porter", "Lancaster"))
for word in filtered_words:
    print("{:<15} {:<15} {:<15}".format(word, porter.stem(word), lancaster.stem(word)))

print("\nQ2 - Lemmatization:")
for word in filtered_words:
    print(f"{word} → {lemmatizer.lemmatize(word)}")


Q2 - Stemming Comparison:
Word            Porter          Lancaster      
love            love            lov            
exploring       explor          expl           
intersection    intersect       intersect      
technology      technolog       technolog      
human           human           hum            
behavior        behavior        behavy         
artificial      artifici        art            
intelligence    intellig        intellig       
transforming    transform       transform      
industries      industri        industry       
especially      especi          espec          
healthcare      healthcar       healthc        
education       educ            educ           
fascination     fascin          fascin         
lies            lie             lie            
machines        machin          machin         
learn           learn           learn          
think           think           think          
make            make            mak            
decisions    

In [5]:
# ----------------------------------------
# Q3. Regular Expressions and Splitting
# ----------------------------------------

long_words = re.findall(r'\b\w{6,}\b', text)
print("\nQ3 - Words > 5 letters:", long_words)


numbers = re.findall(r'\b\d+\b', text)
print("Q3 - Numbers:", numbers)


capitalized = re.findall(r'\b[A-Z][a-z]*\b', text)
print("Q3 - Capitalized Words:", capitalized)


alpha_words = re.findall(r'\b[a-zA-Z]+\b', text)
print("Q3 - Alphabet-only Words:", alpha_words)


vowel_words = re.findall(r'\b[aeiouAEIOU]\w*', text)
print("Q3 - Words starting with vowels:", vowel_words)


Q3 - Words > 5 letters: ['exploring', 'intersection', 'technology', 'behavior', 'Artificial', 'Intelligence', 'transforming', 'industries', 'especially', 'healthcare', 'education', 'fascination', 'machines', 'decisions', 'podcasts', 'updated', 'cutting', 'advancements', 'particularly', 'intrigued', 'ethical', 'implications', 'autonomous', 'systems']
Q3 - Numbers: []
Q3 - Capitalized Words: ['I', 'Artificial', 'Intelligence', 'My', 'Books', 'I']
Q3 - Alphabet-only Words: ['I', 'love', 'exploring', 'the', 'intersection', 'of', 'technology', 'and', 'human', 'behavior', 'Artificial', 'Intelligence', 'is', 'transforming', 'industries', 'especially', 'healthcare', 'and', 'education', 'My', 'fascination', 'lies', 'in', 'how', 'machines', 'learn', 'to', 'think', 'and', 'make', 'decisions', 'Books', 'and', 'podcasts', 'keep', 'me', 'updated', 'on', 'cutting', 'edge', 'advancements', 'I', 'm', 'particularly', 'intrigued', 'by', 'ethical', 'implications', 'of', 'autonomous', 'systems']
Q3 - Word

In [6]:
# ----------------------------------------
# Q4. Custom Tokenization and Regex Cleaning
# ----------------------------------------

def custom_tokenizer(text):
    text = re.sub(r'[^\w\s\-\']', '', text)
    return re.findall(r"\b(?:\d+\.\d+|\d+|\w+(?:-\w+)*|\w+'\w+)\b", text)

custom_tokens = custom_tokenizer(text)
print("\nQ4 - Custom Tokens:", custom_tokens)

text_sub = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', text)
text_sub = re.sub(r'http[s]?://\S+|www\.\S+', '<URL>', text_sub)
text_sub = re.sub(r'(\+91\s?\d{10}|\d{3}[-\s]\d{3}[-\s]\d{4})', '<PHONE>', text_sub)
print("Q4 - Cleaned Text with Substitutions:")
print(text_sub)



Q4 - Custom Tokens: ['I', 'love', 'exploring', 'the', 'intersection', 'of', 'technology', 'and', 'human', 'behavior', 'Artificial', 'Intelligence', 'is', 'transforming', 'industries', 'especially', 'healthcare', 'and', 'education', 'My', 'fascination', 'lies', 'in', 'how', 'machines', 'learn', 'to', 'think', 'and', 'make', 'decisions', 'Books', 'and', 'podcasts', 'keep', 'me', 'updated', 'on', 'cutting-edge', 'advancements', 'I', 'm', 'particularly', 'intrigued', 'by', 'ethical', 'implications', 'of', 'autonomous', 'systems']
Q4 - Cleaned Text with Substitutions:
I love exploring the intersection of technology and human behavior. 
Artificial Intelligence is transforming industries, especially healthcare and education. 
My fascination lies in how machines learn to think and make decisions. 
Books and podcasts keep me updated on cutting-edge advancements. 
I'm particularly intrigued by ethical implications of autonomous systems.
