In [None]:
# Lab Assignment 1: Text Preprocessing and Regular Expressions
# • Implement tokenization, stemming, and lemmatization using NLTK and spaCy.
# • Use regular expressions for tasks such as extracting email addresses, phone numbers, and hashtags from a text dataset of at least 5 pages.

In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

# Fetch every NLTK data package this notebook relies on, one after another
for resource_id in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource_id)

from nltk.stem import WordNetLemmatizer

# Load the small English spaCy pipeline installed in the cell above
nlp = spacy.load("en_core_web_sm")

# Short sample input; swap in a multi-page document for the real assignment.
# Written as one literal per line so the leading and trailing newlines are explicit.
text = (
    "\n"
    "Contact us at info@example.com or support@company.org.\n"
    "You can also call +1-800-555-0199 or message us on Twitter #SupportTeam.\n"
    "Dr. John Smith joined Stanford University in 2021 and works in AI research.\n"
)

### 1. Tokenization (NLTK)
# Split the sample text into word-level tokens; `tokens` feeds the stemming
# and NLTK lemmatization steps further down.
tokens = word_tokenize(text)
print("\n=== Tokenization (NLTK) ===", tokens, sep="\n")

### 2. Stemming (NLTK)
# Run the Porter stemmer over every token produced by the tokenization step.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, tokens))
print("\n=== Stemming (NLTK) ===", stemmed, sep="\n")

### 3. Lemmatization (NLTK & spaCy)
# WordNetLemmatizer.lemmatize() treats every word as a noun unless a POS is
# passed, which turns the pronoun 'us' into 'u' and leaves verbs such as
# 'joined' untouched. Tag the tokens first and lemmatize with the matching POS.

# Tagger data needed by nltk.pos_tag; both ids are requested because the
# resource name differs between NLTK releases.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# First letter of a Penn Treebank tag -> POS code accepted by lemmatize()
penn_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

lemmatizer = WordNetLemmatizer()
lemmas_nltk = []
for word, tag in nltk.pos_tag(tokens):
    wordnet_pos = penn_to_wordnet.get(tag[:1])
    # Tokens outside the four mapped classes (pronouns, punctuation, numbers, ...)
    # are kept verbatim so the result still has one entry per token.
    lemmas_nltk.append(
        lemmatizer.lemmatize(word, pos=wordnet_pos) if wordnet_pos else word
    )

# spaCy lemmatizes as part of its pipeline; collect one lemma per spaCy token
doc = nlp(text)
lemmas_spacy = [token.lemma_ for token in doc]

print("\n=== Lemmatization (NLTK) ===")
print(lemmas_nltk)
print("\n=== Lemmatization (spaCy) ===")
print(lemmas_spacy)

### 4. Regex: Extract emails, phone numbers, hashtags
# Email: local part (word chars, '.', '+', '-'), '@', then a domain made of one
# or more dot-separated labels that may contain hyphens. Greedy matching with a
# repeated label group keeps multi-label domains such as mail.example.co.uk
# intact, while the closing \b drops a sentence-ending period.
emails = re.findall(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b', text)
# Phone: optional '+', then at least 10 characters of digits/hyphens that start
# and end with a digit. Numbers written with spaces, dots or parentheses are
# not matched by this pattern.
phones = re.findall(r'\+?\d[\d\-]{8,}\d', text)
# Hashtag: '#' immediately followed by one or more word characters.
hashtags = re.findall(r'#\w+', text)

print("\n=== Extracted Emails ===")
print(emails)

print("\n=== Extracted Phone Numbers ===")
print(phones)

print("\n=== Extracted Hashtags ===")
print(hashtags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



=== Tokenization (NLTK) ===
['Contact', 'us', 'at', 'info', '@', 'example.com', 'or', 'support', '@', 'company.org', '.', 'You', 'can', 'also', 'call', '+1-800-555-0199', 'or', 'message', 'us', 'on', 'Twitter', '#', 'SupportTeam', '.', 'Dr.', 'John', 'Smith', 'joined', 'Stanford', 'University', 'in', '2021', 'and', 'works', 'in', 'AI', 'research', '.']

=== Stemming (NLTK) ===
['contact', 'us', 'at', 'info', '@', 'example.com', 'or', 'support', '@', 'company.org', '.', 'you', 'can', 'also', 'call', '+1-800-555-0199', 'or', 'messag', 'us', 'on', 'twitter', '#', 'supportteam', '.', 'dr.', 'john', 'smith', 'join', 'stanford', 'univers', 'in', '2021', 'and', 'work', 'in', 'ai', 'research', '.']

=== Lemmatization (NLTK) ===
['Contact', 'u', 'at', 'info', '@', 'example.com', 'or', 'support', '@', 'company.org', '.', 'You', 'can', 'also', 'call', '+1-800-555-0199', 'or', 'message', 'u', 'on', 'Twitter', '#', 'SupportTeam', '.', 'Dr.', 'John', 'Smith', 'joined', 'Stanford', 'University', 'in