In [1]:
import nltk

print("Downloading NLTK resources...")
nltk.download('punkt')
nltk.download('wordnet')
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [3]:
corpus = [
    "He wasn't studying, but he's a great learner.",
    "I've been told they're the best players.",
    "It's a beautiful day; let's not waste it."
]

contractions_map = {
    "wasn't": "was not",
    "he's": "he is",
    "i've": "i have",
    "they're": "they are",
    "it's": "it is",
    "let's": "let us"
}

print("--- Original Corpus ---")
for doc in corpus:
    print(doc)

--- Original Corpus ---
He wasn't studying, but he's a great learner.
I've been told they're the best players.
It's a beautiful day; let's not waste it.


In [4]:
def expand_contractions(text, contractions_map):
    for contraction, expansion in contractions_map.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, text, flags=re.IGNORECASE)
    return text

expanded_corpus = [expand_contractions(doc, contractions_map) for doc in corpus]

print("--- Corpus After Contraction Expansion ---")
for doc in expanded_corpus:
    print(doc)

--- Corpus After Contraction Expansion ---
He was not studying, but he is a great learner.
i have been told they are the best players.
it is a beautiful day; let us not waste it.


In [5]:
stemmer = PorterStemmer()

print("--- Stemming Results ---")
for doc in expanded_corpus:
    tokens = word_tokenize(doc)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    
    print(f"\nOriginal Expanded: {doc}")
    print(f"Stemmed Tokens:    {' '.join(stemmed_tokens)}")

--- Stemming Results ---

Original Expanded: He was not studying, but he is a great learner.
Stemmed Tokens:    he wa not studi , but he is a great learner .

Original Expanded: i have been told they are the best players.
Stemmed Tokens:    i have been told they are the best player .

Original Expanded: it is a beautiful day; let us not waste it.
Stemmed Tokens:    it is a beauti day ; let us not wast it .


In [6]:
lemmatizer = WordNetLemmatizer()

print("--- Lemmatization Results ---")
for doc in expanded_corpus:
    tokens = word_tokenize(doc)
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
    
    print(f"\nOriginal Expanded: {doc}")
    print(f"Lemmatized Tokens: {' '.join(lemmatized_tokens)}")

--- Lemmatization Results ---

Original Expanded: He was not studying, but he is a great learner.
Lemmatized Tokens: He be not study , but he be a great learner .

Original Expanded: i have been told they are the best players.
Lemmatized Tokens: i have be tell they be the best players .

Original Expanded: it is a beautiful day; let us not waste it.
Lemmatized Tokens: it be a beautiful day ; let us not waste it .
