In [8]:
# N-GRAMS EXAMPLES
from collections import Counter

text = "natural language processing is amazing and natural language is powerful"
words = text.split()

# Bigrams (2-grams): pair each token with the one that follows it.
# zip() stops at the shorter sequence, so the last token never starts a pair.
bigrams = list(zip(words, words[1:]))
print("Bigrams:", bigrams)
print("Bigram counts:", Counter(bigrams))

# Trigrams (3-grams): same idea with a third view shifted by two tokens.
trigrams = list(zip(words, words[1:], words[2:]))
print("\nTrigrams:", trigrams)

# General n-gram function
def generate_ngrams(text: str, n: int) -> list:
    """Return every contiguous word-level n-gram of ``text``.

    The text is tokenized on whitespace with ``str.split()`` and a window of
    ``n`` tokens is slid across the tokens one position at a time.

    Args:
        text: The string to tokenize.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        A list of ``n``-length tuples in order of appearance. The list is
        empty when ``text`` contains fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard, n == 0 yields len(words) + 1 empty tuples and a
    # negative n yields tuples of varying length from negative-end slices.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = text.split()
    # range() is empty when there are fewer than n tokens, giving [].
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

print("\n4-grams:", generate_ngrams(text, 4))

# Character-level n-grams: the same sliding window, applied to the
# characters of a single word instead of the words of a sentence.
word = "python"
char_bigrams = [first + second for first, second in zip(word, word[1:])]
print("\nCharacter bigrams:", char_bigrams)







Bigrams: [('natural', 'language'), ('language', 'processing'), ('processing', 'is'), ('is', 'amazing'), ('amazing', 'and'), ('and', 'natural'), ('natural', 'language'), ('language', 'is'), ('is', 'powerful')]
Bigram counts: Counter({('natural', 'language'): 2, ('language', 'processing'): 1, ('processing', 'is'): 1, ('is', 'amazing'): 1, ('amazing', 'and'): 1, ('and', 'natural'): 1, ('language', 'is'): 1, ('is', 'powerful'): 1})

Trigrams: [('natural', 'language', 'processing'), ('language', 'processing', 'is'), ('processing', 'is', 'amazing'), ('is', 'amazing', 'and'), ('amazing', 'and', 'natural'), ('and', 'natural', 'language'), ('natural', 'language', 'is'), ('language', 'is', 'powerful')]

4-grams: [('natural', 'language', 'processing', 'is'), ('language', 'processing', 'is', 'amazing'), ('processing', 'is', 'amazing', 'and'), ('is', 'amazing', 'and', 'natural'), ('amazing', 'and', 'natural', 'language'), ('and', 'natural', 'language', 'is'), ('natural', 'language', 'is', 'powerful')]

In [9]:
# STEMMING EXAMPLES
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
import nltk
# NOTE(review): nothing in this cell tokenizes text, so this download is
# presumably needed by another cell -- confirm before removing it.
nltk.download('punkt')

words = ["running", "runs", "ran", "runner", "easily", "fairly", "playing", "played"]

# The three stemmers being compared:
#   Porter    - most common
#   Snowball  - supports multiple languages (English selected here)
#   Lancaster - most aggressive
porter = PorterStemmer()
snowball = SnowballStemmer("english")
lancaster = LancasterStemmer()

# Run the same vocabulary through each stemmer so the outputs line up
# and can be compared section by section.
stemmer_sections = (
    ("\n--- Porter Stemmer ---", porter),
    ("\n--- Snowball Stemmer ---", snowball),
    ("\n--- Lancaster Stemmer ---", lancaster),
)
for section_header, stemmer in stemmer_sections:
    print(section_header)
    for word in words:
        print(f"{word} -> {stemmer.stem(word)}")


--- Porter Stemmer ---
running -> run
runs -> run
ran -> ran
runner -> runner
easily -> easili
fairly -> fairli
playing -> play
played -> play

--- Snowball Stemmer ---
running -> run
runs -> run
ran -> ran
runner -> runner
easily -> easili
fairly -> fair
playing -> play
played -> play

--- Lancaster Stemmer ---
running -> run
runs -> run
ran -> ran
runner -> run
easily -> easy
fairly -> fair
playing -> play
played -> play


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dimtriospanagoulias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# LEMMATIZATION EXAMPLES
from nltk.stem import WordNetLemmatizer
# Corpora fetched before the lemmatizer is used.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# No POS hint is passed here, so every token is treated as a noun.
print("\n--- Lemmatization (default = noun) ---")
words = ["running", "runs", "ran", "runner", "better", "worse", "feet", "geese"]
for word in words:
    noun_lemma = lemmatizer.lemmatize(word)
    print(f"{word} -> {noun_lemma}")

# Lemmatization with POS tags: the same surface form can map to a
# different lemma depending on the part of speech supplied.
print("\n--- Lemmatization with POS tags ---")
pos_examples = [
    # (token, human-readable POS, WordNet POS code)
    ("running", "verb", "v"),
    ("running", "noun", "n"),
    ("better", "adjective", "a"),
    ("worse", "adjective", "a"),
    ("is", "verb", "v"),
]
for token, pos_label, pos_code in pos_examples:
    print(f"{token} ({pos_label}) -> {lemmatizer.lemmatize(token, pos=pos_code)}")





--- Lemmatization (default = noun) ---
running -> running
runs -> run
ran -> ran
runner -> runner
better -> better
worse -> worse
feet -> foot
geese -> goose

--- Lemmatization with POS tags ---
running (verb) -> run
running (noun) -> running
better (adjective) -> good
worse (adjective) -> bad
is (verb) -> be


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dimtriospanagoulias/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dimtriospanagoulias/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
# COMPARISON: Stemming vs Lemmatization
# One row per word: the Porter stem next to the WordNet lemma taken
# first as a noun and then as a verb.
print("\n--- Stemming vs Lemmatization Comparison ---")
test_words = ["studies", "studying", "better", "worse", "caring", "cared", "cats", "geese"]
for word in test_words:
    stemmed = porter.stem(word)
    # "n" is the lemmatizer's default part of speech, so passing it
    # explicitly gives the same result as omitting pos.
    lemmatized, lemmatized_verb = (
        lemmatizer.lemmatize(word, pos=pos_code) for pos_code in ("n", "v")
    )
    columns = [
        f"{word:12}",
        f"Stem: {stemmed:10}",
        f"Lemma(n): {lemmatized:10}",
        f"Lemma(v): {lemmatized_verb}",
    ]
    print(" | ".join(columns))
    




--- Stemming vs Lemmatization Comparison ---
studies      | Stem: studi      | Lemma(n): study      | Lemma(v): study
studying     | Stem: studi      | Lemma(n): studying   | Lemma(v): study
better       | Stem: better     | Lemma(n): better     | Lemma(v): better
worse        | Stem: wors       | Lemma(n): worse      | Lemma(v): worse
caring       | Stem: care       | Lemma(n): caring     | Lemma(v): care
cared        | Stem: care       | Lemma(n): cared      | Lemma(v): care
cats         | Stem: cat        | Lemma(n): cat        | Lemma(v): cat
geese        | Stem: gees       | Lemma(n): goose      | Lemma(v): geese
