# Stemming and Lemmatization

#### 1. Given the list of pluralized words below, define your own simple word stemmer function or class,  limited to only simple rules and regex. No libraries! It should strip basic endings.

In [69]:
# Sample words for the stemming exercise. Despite the variable name, only
# "flies" is a plural; the others carry different suffixes.
plurals = [
    "flies",
    "denied",
    "itemization",
    "sensational",
    "reference",
    "colonizer",
]

# TODO: implement your own simple stemmer

def stemmer(wordArray, min_stem_length=2):
    """Strip basic suffixes from each word using a fixed, ordered rule list.

    The endings are tried in order and applied cumulatively, so one word can
    lose several suffixes in a single pass (e.g. "sensational" -> "sensation"
    -> "sensat").

    Args:
        wordArray: Iterable of words to stem.
        min_stem_length: Smallest stem a rule may leave behind. A suffix is
            only removed when at least this many characters remain, so short
            words such as "red" or "ed" are returned unchanged instead of
            collapsing to "r" or "".

    Returns:
        list: The stemmed words, in input order.
    """
    endings = ("es", "ed", "al", "ion", "er")
    stemmedWords = []
    for word in wordArray:
        for ending in endings:
            # Guard against stripping (almost) the whole word.
            if word.endswith(ending) and len(word) - len(ending) >= min_stem_length:
                word = word[: -len(ending)]
        stemmedWords.append(word)
    return stemmedWords

print(stemmer(plurals))

['fli', 'deni', 'itemizat', 'sensat', 'reference', 'coloniz']


#### 2. After your initial implementation, run it on the following words:

In [70]:
new_words = [
    "friendly",
    "puzzling",
    "helpful",
]
# TODO: run your stemmer on the new words
# None of these suffixes (-ly, -ing, -ful) is in the stemmer's ending list,
# so the words come back unchanged.

print(stemmer(new_words))

['friendly', 'puzzling', 'helpful']


#### 3. Realizing that fixing future words manually can be problematic, use a desired NLTK stemmer and run it on all the words:

In [71]:
import nltk

all_words = plurals + new_words

# TODO: use an nltk stemming implementation to stem `all_words`

from nltk.stem import PorterStemmer

# Bind the NLTK stemmer to its own name. Calling it `stemmer` would replace the
# hand-written `stemmer()` function defined earlier, so re-running the earlier
# cells that call `stemmer(...)` would fail with a TypeError.
porter_stemmer = PorterStemmer()
stemmedWords = [porter_stemmer.stem(word) for word in all_words]
print(stemmedWords)

['fli', 'deni', 'item', 'sensat', 'refer', 'colon', 'friendli', 'puzzl', 'help']


#### 4. There are likely a few words in the outputs above that would cause issues in real-world applications. Pick some examples, and show how they are solved with a lemmatizer. Use either spaCy or nltk.

Your answer here! Code below.

In [72]:
import spacy

# TODO: basic observations on which examples are problematic with stemming + implement lemmatization with spacy/nltk

# Load spaCy's small English pipeline; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")

# Each word is run through the pipeline on its own and the lemma of its first token is kept.
allWordsLemmatized = [nlp(word)[0].lemma_ for word in all_words]

print(allWordsLemmatized)


['fly', 'deny', 'itemization', 'sensational', 'reference', 'colonizer', 'friendly', 'puzzle', 'helpful']


# Stemming/Lemmatization - Practical Example
Using the news corpus (subset/category of the Brown corpus), perform common text normalization techniques such as stopword filtering and stemming/lemmatization. Compare the top 10 most common **words** before and after these normalization techniques.

In [73]:
# import nltk; nltk.download('brown')  # ensure we have the data
from nltk.corpus import brown
news = brown.words(categories='news')

# TODO: find the top 10 most common words
from collections import Counter

# Count the raw tokens as they are: no lowercasing or filtering is applied, so
# punctuation and case variants ("the" / "The") are counted as separate entries.
print(Counter(news).most_common(10))


[('the', 5580), (',', 5188), ('.', 4030), ('of', 2849), ('and', 2146), ('to', 2116), ('a', 1993), ('in', 1893), ('for', 943), ('The', 806)]


In [74]:
# nltk.download('stopwords')
from nltk.corpus import stopwords

# TODO: find the top 10 most common words after applying text normalization techniques

"""
# remove non-alphabetic characters
newsAlpha = [word for word in news if word.isalpha()]
newsLemmatized = [nlp(word)[0].lemma_ for word in newsAlpha]
print(Counter(newsLemmatized).most_common(10))
# [('the', 6386), ('of', 2861), ('be', 2840), ('and', 2186), ('to', 2144),
# ('a', 2130), ('in', 2020), ('for', 969), ('have', 861), ('that', 829)]
# Runtime: 3m 14s


news_text = ' '.join(news)
newsLemmatized = [token.lemma_.lower() for token in nlp(news_text) if token.is_alpha]
print(Counter(newsLemmatized).most_common(10))
# [('the', 6390), ('of', 2864), ('be', 2853), ('and', 2190), ('to', 2155),
# ('a', 2139), ('in', 2034), ('for', 972), ('have', 860), ('that', 843)]
# Runtime: 8.6s
"""

stop_words = set(stopwords.words('english'))

# Lemmatize the whole corpus in a single spaCy pass, keeping only lowercase
# lemmas of alphabetic tokens that are not stopwords.
news_text = ' '.join(news)
news_normalised = []
for token in nlp(news_text):
    if not token.is_alpha:
        continue
    lemma = token.lemma_.lower()
    if lemma not in stop_words:
        news_normalised.append(lemma)

print(Counter(news_normalised).most_common(10))

[('say', 464), ('year', 319), ('would', 249), ('new', 245), ('one', 233), ('two', 191), ('state', 189), ('make', 184), ('last', 180), ('president', 164)]


# TF-IDF
TF-IDF (term frequency-inverse document frequency) is a way to measure the importance of a word in a document.

$$
\text{tf-idf}(t, d, D) = \text{tf}(t, d) \times \text{idf}(t, D)
$$

Where:
- $t$ is the term (word)
- $d$ is the document
- $D$ is the corpus



#### 1. Implement TF-IDF using NLTK's FreqDist (no use of e.g. scikit-learn and other high-level libraries).

### Notes to self:
**Term Frequency (TF)**
How frequently a term occurs in a document.
$$
\text{tf}(t, d) = \frac{\text{instances of term } t \text{ in document } d}{\text{total number of terms in document } d}
$$

**Inverse Document Frequency (IDF)**
How important a term is.
$$
\text{idf}(t, D) = \log \left( \frac{\text{total number of documents in corpus } D}{\text{number of documents containing term } t} \right)
$$

In [75]:
from typing import List
from nltk import FreqDist
import numpy as np

##########################################################
# Feel free to change everything below.
# It is merely a guide to understand the inputs/outputs
##########################################################



############ TODO ############
def tf(document: List[str], term: str) -> float:
    """
    Calculate the term frequency (TF) of a given term in a document.

    Args:
        document (List[str]): The document in which to calculate the term frequency.
        term (str): The term for which to calculate the term frequency.

    Returns:
        float: The term frequency of the given term in the document, i.e. the
            number of occurrences of `term` divided by the total number of
            tokens in `document`.
    """
    # FreqDist.freq() performs the count / total division itself.
    return FreqDist(document).freq(term)
    

# Quick sanity check on the news corpus loaded earlier: preview the tokens,
# then print the relative frequency of "the" among all tokens.
print(news)
print(tf(news, "the"))

############ TODO ############
def idf(documents: List[List[str]], term: str) -> float:
    """
    Calculate the inverse document frequency (IDF) of a term in a collection of documents.

    Args:
        documents (List[List[str]]): A list of documents, where each document is represented as a list of strings.
        term (str): The term for which IDF is calculated.

    Returns:
        float: log(N / n_t), where N is the number of documents and n_t the
            number of documents containing `term`. Returns 0.0 when no
            document contains the term (including an empty collection).
    """
    # Total number of documents
    total_documents = len(documents)

    # A plain membership test per document is enough: converting every document
    # to a set on each call costs a full pass over the corpus and, unlike `in`
    # on a list, cannot stop at the first match.
    containing = sum(1 for doc in documents if term in doc)

    if containing == 0:
        # Avoid dividing by zero for unseen terms or an empty collection.
        return 0.0
    # Convert to a built-in float so both branches return the same type.
    return float(np.log(total_documents / containing))


############ TODO ############
def tf_idf(
    all_documents: List[List[str]],
    document: List[str],
    term: str,
) -> float:
    """
    Calculate the TF-IDF score of a term for one document within a collection.

    Args:
        all_documents (List[List[str]]): The collection used for the IDF part.
        document (List[str]): The document used for the TF part.
        term (str): The term to score.

    Returns:
        float: tf(document, term) multiplied by idf(all_documents, term).
    """
    term_frequency = tf(document, term)
    inverse_document_frequency = idf(all_documents, term)
    return term_frequency * inverse_document_frequency


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
0.05549257115579689


#### 2. With your TF-IDF function in place, calculate the TF-IDF for the following words in the first document of the news articles found in the Brown corpus: 

- *the*
- *nevertheless*
- *highway*
- *election*

Perform any preprocessing steps you deem necessary. Comment on your findings.

In [76]:
# One token list per news file; the first file is the document being scored.
fileids = brown.fileids(categories='news')
first_doc = list(brown.words(fileids[0]))
all_docs = [list(brown.words(fileid)) for fileid in fileids]

# TODO: preprocess and calculate tf-idf scores.
# NOTE(review): no preprocessing is applied below. Tokens keep their original
# case and punctuation, so the lookups are case-sensitive ("The" != "the").

terms = ["the", "nevertheless", "highway", "election"]

for term in terms:
    print(f"TF-IDF for '{term}': {tf_idf(all_docs, first_doc, term)}")

TF-IDF for 'the': 0.0
TF-IDF for 'nevertheless': 0.0
TF-IDF for 'highway': 0.003593546849130443
TF-IDF for 'election': 0.009251767873746217


**Comment**: "the" and "nevertheless" both yielded a value of zero: "the" occurs in every document, so its IDF is log(1) = 0, while "nevertheless" occurs in none of the documents (0 instances). In the first document, "election" seems to be a hotter topic than "highway".

#### 3. While TF-IDF is primarily used for information retrieval and text mining, reflect on how TF-IDF could be used in a language modeling context.

It could be used to help the model's understanding and generation of text by prioritizing words that are more relevant and distinctive to the topics being modeled. This could lead to better performance, especially in tasks where it's important to understand the nuances in the text.

#### 4. You were previously introduced to word representations. TF-IDF can be considered one. What are some differences between the TF-IDF output and one that is computed once from a vocabulary (e.g. one-hot encoding)?

TF-IDF can give two different words the same score and is therefore not a unique identifier. One-hot encoding says nothing about the word itself; it is purely an identifier.

# TF-IDF - Practical Example
You will again be looking at specific words for a document, but this time weighted by their TF-IDF scores. Ideally, the scoring should be able to retrieve representative words for this document in context of its document collection or category.

You will do the following:
- Select a category from the Reuters (news) corpus
- Perform preprocessing
- Calculate TF-IDF scores
- Find the top 5 words for a subset of documents in your collection (e.g. 5, 10, ..)
- Inspect whether these words make sense for a given document, and comment on your findings.

In [77]:
# import nltk; nltk.download("reuters")
from nltk.corpus import reuters

categories = reuters.categories()
print("Available categories:", categories)

coconutTexts = reuters.fileids(categories="coconut")
print("Number of coconut texts:", len(coconutTexts))

# Normalise every document in the category: lowercase lemmas of alphabetic
# tokens, with stopwords removed.
coconutTexts_normalised = []
for fileid in coconutTexts:
    textString = ' '.join(reuters.words(fileid))
    lemmas = (token.lemma_.lower() for token in nlp(textString) if token.is_alpha)
    coconutTexts_normalised.append([lemma for lemma in lemmas if lemma not in stop_words])

# Count the first document once and reuse the result for both the printout
# and the list of words to score.
first_doc_common = Counter(coconutTexts_normalised[0]).most_common(15)
print("Most common words:\n", first_doc_common)

top15words = [term for term, _ in first_doc_common]

for word in top15words:
    print(f"TF-IDF for '{word}': {tf_idf(coconutTexts_normalised, coconutTexts_normalised[0], word)}")


Available categories: ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']
Number of coconut texts: 6
Most common words:
 [('say', 24), ('cocon

**Comment:** Sadly, "coconut" got 0... but of course that is because all the articles are about coconut, so it is by definition not distinctive once we already know that the category is coconut.

# Part-of-speech tagging

#### 1. Briefly describe your understanding of POS tagging and its possible use-cases in context of text generation applications/language modeling.

It tags words with grammatical categories, making it easier to model the complete sentence. This helps in understanding the context of words and can disambiguate words that have several meanings.

#### 2. Train a UnigramTagger (NLTK) using the Brown corpus. 
Hint: the taggers in nltk require a list of sentences containing tagged words.

In [78]:
# TODO: train a unigram tagger on the brown corpus
from nltk import UnigramTagger
from nltk.corpus import brown

# Train on the tagged sentences of the 'news' category only.
# NOTE(review): the exercise asks for the Brown corpus; restricting training to
# one category limits the words the tagger has seen. Confirm this is intended.
brown_tagged_sents = brown.tagged_sents(categories='news')
tagger = UnigramTagger(brown_tagged_sents)

#### 3. Use this tagger to tag the text given below. Print out the POS tags for all variants of "justify"

In [79]:
text = """
Imagine a situation where you have to explain why you did something – that's when you justify your actions. So, let's say you made a decision; you, as the justifier, need to give good reasons (justifications) for your choice. You might use justifying words to make your point clear and reasonable. Justifying can be a bit like saying, "Here's why I did what I did." When you justify things, you're basically providing the why behind your actions. So, being a good justifier involves carefully explaining, giving reasons, and making sure others understand your choices
"""

# TODO: use your trained tagger

# Whitespace tokenisation: punctuation stays attached to the words (e.g. "actions.").
text_tokens = text.split()
tagged_text = tagger.tag(text_tokens)
# NOTE(review): only the first 10 tokens are printed; the task asks for the tags
# of the "justify" variants, none of which are among these tokens.
for word, tag in tagged_text[:10]:
    print(f"{word} ({tag})")

Imagine (None)
a (AT)
situation (NN)
where (WRB)
you (PPSS)
have (HV)
to (TO)
explain (VB)
why (WRB)
you (PPSS)


#### 4. Your results may be disappointing. Repeat the same task as above using both the default NLTK pos-tagger and with spaCy. Compare the results

In [80]:
# TODO: use the default NLTK tagger
# nltk.download('averaged_perceptron_tagger')

# Tag the same whitespace-split tokens with NLTK's default tagger; this rebinds
# `tagged_text`, replacing the unigram tagger's result.
tagged_text = nltk.pos_tag(text_tokens)
# Same first-10-token preview as for the unigram tagger, for a side-by-side comparison.
for word, tag in tagged_text[:10]:
    print(f"{word} ({tag})")

Imagine (VB)
a (DT)
situation (NN)
where (WRB)
you (PRP)
have (VBP)
to (TO)
explain (VB)
why (WRB)
you (PRP)


In [81]:
# TODO: use spacy to fetch pos tags from the document

# spaCy tokenises the raw string itself, so the text's leading newline shows up
# as the first token (tagged SPACE).
doc = nlp(text)
for token in doc[:10]:
    print(f"{token.text} ({token.pos_})")


 (SPACE)
Imagine (VERB)
a (DET)
situation (NOUN)
where (SCONJ)
you (PRON)
have (VERB)
to (PART)
explain (VERB)
why (SCONJ)


#### 5. Finally, explore more features of what the spaCy *document* includes related to topics covered in this lab.

In [82]:
# TODO

# List every attribute and method name exposed by the spaCy document object.
print(dir(doc))

# Print every token whose coarse-grained POS tag is NOUN.
for token in doc:
    if token.pos_ == "NOUN":
        print(f"{token.text} ({token.pos_})")

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_context', '_get_array_attrs', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'copy', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_dict', 'from_disk', 'from_docs', 'from_json', 'get_extension', 'get_lca_matrix', 'has_annotation', 'has_extension', 'has_unknown_spaces', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'noun_chunks', 'noun_chunks_iterator', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_ents', 'set_extension', 'similarity', 'spans', 'tens