In [None]:
import json
import codecs
import copy
import re
import textwrap
import os
import pandas as pd
from IPython.display import HTML

import string
from collections import Counter


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words as nltk_words, wordnet
from nltk.corpus import words
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from unidecode import unidecode

import spacy
# !python -m spacy download en_core_web_sm





# Download required resources
# NLTK data packages fetched up front. Both 'punkt'/'punkt_tab' and
# 'averaged_perceptron_tagger'/'averaged_perceptron_tagger_eng' are requested,
# presumably so the notebook works across NLTK versions that use either name.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'words',
    'omw-1.4',
)

# Same packages, same order as before; looping (instead of ending the cell on a
# bare call) also avoids a stray `True` being rendered as the cell output.
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)


## Tokenization with NLTK and spaCy

In [None]:
# Sample paragraph. It mixes verbs in several tenses (e.g. "running", "hunted",
# "howling", "studying", "included", "documenting", "surviving"), singular and
# plural nouns, and derived forms (e.g. "aggression", "classifying"), which makes
# the effects of stemming and lemmatization easy to see.

some_text = "The researchers were studying the behaviors of wolves that had been running, hunted, and howling throughout the forested regions. Interestingly, they noticed that the wolves’ activities varied depending on the season, with increased aggression observed during mating periods. The observation included analyzing journals written by those documenting the wolves’ movements and strategies for surviving in harsh environments. Understanding these interactions helps in predicting future behavioral patterns and classifying different subspecies accordingly."


# Tokenization using NLTK
nltk_tokens_with_punct = word_tokenize(some_text)
# Keep only tokens that contain at least one letter or digit.
# The previous filter, `token not in string.punctuation`, was a substring test
# against an ASCII-only string, so it let through typographic punctuation such
# as the "’" used in `some_text` and multi-character punctuation tokens.
nltk_tokens_wout_punct = [
    token
    for token in nltk_tokens_with_punct
    if any(char.isalnum() for char in token)
]

# Tokenization using spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(some_text)
spacy_tokens_with_punct = [token.text for token in doc]
# spaCy flags punctuation and whitespace tokens itself, so rely on those flags.
spacy_tokens_wout_punct = [
    token.text
    for token in doc
    if not (token.is_punct or token.is_space)
]

## Lemmatization with NLTK and spaCy

In [None]:

# Helper that converts NLTK POS tags to WordNet POS constants
def get_wordnet_pos(tag):
    """
    Map a Penn Treebank POS tag (as produced by nltk.pos_tag) to the WordNet
    POS constant that WordNetLemmatizer.lemmatize() expects.

    WordNet only recognizes four classes: wordnet.NOUN, wordnet.VERB,
    wordnet.ADJ and wordnet.ADV. The tag's first letter selects the class:
    'J' -> ADJ, 'V' -> VERB, 'N' -> NOUN, 'R' -> ADV.

    Why this matters: when no POS is given, WordNetLemmatizer.lemmatize()
    treats the word as a noun, so verbs are left untouched:
        lemmatizer = WordNetLemmatizer()
        lemmatizer.lemmatize("running", pos="v")  # Output: 'run'
        lemmatizer.lemmatize("running")           # Output: 'running'

    Parameters
    ----------
    tag : str or None
        Penn Treebank tag such as 'NN', 'VBD', 'JJ' or 'RB'.

    Returns
    -------
    One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV.
    Any other tag, an empty string, or None falls back to wordnet.NOUN,
    which matches the lemmatizer's own default.
    """

    # A missing tag (e.g. the result of a failed dict lookup) used to raise
    # AttributeError on `.startswith`; treat it like any unrecognized tag.
    if not tag:
        return wordnet.NOUN

    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)




# NLTK: lemmatizer plus POS tags for the punctuation-free NLTK tokens
lemmatizer = WordNetLemmatizer()
nltk_pos_tags = pos_tag(nltk_tokens_wout_punct)
# Token -> tag lookup, built once instead of once per table row.
# A token that occurs several times keeps the tag of its last occurrence.
nltk_tag_by_token = dict(nltk_pos_tags)

# spaCy: token text -> lemma (a repeated token keeps the lemma of its last occurrence)
spacy_token_dict = {
    token.text: token.lemma_
    for token in doc
    if not token.is_punct and not token.is_space
}

# Only tokens produced by both tokenizers are compared; sorting keeps the table order stable.
common_tokens = sorted(nltk_tag_by_token.keys() & spacy_token_dict.keys())

df = pd.DataFrame({
    "Token": common_tokens,
    # "POS Tag": [nltk_tag_by_token[token] for token in common_tokens],
    # Without a POS argument the WordNet lemmatizer treats every token as a noun.
    "NLTK Lemma (WordNet)": [lemmatizer.lemmatize(token) for token in common_tokens],
    # Same lemmatizer, but told the word class derived from the NLTK tag.
    "NLTK Lemma (WordNet, POS-aware)": [
        lemmatizer.lemmatize(token, get_wordnet_pos(nltk_tag_by_token[token]))
        for token in common_tokens
    ],
    "spaCy Lemma": [spacy_token_dict[token] for token in common_tokens]
})

HTML(df.to_html(index=False))
