In [None]:
import pandas as pd
from IPython.display import HTML
import string
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
import spacy

In [2]:
# This paragraph includes verbs in various tenses (e.g., "running", "hunted", "howling", "studying", "included", "documenting", "surviving"), nouns with plural/singular forms, and derived forms (e.g., "aggression", "aggressive", "classified", "classifying")—perfect for seeing the effects of stemming and lemmatization.

text = "The researchers were studying the behaviors of wolves that had been running, hunted, and howling throughout the forested regions. Interestingly, they noticed that the wolves’ activities varied depending on the season, with increased aggression observed during mating periods. The observation included analyzing journals written by those documenting the wolves’ movements and strategies for surviving in harsh environments. Understanding these interactions helps in predicting future behavioral patterns and classifying different subspecies accordingly."
print(text)


The researchers were studying the behaviors of wolves that had been running, hunted, and howling throughout the forested regions. Interestingly, they noticed that the wolves’ activities varied depending on the season, with increased aggression observed during mating periods. The observation included analyzing journals written by those documenting the wolves’ movements and strategies for surviving in harsh environments. Understanding these interactions helps in predicting future behavioral patterns and classifying different subspecies accordingly.


# Tokenization with NLTK

In [3]:

nltk_tokens_with_punct = word_tokenize(text)
nltk_tokens_wout_punct = [token for token in nltk_tokens_with_punct if token not in string.punctuation]


# Tokenization with spaCy

In [4]:

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
spacy_tokens_with_punct = [token.text for token in doc]
spacy_tokens_wout_punct = [token.text for token in doc if not token.is_punct and not token.is_space]


# Comparing the results

In [5]:

intersect_token = list(set(nltk_tokens_with_punct + spacy_tokens_with_punct))
count_nltk_tokens_with_punct = [nltk_tokens_with_punct.count(token) for token in intersect_token]
count_nltk_tokens_wout_punct = [nltk_tokens_wout_punct.count(token) for token in intersect_token]
count_spacy_tokens_with_punct = [spacy_tokens_with_punct.count(token) for token in intersect_token]
count_spacy_tokens_wout_punct = [spacy_tokens_wout_punct.count(token) for token in intersect_token]

some_df = pd.DataFrame({
    'NLTK token count<br>with punctuations': count_nltk_tokens_with_punct,
    'NLTK token count<br>w/out punctuations': count_nltk_tokens_wout_punct,
    'spaCy token count<br>with punctuations': count_spacy_tokens_with_punct,
    'spaCy token count<br>w/out punctuations': count_spacy_tokens_wout_punct,
})
some_df.index = intersect_token
# some_df = some_df.T

print(f"Number of NLTK tokens with punctuations: {len(nltk_tokens_with_punct)} tokens.")
print(f"Number of NLTK tokens with punctuations: {len(nltk_tokens_wout_punct)} tokens.")
print(f"Number of spaCy tokens with punctuations: {len(spacy_tokens_with_punct)} tokens.")
print(f"Number of spaCy tokens with punctuations: {len(spacy_tokens_wout_punct)} tokens.")


HTML(some_df.to_html(escape=False))

Number of NLTK tokens with punctuations: 81 tokens.
Number of NLTK tokens with punctuations: 73 tokens.
Number of spaCy tokens with punctuations: 81 tokens.
Number of spaCy tokens with punctuations: 71 tokens.


Unnamed: 0,NLTK token count with punctuations,NLTK token count w/out punctuations,spaCy token count with punctuations,spaCy token count w/out punctuations
that,2,2,2,2
.,4,0,4,0
throughout,1,1,1,1
hunted,1,1,1,1
these,1,1,1,1
noticed,1,1,1,1
movements,1,1,1,1
by,1,1,1,1
",",4,0,4,0
researchers,1,1,1,1


We can see that the number of wokens with punctuations using NLTK and spaCy are equal.

However, there is a difference when we compare the number of tokens without punctuation.

This is because NLTK doesn't consider the character '’' as a punctuation, while spaCy does.

In [6]:
# Sample text that includes various punctuation
punct_list = string.punctuation + " “ ” ‘ ’ — – … •"

# Get punctuations via spaCy tokenizer
spacy_doc = nlp(punct_list)
spacy_punctuations = {token.text for token in spacy_doc if token.is_punct}

# Get punctuations via NLTK tokenizer
nltk_tokens = word_tokenize(punct_list)
nltk_punctuations = {token for token in nltk_tokens if all(char in string.punctuation for char in token)}

# Display results
print("Punctuations according to spaCy:")
print(sorted(spacy_punctuations))

print("\nPunctuations according to NLTK:")
print(sorted(nltk_punctuations))


Punctuations according to spaCy:
['!', '"', '#', '%', '&', "'", '(', ')', '*', ',', '–', '—', '‘', '’', '“', '”', '•', '…']

Punctuations according to NLTK:
['!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-./', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^_', '`', '``', '{', '|', '}', '~']
