## Tokenization in NLP with Python
*Submitted by:*  
**Christian Elijah Darvin**  
BCS32  
College of Information and Computer Studies - De La Salle University Dasmariñas

In [42]:
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used by sent_tokenize / word_tokenize and by the
# stopword filter. NLTK 3.9 and later load the Punkt sentence tokenizer from
# the "punkt_tab" resource instead of the older "punkt" one, so both are
# requested to keep the notebook runnable on a fresh environment regardless
# of the installed NLTK version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# spaCy pipeline shared by every spaCy-based helper in this notebook.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Part 1: Tokenizing Customer Feedback

In [43]:
# Sample review used as the input for every part of this notebook.
# Written as one literal per sentence; adjacent literals are concatenated.
customer_feedback = (
    "Great product, but the software crashed twice in the last week. "
    "The customer support team was very helpful, though. "
    "Could improve the battery life."
)


def nltk_tokenize(text):
    """Tokenize ``text`` with NLTK at sentence level and at word level.

    Both tokenizers are applied to the full input string, so the word list
    is flat rather than grouped by sentence.

    Args:
        text: The raw string to tokenize.

    Returns:
        A ``(sentences, words)`` tuple holding the result of
        ``sent_tokenize(text)`` followed by the result of
        ``word_tokenize(text)``.
    """
    return sent_tokenize(text), word_tokenize(text)


def spacy_tokenize(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list containing the ``.text`` of every token in the parsed
        document, in document order.
    """
    token_texts = []
    for token in nlp(text):
        token_texts.append(token.text)
    return token_texts


# Run the NLTK tokenizer on the feedback text and show both granularities.
nltk_sentences, nltk_words = nltk_tokenize(customer_feedback)
print(f"NLTK Sentences: \n{nltk_sentences}\n")
print(f"NLTK Words: \n{nltk_words}")

# Blank lines separating the NLTK results from the spaCy results.
print("\n")

# Same text through spaCy, for comparison with the NLTK word tokens.
spacy_words = spacy_tokenize(customer_feedback)
print(f"Spacy Words: \n{spacy_words}")

NLTK Sentences: 
['Great product, but the software crashed twice in the last week.', 'The customer support team was very helpful, though.', 'Could improve the battery life.']

NLTK Words: 
['Great', 'product', ',', 'but', 'the', 'software', 'crashed', 'twice', 'in', 'the', 'last', 'week', '.', 'The', 'customer', 'support', 'team', 'was', 'very', 'helpful', ',', 'though', '.', 'Could', 'improve', 'the', 'battery', 'life', '.']


Spacy Words: 
['Great', 'product', ',', 'but', 'the', 'software', 'crashed', 'twice', 'in', 'the', 'last', 'week', '.', 'The', 'customer', 'support', 'team', 'was', 'very', 'helpful', ',', 'though', '.', 'Could', 'improve', 'the', 'battery', 'life', '.']


### Part 2: Removing Stopwords 

In [44]:
def remove_stopwords_nltk(text):
    """Tokenize ``text`` with NLTK and drop English stopwords.

    The comparison is case-insensitive (each token is lowercased before the
    lookup), but surviving tokens keep their original casing. Only tokens
    found in NLTK's English stopword list are removed.

    Args:
        text: The raw string to filter.

    Returns:
        A list of the word tokens of ``text``, in their original order,
        excluding every token whose lowercase form is an English stopword.
    """
    # Only word tokens are needed here, so word_tokenize is called directly
    # rather than going through nltk_tokenize(), which would also run a
    # sentence-segmentation pass whose result is thrown away.
    stop_words = set(stopwords.words("english"))
    return [word for word in word_tokenize(text) if word.lower() not in stop_words]


def remove_stopwords_spacy(text):
    """Tokenize ``text`` with spaCy and drop tokens flagged as stopwords.

    Args:
        text: The raw string to filter.

    Returns:
        A list with the ``.text`` of every token, in document order, whose
        ``is_stop`` attribute is false.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return kept


# Apply each stopword filter to the same feedback text so the two results
# can be compared side by side.
nltk_filtered = remove_stopwords_nltk(customer_feedback)
print(f"NLTK Remove Stopwords: \n{nltk_filtered}\n")

spacy_filtered = remove_stopwords_spacy(customer_feedback)
print(f"Spacy Remove Stopwords: \n{spacy_filtered}")

NLTK Remove Stopwords: 
['Great', 'product', ',', 'software', 'crashed', 'twice', 'last', 'week', '.', 'customer', 'support', 'team', 'helpful', ',', 'though', '.', 'Could', 'improve', 'battery', 'life', '.']

Spacy Remove Stopwords: 
['Great', 'product', ',', 'software', 'crashed', 'twice', 'week', '.', 'customer', 'support', 'team', 'helpful', ',', '.', 'improve', 'battery', 'life', '.']


### Part 3: Extracting Named Entities 

In [45]:
def extract_named_entities(text):
    """Collect the named entities spaCy finds in ``text``.

    Args:
        text: The raw string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one for each item
        in the parsed document's ``ents``, in the order spaCy reports them.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities


# Keep the result under a name, then leave it as the cell's final expression
# so the notebook renders it as the cell output.
named_entities = extract_named_entities(customer_feedback)
named_entities

[('the last week', 'DATE')]