In [1]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer

text = "Hello world! NLP's tokenization is essential for text analysis."

# word_tokenize relies on NLTK's Punkt tokenizer data, which is not shipped
# with the nltk package itself. Fetch it only when it is missing so the
# notebook runs on a fresh environment. Recent NLTK releases look for
# "punkt_tab" while older ones look for "punkt", so both are checked.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        # quiet=True keeps download progress out of the cell output.
        nltk.download(resource, quiet=True)

# Example 1: Basic word tokenization
tokens = word_tokenize(text)
print(tokens)

['Hello', 'world', '!', 'NLP', "'s", 'tokenization', 'is', 'essential', 'for', 'text', 'analysis', '.']


In [2]:
# Example 2: Contractions
# word_tokenize splits "Don't" into the two tokens 'Do' and "n't".
text2 = "Don't hesitate to ask questions."
contraction_tokens = word_tokenize(text2)
print(contraction_tokens)

['Do', "n't", 'hesitate', 'to', 'ask', 'questions', '.']


In [3]:
# Example 3: RegexpTokenizer with the pattern \w+ keeps only runs of word
# characters, so punctuation is dropped and "NLP's" yields 'NLP' and 's'.
tokenizer = RegexpTokenizer(r'\w+')
regexp_tokens = tokenizer.tokenize(text)
print(regexp_tokens)

['Hello', 'world', 'NLP', 's', 'tokenization', 'is', 'essential', 'for', 'text', 'analysis']


In [4]:
# Example 4: Tokenizing the same sentence with spaCy
import spacy

# Load the en_core_web_sm pipeline and run it over the sample text; each
# element of the resulting Doc is a token whose surface form is `.text`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
spacy_tokens = [tok.text for tok in doc]
print(spacy_tokens)

['Hello', 'world', '!', 'NLP', "'s", 'tokenization', 'is', 'essential', 'for', 'text', 'analysis', '.']


In [5]:
# Example 5: Lowercase every token produced in Example 1
tokens_lower = list(map(str.lower, tokens))
print(tokens_lower)

['hello', 'world', '!', 'nlp', "'s", 'tokenization', 'is', 'essential', 'for', 'text', 'analysis', '.']


In [6]:
# Example 6: Hyphenated words stay together as a single token
text3 = "State-of-the-art NLP techniques are evolving."
hyphen_tokens = word_tokenize(text3)
print(hyphen_tokens)


['State-of-the-art', 'NLP', 'techniques', 'are', 'evolving', '.']


In [7]:
# Example 8: Removing punctuation tokens
# str.isalpha keeps only purely alphabetic tokens, so besides '!' and '.' it
# also discards the "'s" token from the Example 1 tokens.
tokens_no_punc = list(filter(str.isalpha, tokens))
print(tokens_no_punc)

['Hello', 'world', 'NLP', 'tokenization', 'is', 'essential', 'for', 'text', 'analysis']


In [8]:
# Example 9: Tokenizing non-English text (German), selecting the language
# explicitly via the `language` argument
text_de = "Hallo Welt! Wie geht's?"
german_tokens = word_tokenize(text_de, language='german')
print(german_tokens)

['Hallo', 'Welt', '!', 'Wie', 'geht', "'s", '?']


In [9]:
# Example 10: Emoji and special characters (basic)
# The emoji comes out as its own token and '#' is split from 'AI'.
text5 = "I love NLP 😊 #AI"
special_tokens = word_tokenize(text5)
print(special_tokens)

['I', 'love', 'NLP', '😊', '#', 'AI']
