# Tokenization

In [9]:
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Punctuation is not separated from the word it touches, so a trailing
    period stays attached to the last token. An empty or all-whitespace
    string yields an empty list.
    """
    return text.split()

# Demo: tokenize a sample sentence and show the resulting list.
text = "This is an example sentence for tokenization."
tokens = tokenize_text(text)
print(tokens)


['This', 'is', 'an', 'example', 'sentence', 'for', 'tokenization.']


# Bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short sentences used to build the bag-of-words matrix
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and count each term per document
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Expand the result into a plain array so the counts print as a readable grid
dense_array = X.toarray()

# Column labels for the matrix: the unique terms found in the corpus
vocab = vectorizer.get_feature_names_out()

# Show the count matrix, then the vocabulary, each under its own heading
print("BoW representation:", dense_array, sep="\n")
print("Vocabulary:", vocab, sep="\n")

BoW representation:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


# Stemmer

In [3]:
from nltk.stem import SnowballStemmer

# NLTK's Snowball stemmer, configured for English
stemmer = SnowballStemmer('english')

# Three inflections/derivations of the same root, used as the demo input
words = ['connection', 'connectivity', 'connected']

# Apply the stemmer to every word, keeping the input order
stemmed_words = list(map(stemmer.stem, words))

# Show the resulting stems
print(stemmed_words)


['connect', 'connect', 'connect']


# Stop words removal

In [5]:
import nltk
from nltk.corpus import stopwords
# Fetch the resources this cell needs: the 'punkt' tokenizer models and the
# stop-word lists.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Sample text
text = "This is an example sentence with some stop words."

# Tokenize the text (punctuation becomes its own token)
words = nltk.word_tokenize(text)

# Load the English stop-word list ONCE and keep it as a set. Calling
# stopwords.words('english') inside the comprehension would re-evaluate it
# for every token and do a linear list search each time.
english_stopwords = set(stopwords.words('english'))

# Remove stop words; comparison is case-insensitive via .lower(), while the
# original casing of the kept tokens is preserved
filtered_words = [word for word in words if word.lower() not in english_stopwords]

# Join the filtered words back into a sentence
filtered_text = ' '.join(filtered_words)

# Print the filtered text
print(filtered_text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


example sentence stop words .


# Named Entity Recognition

In [10]:
# Make sure every NLTK resource used by this cell is available locally
import nltk
nltk.download('punkt')  # models used by word_tokenize
nltk.download('maxent_ne_chunker')  # model used by ne_chunk
nltk.download('words')  # word list corpus
nltk.download('averaged_perceptron_tagger')  # model used by pos_tag

from nltk import word_tokenize, pos_tag, ne_chunk

# Sentence to analyse
text = "Apple Inc. is a leading tech company based in Cupertino, California."

# Pipeline: raw text -> tokens -> (token, POS tag) pairs -> chunked result
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
named_entities = ne_chunk(tagged)

# Walk the top-level items of the chunked result and print one line per item
for chunk in named_entities:
    if not isinstance(chunk, tuple):
        # Non-tuple items are labelled groups of (word, tag) pairs:
        # print the joined words followed by the group's entity label
        phrase = " ".join(word for word, _ in chunk)
        print(phrase, chunk.label())
        continue
    # Plain (word, tag) tuple outside any entity: print the word and its POS tag
    print(chunk[0], chunk[1])


Apple PERSON
Inc. ORGANIZATION
is VBZ
a DT
leading VBG
tech NN
company NN
based VBN
in IN
Cupertino GPE
, ,
California GPE
. .


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/victoroshimua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
