# NLTK Data Preprocessing Tools

In [68]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

Sentence tokenization

In [69]:
# sentence tokenization
from nltk.tokenize import sent_tokenize
text = 'This is a random text we are using to demonstrate the implementation of the tokenization. Please follow the course'
print(sent_tokenize(text))

['This is a random text we are using to demonstrate the implementation of the tokenization.', 'Please follow the course']


In [70]:
# word tokenization example
from nltk.tokenize import word_tokenize
print(word_tokenize(text))

['This', 'is', 'a', 'random', 'text', 'we', 'are', 'using', 'to', 'demonstrate', 'the', 'implementation', 'of', 'the', 'tokenization', '.', 'Please', 'follow', 'the', 'course']


Stop words

In [79]:
from nltk.corpus import stopwords
import string

# Load the NLTK stop words
# nltk.download("stopwords")            # used only once
stop_words = set(stopwords.words("english"))

# Read the content from the file "obama.txt"
with open("obama.txt", "r") as file:
    content = file.read()

# Tokenize the content into words
words = nltk.word_tokenize(content)

# Remove punctuation and convert to lowercase
filtered_words = [word.lower() for word in words if word.isalpha()]

# Remove stop words
filtered_words = [word for word in filtered_words if word not in stop_words]


for word in filtered_words:
    print(word)


barack
hussein
obama
ii
born
august
american
politician
served
president
united
states
member
democratic
party
first
president
history
obama
previously
served
senator
representing
illinois
illinois
state
senator
civil
rights
lawyer
university
lecturer
obama
born
honolulu
hawaii
graduated
columbia
university
political
science
later
worked
community
organizer
chicago
obama
enrolled
harvard
law
school
first
black
president
harvard
law
review


Stemming

In [72]:
# stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
set(stopwords.words('English'))
ps = PorterStemmer()
text='Let me excite you with the excitement I got when the trainer was training us'
words = word_tokenize(text)
for w in words:
    print(ps.stem(w))

let
me
excit
you
with
the
excit
i
got
when
the
trainer
wa
train
us


Lemmatization

In [73]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words] 
print("The lemmatized words: ", lemmatized_words) #prints the lemmatized words

The lemmatized words:  ['Let', 'me', 'excite', 'you', 'with', 'the', 'excitement', 'I', 'got', 'when', 'the', 'trainer', 'wa', 'training', 'u']


In [74]:
# POS Tagging
from nltk.tag import DefaultTagger
tagged = nltk.pos_tag(lemmatized_words)
print(tagged)

[('Let', 'VB'), ('me', 'PRP'), ('excite', 'VB'), ('you', 'PRP'), ('with', 'IN'), ('the', 'DT'), ('excitement', 'NN'), ('I', 'PRP'), ('got', 'VBD'), ('when', 'WRB'), ('the', 'DT'), ('trainer', 'NN'), ('wa', 'VBD'), ('training', 'NN'), ('u', 'NN')]


Named Entity Recognition

In [75]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

# Read the text from 'obama.txt' file
with open('obama.txt', 'r') as file:
    text = file.read()

# Tokenize the text
tokenized_text = word_tokenize(text)

# Part-of-speech tagging
tagged_text = pos_tag(tokenized_text)

# Extract named entities
def extract_named_entities(tagged_text):
    named_entities = []
    for chunk in ne_chunk(tagged_text):
        if isinstance(chunk, Tree):
            entity_name = ' '.join([token for token, pos in chunk.leaves()])
            entity_type = chunk.label()
            named_entities.append((entity_name, entity_type))
    return named_entities

# Get named entities from the text
named_entities = extract_named_entities(tagged_text)

# Print the named entities
print('NAMED ENTITY\t\tTYPE')
print('='*35)
for entity_name, entity_type in named_entities:
    print(f"{entity_name:{20}} ({entity_type})")


NAMED ENTITY		TYPE
Barack               (PERSON)
Hussein Obama II     (PERSON)
American             (GPE)
United States        (GPE)
Democratic Party     (ORGANIZATION)
U.S.                 (GPE)
Obama                (PERSON)
U.S.                 (GPE)
Illinois             (PERSON)
Obama                (PERSON)
Honolulu             (GPE)
Hawaii               (GPE)
Columbia University  (ORGANIZATION)
Chicago              (GPE)
Obama                (PERSON)
Harvard Law School   (ORGANIZATION)
Harvard Law Review   (ORGANIZATION)
