## 03_01 Tokenization

Tokenization refers to converting a text string into individual tokens. Tokens may be words or punctations

In [7]:
import nltk
import os


# Read the base file into a raw text variable
base_file = open(os.getcwd() + "/TJ.txt", "rt")
raw_text = base_file.read()
base_file.close()

# Extract tokens
token_list = nltk.word_tokenize(raw_text)
print("Token List : ", token_list[:20])
print("\n Total Tokens : ", len(token_list))

Token List :  ['ÿþP\x00R\x00E\x00F\x00A\x00C\x00E\x00.\x00', '\x00', '\x00', '\x00', '\x00F\x00r\x00o\x00m\x00', '\x00t\x00h\x00e\x00', '\x00t\x00i\x00m\x00e\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00c\x00o\x00m\x00i\x00n\x00g\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00F\x00i\x00r\x00s\x00t\x00', '\x00P\x00a\x00t\x00r\x00i\x00a\x00r\x00c\x00h\x00', '\x00B\x00o\x00d\x00h\x00i\x00d\x00h\x00a\x00r\x00m\x00a\x00', '\x00w\x00h\x00o\x00', '\x00', '\x00t\x00r\x00a\x00n\x00s\x00m\x00i\x00t\x00t\x00e\x00d\x00', '\x00t\x00h\x00e\x00', '\x00\x18']

 Total Tokens :  47777


## 03_02 Cleansing Text

We will see examples of removing punctuation and converting to lower case

#### Remove Punctuation

In [2]:
# Use the Punkt library to extract tokens
token_list2 = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct, token_list)
)
print("Token List after removing punctuation : ", token_list2[:20])
print("\nTotal tokens after removing punctuation : ", len(token_list2))

Token List after removing punctuation :  ['ÿþP\x00R\x00E\x00F\x00A\x00C\x00E\x00.\x00', '\x00F\x00r\x00o\x00m\x00', '\x00t\x00h\x00e\x00', '\x00t\x00i\x00m\x00e\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00c\x00o\x00m\x00i\x00n\x00g\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00F\x00i\x00r\x00s\x00t\x00', '\x00P\x00a\x00t\x00r\x00i\x00a\x00r\x00c\x00h\x00', '\x00B\x00o\x00d\x00h\x00i\x00d\x00h\x00a\x00r\x00m\x00a\x00', '\x00w\x00h\x00o\x00', '\x00t\x00r\x00a\x00n\x00s\x00m\x00i\x00t\x00t\x00e\x00d\x00', '\x00t\x00h\x00e\x00', 'W\x00e\x00s\x00t\x00e\x00r\x00n\x00', "\x00M\x00e\x00s\x00s\x00a\x00g\x00e\x00'\x00", '\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00', '\x00f\x00r\x00o\x00m\x00', '\x00I\x00n\x00d\x00i\x00a\x00']

Total tokens after removing punctuation :  36073


#### Convert to Lower Case

In [3]:
token_list3 = [word.lower() for word in token_list2]
print("Token list after converting to lower case : ", token_list3[:20])
print("\nTotal tokens after converting to lower case : ", len(token_list3))

Token list after converting to lower case :  ['ÿþp\x00r\x00e\x00f\x00a\x00c\x00e\x00.\x00', '\x00f\x00r\x00o\x00m\x00', '\x00t\x00h\x00e\x00', '\x00t\x00i\x00m\x00e\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00c\x00o\x00m\x00i\x00n\x00g\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00f\x00i\x00r\x00s\x00t\x00', '\x00p\x00a\x00t\x00r\x00i\x00a\x00r\x00c\x00h\x00', '\x00b\x00o\x00d\x00h\x00i\x00d\x00h\x00a\x00r\x00m\x00a\x00', '\x00w\x00h\x00o\x00', '\x00t\x00r\x00a\x00n\x00s\x00m\x00i\x00t\x00t\x00e\x00d\x00', '\x00t\x00h\x00e\x00', 'w\x00e\x00s\x00t\x00e\x00r\x00n\x00', "\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00'\x00", '\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00', '\x00f\x00r\x00o\x00m\x00', '\x00i\x00n\x00d\x00i\x00a\x00']

Total tokens after converting to lower case :  36073


## 03_03 Stop word Removal

Removing stop words by using a standard stop word list available in NLTK for English

In [4]:
# Download the standard stopword list
nltk.download("stopwords")
from nltk.corpus import stopwords

# Remove stopwords
token_list4 = list(
    filter(lambda token: token not in stopwords.words("english"), token_list3)
)
print("Token list after removing stop words : ", token_list4[:20])
print("\nTotal tokens after removing stop words : ", len(token_list4))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hetia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Token list after removing stop words :  ['ÿþp\x00r\x00e\x00f\x00a\x00c\x00e\x00.\x00', '\x00f\x00r\x00o\x00m\x00', '\x00t\x00h\x00e\x00', '\x00t\x00i\x00m\x00e\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00c\x00o\x00m\x00i\x00n\x00g\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00f\x00i\x00r\x00s\x00t\x00', '\x00p\x00a\x00t\x00r\x00i\x00a\x00r\x00c\x00h\x00', '\x00b\x00o\x00d\x00h\x00i\x00d\x00h\x00a\x00r\x00m\x00a\x00', '\x00w\x00h\x00o\x00', '\x00t\x00r\x00a\x00n\x00s\x00m\x00i\x00t\x00t\x00e\x00d\x00', '\x00t\x00h\x00e\x00', 'w\x00e\x00s\x00t\x00e\x00r\x00n\x00', "\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00'\x00", '\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00', '\x00f\x00r\x00o\x00m\x00', '\x00i\x00n\x00d\x00i\x00a\x00']

Total tokens after removing stop words :  36073


## 03_04 Stemming

In [14]:
# Use the PorterStemmer library for stemming.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stem data
token_list5 = [stemmer.stem(word) for word in token_list4]
print("Token list after stemming : ", token_list5[:20])
print("\nTotal tokens after Stemming : ", len(token_list5))

Token list after stemming :  ['order', 'construct', 'data', 'pipelin', 'network', 'stream', 'process', 'store', 'data', 'data', 'engin', 'data-sci', 'devop', 'specialist', 'must', 'understand', 'combin', 'multipl', 'big', 'data']

Total tokens after Stemming :  62


## 03_05 Lemmatization

In [5]:
# Use the wordnet library to map words to their lemmatized form
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
token_list6 = [lemmatizer.lemmatize(word) for word in token_list4]
print("Token list after Lemmatization : ", token_list6[:20])
print("\nTotal tokens after Lemmatization : ", len(token_list6))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hetia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Token list after Lemmatization :  ['ÿþp\x00r\x00e\x00f\x00a\x00c\x00e\x00.\x00', '\x00f\x00r\x00o\x00m\x00', '\x00t\x00h\x00e\x00', '\x00t\x00i\x00m\x00e\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00c\x00o\x00m\x00i\x00n\x00g\x00', '\x00o\x00f\x00', '\x00t\x00h\x00e\x00', '\x00f\x00i\x00r\x00s\x00t\x00', '\x00p\x00a\x00t\x00r\x00i\x00a\x00r\x00c\x00h\x00', '\x00b\x00o\x00d\x00h\x00i\x00d\x00h\x00a\x00r\x00m\x00a\x00', '\x00w\x00h\x00o\x00', '\x00t\x00r\x00a\x00n\x00s\x00m\x00i\x00t\x00t\x00e\x00d\x00', '\x00t\x00h\x00e\x00', 'w\x00e\x00s\x00t\x00e\x00r\x00n\x00', "\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00'\x00", '\x00m\x00e\x00s\x00s\x00a\x00g\x00e\x00', '\x00f\x00r\x00o\x00m\x00', '\x00i\x00n\x00d\x00i\x00a\x00']

Total tokens after Lemmatization :  36073


#### Comparison of tokens between raw, stemming and lemmatization

In [6]:
# Check for token technlogies
print(
    "Raw : ",
    token_list4[20],
    " , Stemmed : ",
    token_list5[20],
    " , Lemmatized : ",
    token_list6[20],
)

NameError: name 'token_list5' is not defined