*Eduardus Tjitrahardja | @edutjie | 2022*

# NLP Workflow & Text Wrangling

## Importing Libraries and Installing Dependencies

In [134]:
import nltk, spacy, requests, re, unicodedata, contractions, textsearch
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [108]:
# Fetch the NLTK data packages used later in this notebook (tokenization,
# WordNet lemmatization, stop-word removal and POS tagging). Packages that are
# already installed locally are simply reported as up-to-date.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edutjie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edutjie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edutjie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\edutjie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\edutjie\AppData\Roaming\nltk_data...


True

## Case Conversion

In [2]:
text = "The quick brown fox jumps over the lazy dog."
text

'The quick brown fox jumps over the lazy dog.'

In [3]:
text.lower()

'the quick brown fox jumps over the lazy dog.'

In [4]:
text.upper()

'THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG.'

In [5]:
text.title()

'The Quick Brown Fox Jumps Over The Lazy Dog.'

## Tokenization

In [6]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

### Using nltk

In [18]:
# returns list of sentences
nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [9]:
# returns list of words
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


### Using spacy

In [15]:
nlp = spacy.load('en_core_web_sm')

text_spacy = nlp(sample_text)

In [16]:
[sentence.text for sentence in text_spacy.sents]

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [17]:
print([word.text for word in text_spacy])

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'", 'Summit', "'", ',', 'beating', 'the', 'previous', 'record', '-', 'holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


## Removing HTML tags and noise

In [23]:
# Download the Project Gutenberg HTML page used as the noisy-text example.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an error
# page be treated as the document in the following cells.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
data

<Response [200]>

In [54]:
# Take the response body as text and preview a hand-picked character window.
# The offsets 6075:7195 were chosen for this particular download of the page;
# they will need adjusting if the served HTML changes.
content = data.text
print(content[6075:7195])

<p id="id00011" style="margin-top: 2em">*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***</p>

<p id="id00012" style="margin-top: 4em">This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.</p>

<h1 id="id00013" style="margin-top: 5em">Book 01        Genesis</h1>

<p id="id00014">01:001:001 In the beginning God created the heaven and the earth.</p>

<p id="id00015" style="margin-left: 0%; margin-right: 0%">01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.</p>

<p id="id00016">01:001:003 And God said, Let there be light: and there was light.</p>

<p id="id00017">01:001:004 And God saw the light, that it was good: and God divided the<br>

           light from the darkness.<br>
</p>

<p id="id00018">01:001:005 And God called the light Day, and the darkne

In [79]:
def strip_html_tags(text):
    """Strip HTML markup from ``text`` and collapse runs of line breaks.

    Parameters
    ----------
    text : str
        Raw HTML document or fragment.

    Returns
    -------
    str
        The text content extracted by BeautifulSoup, with every run of
        carriage-return / line-feed characters replaced by a single newline.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # earlier pattern [\r|\n|\r\n]+ also rewrote every `|` in the document as a
    # newline. Only CR and LF belong in the class.
    stripped_text = re.sub(r'[\r\n]+', '\n', soup.get_text())
    return stripped_text

# Strip the markup from the downloaded page and preview a hand-picked window of
# the result. The offsets 1253:2046 are specific to this download and to the
# output of strip_html_tags; adjust them if either changes.
cleaned_content = strip_html_tags(content)
print(cleaned_content[1253:2046])

*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***
This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.
Book 01        Genesis
01:001:001 In the beginning God created the heaven and the earth.
01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.


## Removing Accented Characters

In [80]:
def remove_accented_char(text):
    """Return ``text`` with accented characters reduced to their ASCII base.

    The string is NFKD-normalized, encoded to ASCII while dropping every code
    point that cannot be encoded, and then decoded back into a ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [81]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [82]:
remove_accented_char(s)

'Some Accented text'

## Removing Special Characters, Numbers and Symbols

In [85]:
def remove_special_char(text, remove_digit=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Input string.
    remove_digit : bool, default False
        When true, digits are deleted as well.

    Returns
    -------
    str
        ``text`` with the unwanted characters removed. Surrounding whitespace
        is left as-is, so consecutive spaces may remain.
    """
    if remove_digit:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.compile(unwanted).sub('', text)

In [86]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [87]:
remove_special_char(s)

'Well this was fun See you at 730 What do you think 9318 '

In [88]:
remove_special_char(s, remove_digit=True)

'Well this was fun See you at  What do you think  '

## Expanding Contractions

In [91]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [95]:
list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [96]:
contractions.fix(s)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

## Stemming
- Stemming ignores a word's meaning; it simply chops off suffixes to reach a base form.

In [98]:
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [99]:
ps.stem('lying')

'lie'

In [101]:
ps.stem('strange')

'strang'

Notice that `strange` is stemmed to `strang`, which is not a real word.

## Lemmatization

In [104]:
wnl = WordNetLemmatizer()
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [120]:
# lemmatize nouns
print(wnl.lemmatize('cars', pos='n'))
print(wnl.lemmatize('boxes', pos='n'))

car
box


In [113]:
# lemmatize verbs
print(wnl.lemmatize('running', pos='v'))
print(wnl.lemmatize('ate', pos='v'))

run
eat


In [114]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', pos='a'))
print(wnl.lemmatize('fancier', pos='a'))

sad
fancy


In [116]:
# ineffective lemmatization
print(wnl.lemmatize('ate', pos='n'))
print(wnl.lemmatize('fancier', pos='v'))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


## Tokenize

In [121]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [123]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [132]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

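A few tokens are left unlemmatized (e.g. `are`, `jumping`, `sleeping`): without a `pos` argument, `lemmatize` treats every token as a noun (`pos='n'`).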

## POS Tagging

In [133]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


## Tag conversion to WordNet Tags

In [139]:
def pos_tag_wordnet(tagged_tokens):
    """Translate ``(token, tag)`` pairs into ``(token, WordNet POS)`` pairs.

    The first letter of each tag, lower-cased, selects the WordNet constant:
    ``j`` -> ADJ, ``v`` -> VERB, ``n`` -> NOUN, ``r`` -> ADV. Any other first
    letter falls back to NOUN.

    Parameters
    ----------
    tagged_tokens : iterable of (str, str)
        Token/tag pairs, e.g. the output of ``nltk.pos_tag``.

    Returns
    -------
    list of (str, str)
        The same tokens, each paired with its WordNet POS constant.
    """
    first_letter_to_pos = {
        "j": wordnet.ADJ,
        "v": wordnet.VERB,
        "n": wordnet.NOUN,
        "r": wordnet.ADV,
    }
    converted = []
    for token, tag in tagged_tokens:
        wn_pos = first_letter_to_pos.get(tag[0].lower(), wordnet.NOUN)
        converted.append((token, wn_pos))
    return converted

In [140]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

[('The', 'n'), ('brown', 'a'), ('foxes', 'n'), ('are', 'v'), ('quick', 'a'), ('and', 'n'), ('they', 'n'), ('are', 'v'), ('jumping', 'v'), ('over', 'n'), ('the', 'n'), ('sleeping', 'v'), ('lazy', 'a'), ('dogs', 'n'), ('!', 'n')]


## Effective Lemmatization

In [141]:
lemmatized_text = ' '.join(wnl.lemmatize(token, pos=tag) for token, tag in wordnet_tokens)
lemmatized_text

'The brown fox be quick and they be jump over the sleep lazy dog !'

## Your turn: Define a function such that you put all the above steps together so that it does the following
- Function name is **wordnet_lemmatize_text(...)**
- Input is a variable **text** which should take in a document (bunch of words)
- Call the earlier defined functions and utilize them
- Return lemmatized text as the output (as a string)

In [142]:
def wordnet_lemmatize_text(text):
    """Lemmatize ``text`` with WordNet, using a POS tag for each token.

    The text is word-tokenized and POS-tagged with NLTK, the tags are mapped to
    WordNet POS constants by ``pos_tag_wordnet``, and each token is lemmatized
    with its mapped tag.

    Parameters
    ----------
    text : str
        Document to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = [
        wnl.lemmatize(word, pos=wn_pos)
        for word, wn_pos in pos_tag_wordnet(tagged)
    ]
    return ' '.join(lemmas)

### Your Turn: Now call the function on the below sentence and test it

In [143]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [144]:
wordnet_lemmatize_text(s)

'The brown fox be quick and they be jump over the sleep lazy dog !'

## Lemmatization with Spacy

In [158]:
def spacy_lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Each token is replaced by its ``lemma_``. Tokens whose lemma is the literal
    placeholder ``"-PRON-"`` keep their original text instead.
    NOTE(review): "-PRON-" looks like a pronoun placeholder from older spaCy
    releases; confirm whether the installed version still emits it.

    Parameters
    ----------
    text : str
        Document to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    pieces = []
    for word in nlp(text):
        lemma = word.lemma_
        pieces.append(word.text if lemma == "-PRON-" else lemma)
    return ' '.join(pieces)

In [159]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [160]:
spacy_lemmatize_text(s)

'the brown fox be quick and they be jump over the sleep lazy dog !'

## Stopword Removal

In [189]:
def remove_stopwords(text, is_lowercase=False, stopwords=None):
    """Remove stop words from ``text``.

    Parameters
    ----------
    text : str
        Document to filter.
    is_lowercase : bool, default False
        Pass True when ``text`` is already lower-case; tokens are then compared
        as-is. Otherwise every token is lower-cased before the comparison, so
        the returned text is lower-case too.
    stopwords : iterable of str, optional
        Words to drop. When None, NLTK's English stop-word list is used. An
        explicitly passed empty collection is respected (nothing is removed).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Test against None rather than truthiness, so that a caller-supplied empty
    # collection is not silently replaced by the default list.
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # A set makes each membership test O(1) instead of scanning a list per token.
    stopword_set = set(stopwords)
    tokens = nltk.word_tokenize(text)
    if not is_lowercase:
        tokens = [token.lower() for token in tokens]
    return ' '.join(token for token in tokens if token not in stopword_set)

In [174]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [175]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [190]:
remove_stopwords(s)

'brown foxes quick jumping sleeping lazy dogs !'