In [1]:
# Sample sentence used by the case-conversion cells that follow.
text = 'The quick brown fox jumped over The Big Dog'
text

'The quick brown fox jumped over The Big Dog'

In [2]:
# Lower casing: str.lower() returns a new string and leaves `text` untouched
text.lower()

'the quick brown fox jumped over the big dog'

In [3]:
# Upper casing
text.upper()

'THE QUICK BROWN FOX JUMPED OVER THE BIG DOG'

In [4]:
# Title casing: the first letter of every word is capitalized
text.title()

'The Quick Brown Fox Jumped Over The Big Dog'

In [5]:
#Removing HTML tags & noise
import requests

# Fetch the Project Gutenberg HTML page used as noisy sample input.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page flow into the cleaning steps.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
content = data.text
# Preview a slice of the raw markup (hard-coded offsets into this document).
print(content[2745:3948])



<p id="id00010">Language: English</p>

<p id="id00011" style="margin-top: 2em">*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***</p>

<p id="id00012" style="margin-top: 4em">This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.</p>

<h1 id="id00013" style="margin-top: 5em">Book 01        Genesis</h1>

<p id="id00014">01:001:001 In the beginning God created the heaven and the earth.</p>

<p id="id00015" style="margin-left: 0%; margin-right: 0%">01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.</p>

<p id="id00016">01:001:003 And God said, Let there be light: and there was light.</p>

<p id="id00017">01:001:004 And God saw the light, that it was good: and God divided the<br/>

           light from the darkness.<br/>
</p>

<p id=

In [7]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the visible text of an HTML document with the markup removed.

    ``<iframe>`` and ``<script>`` elements are dropped together with their
    contents, then every run of carriage returns / line feeds is collapsed
    into a single newline.

    Args:
        text: HTML markup as a string.

    Returns:
        The extracted plain text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop rather than a throwaway list built only for its side effects.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Character class of CR and LF only. The previous pattern listed '|' inside
    # the brackets as well, so literal pipe characters in the text were being
    # replaced by newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

# Strip the markup from the downloaded page, then preview a hard-coded slice of the result.
clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***
This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.
Book 01        Genesis
01:001:001 In the beginning God created the heaven and the earth.
01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.



In [8]:
#Removing Accented Characters
import unicodedata

def remove_accented_chars(text):
    """Strip accents by discarding every non-ASCII character after NFKD decomposition.

    NFKD normalization splits an accented letter into its base letter plus
    combining marks; encoding to ASCII with ``'ignore'`` then drops the marks
    (and any other character that has no ASCII form).

    Args:
        text: Input string.

    Returns:
        The ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


In [9]:
# Sample string containing accented characters.
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [10]:
# Apply the accent stripper to the sample string.
remove_accented_chars(s)

'Some Accented text'

In [14]:
#Removing Special Characters, Numbers and Symbols
import re

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When true, digits are deleted as well.

    Returns:
        The filtered string; removed characters are not replaced by anything.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)



In [15]:
# Sample text mixing punctuation, digits, symbols and emoji.
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [16]:
# Default call (remove_digits=False), so digits are kept.
p=remove_special_characters(s)
p

'Well this was fun See you at 730 What do you think 9318 '

In [19]:
#Expanding Contractions
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s  # NOTE(review): a bare expression that is not the cell's last line is not displayed
import contractions

# NOTE(review): this preview of the contraction map is not displayed either, for the same reason
list(contractions.contractions_dict.items())[:10]

# Only this final expression is shown as the cell's output.
contractions.fix(s)

'you all can not expand contractions I would think! You would not be able to. how did you do it?'

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'do']


In [20]:
#Stemming
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Stem three inflections of the same verb; the cell displays the resulting tuple.
tuple(ps.stem(word) for word in ('jumping', 'jumps', 'jumped'))

('jump', 'jump', 'jump')

In [21]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance shared by the lemmatization cells that follow.
wnl = WordNetLemmatizer()

In [22]:
# Lemmatize two verb forms; 'v' is passed as the part-of-speech argument.
for verb_form in ('running', 'ate'):
    print(wnl.lemmatize(verb_form, 'v'))

run
eat


In [27]:
#Tokenization
import nltk
# Sample sentence for the tokenization, lemmatization, tagging and stopword cells.
s = 'The brown foxes are quick and they are jumping over the sleeping lazy do'
tokens = nltk.word_tokenize(s)
print(tokens)
# No part-of-speech argument is passed here (unlike the verb examples);
# in the recorded output only 'foxes' is changed.
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text


['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'do']


'The brown fox are quick and they are jumping over the sleeping lazy do'

In [29]:
#POS tagging: label each token from the tokenization cell with its part-of-speech tag
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'NN'), ('do', 'VBP')]


In [30]:
#Stopword Removal
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stopwords from ``text`` and return the remaining tokens joined by spaces.

    Args:
        text: String to filter; it is split with ``nltk.word_tokenize``.
        is_lower_case: If true, tokens are compared with the stopwords as-is.
            Otherwise each token is lower-cased for the comparison only, and
            kept tokens retain their original casing.
        stopwords: Iterable of stopword strings. When omitted or empty,
            ``nltk.corpus.stopwords.words('english')`` is used.

    Returns:
        The surviving tokens joined with single spaces.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    # Build the lookup set once: membership tests become constant-time instead
    # of scanning the list for every token, and a one-shot iterable is no
    # longer exhausted by the first lookup.
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)


In [31]:
# Peek at the first ten entries of NLTK's English stopword list.
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [32]:
# Show the sentence currently bound to `s` (assigned in the tokenization cell).
s

'The brown foxes are quick and they are jumping over the sleeping lazy do'

In [33]:
# is_lower_case=False: tokens are lower-cased before being compared with the stopword list.
remove_stopwords(s, is_lower_case=False)

'brown foxes quick jumping sleeping lazy'