In [57]:
#here we eill discuss about text pre-processing
# source: https://www.geeksforgeeks.org/introduction-to-natural-language-processing/
# import the necessary libraries
import nltk
import string
import re


In [31]:
#case change to lower case
def text_lowercase(text):
    return text.lower()
  
input_str = "The 5 Quick Brown foxes were trying to Jump over the 20 Little Lazy Dogs,    oh! they were so cute, but now they aren't!"
text_lowercase(input_str)

"the 5 quick brown foxes were trying to jump over the 20 little lazy dogs,    oh! they were so cute, but now they aren't!"

In [32]:
# Remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

remove_numbers(input_str)


"The  Quick Brown foxes were trying to Jump over the  Little Lazy Dogs,    oh! they were so cute, but now they aren't!"

In [33]:
#how to convert numbers into words as in 5 to five

# import the inflect library
import inflect
p = inflect.engine()


def convert_num(text):
    # split string into list of words
    temp_str = text.split()
    # initialise empty list
    new_string = []

    for word in temp_str:
        # if word is a digit, convert the digit
        # to numbers and append into the new_string list
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)

        # append the word as it is
        else:
            new_string.append(word)

    # join the words of new_string to form a string
    temp_str = ' '.join(new_string)
    return temp_str


convert_num(input_str)


"The five Quick Brown foxes were trying to Jump over the twenty Little Lazy Dogs, oh! they were so cute, but now they aren't!"

In [34]:
# removing punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

remove_punctuation(input_str)


'The 5 Quick Brown foxes were trying to Jump over the 20 Little Lazy Dogs    oh they were so cute but now they arent'

In [35]:
# remove whitespace or blanks from text
def remove_whitespace(text):
    return " ".join(text.split())

remove_whitespace(input_str)


"The 5 Quick Brown foxes were trying to Jump over the 20 Little Lazy Dogs, oh! they were so cute, but now they aren't!"

In [37]:
# Remove stop words using nltk corpus

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# remove stopwords function
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)


['This', 'sample', 'sentence', 'going', 'remove', 'stopwords', '.']

In [38]:
remove_stopwords(input_str)

['The',
 '5',
 'Quick',
 'Brown',
 'foxes',
 'trying',
 'Jump',
 '20',
 'Little',
 'Lazy',
 'Dogs',
 ',',
 'oh',
 '!',
 'cute',
 ',',
 "n't",
 '!']

In [39]:
#Stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

# There are mainly three algorithms for stemming:
#Porter Stemmer, the Snowball Stemmer and the Lancaster Stemmer. 
#Porter Stemmer is the most common

# stem words in the list of tokenised words
def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems

text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)


['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'type',
 'of',
 'process']

In [40]:
stem_words(input_str)

['the',
 '5',
 'quick',
 'brown',
 'fox',
 'were',
 'tri',
 'to',
 'jump',
 'over',
 'the',
 '20',
 'littl',
 'lazi',
 'dog',
 ',',
 'oh',
 '!',
 'they',
 'were',
 'so',
 'cute',
 ',',
 'but',
 'now',
 'they',
 'are',
 "n't",
 '!']

In [46]:
#Lemmatization
#similar to Stemming, only difference is lemmatizing converts the word into native language
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize(text):
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize(text)


['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']

In [47]:
lemmatize(input_str)

['The',
 '5',
 'Quick',
 'Brown',
 'fox',
 'be',
 'try',
 'to',
 'Jump',
 'over',
 'the',
 '20',
 'Little',
 'Lazy',
 'Dogs',
 ',',
 'oh',
 '!',
 'they',
 'be',
 'so',
 'cute',
 ',',
 'but',
 'now',
 'they',
 'be',
 "n't",
 '!']

In [44]:
#part of speech tagging
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# convert text into word_tokens with their tags
def pos_tagging(text):
    word_tokens = word_tokenize(text)
    return pos_tag(word_tokens)

pos_tagging('You just gave me a scare')


[('You', 'PRP'),
 ('just', 'RB'),
 ('gave', 'VBD'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('scare', 'NN')]

In [52]:
pos_result = pos_tagging(remove_punctuation(convert_num(input_str)))

In [50]:
# To understand the above coding, let's take help from the Penn tagset
# download the tagset
nltk.download('tagsets')

# extract information about the tag
nltk.help.upenn_tagset('NN')


NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to C:\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [51]:
#Chunking
#Chunking is the process of extracting phrases from unstructured text and more structure to it. 
#It is also known as shallow parsing

from nltk.tokenize import word_tokenize
from nltk import pos_tag

# define chunking function with text and regular
# expression representing grammar as parameter
def chunking(text, grammar):
    word_tokens = word_tokenize(text)

    # label words with part of speech
    word_pos = pos_tag(word_tokens)

    # create a chunk parser using grammar
    chunkParser = nltk.RegexpParser(grammar)

    # test it on the list of word tokens with tagged pos
    tree = chunkParser.parse(word_pos)
    
    for subtree in tree.subtrees():
        print(subtree)
    tree.draw()
    
sentence = 'the little yellow bird is flying in the sky'
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar)



(S
  (NP the/DT little/JJ yellow/JJ bird/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ yellow/JJ bird/NN)
(NP the/DT sky/NN)


In [55]:
#Named Entity Recognition

from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

def ner(text):
    # tokenize the text
    word_tokens = word_tokenize(text)

    # part of speech tagging of words
    word_pos = pos_tag(word_tokens)

    # tree of word entities
    print(ne_chunk(word_pos))

text = 'Bill works for GeeksforGeeks so he went to Delhi for a meetup.'
ner(text)


(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  (ORGANIZATION GeeksforGeeks/NNP)
  so/RB
  he/PRP
  went/VBD
  to/TO
  (GPE Delhi/NNP)
  for/IN
  a/DT
  meetup/NN
  ./.)


In [56]:
ner(input_str)

(S
  The/DT
  5/CD
  (PERSON Quick/NNP Brown/NNP)
  foxes/NNS
  were/VBD
  trying/VBG
  to/TO
  (GPE Jump/NNP)
  over/IN
  the/DT
  20/CD
  Little/NNP
  Lazy/NNP
  Dogs/NNP
  ,/,
  oh/UH
  !/.
  they/PRP
  were/VBD
  so/RB
  cute/JJ
  ,/,
  but/CC
  now/RB
  they/PRP
  are/VBP
  n't/RB
  !/.)
