# Text Preprocessing in Python

In [1]:
# Import the necessary libraries (standard library first, then third-party)
import re
import string

import nltk

# Download the NLTK resources the cells below rely on.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# The stop-word removal cell calls stopwords.words("english"); without this
# download that call raises LookupError on a machine that has never fetched
# the corpus.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/eosindo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eosindo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/eosindo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Lowercasing folds differently-cased spellings of a word into a single form,
# which reduces the size of the vocabulary of our text data.
def text_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
text_lowercase(input_str)


"hey, did you know that the summer break is coming? amazing right !! it's only 5 more days !!"

### Remove numbers

In [3]:
# Numbers can either be dropped or converted into their textual
# representation. This cell drops them with a regular expression.
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Only the digits are removed; surrounding whitespace is left as it is, so
    a deleted number can leave two adjacent spaces behind.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

input_str = "There are 3 balls in this bag, and 12 in the other one."
remove_numbers(input_str)

'There are  balls in this bag, and  in the other one.'

#### We can also convert the numbers into words. This can be done by using the inflect library.

In [4]:
# import the inflect library
import inflect
p = inflect.engine()

# Spell out numbers as words
def convert_number(text):
    """Replace purely numeric tokens of ``text`` with their word form.

    ``text`` is split on whitespace. A token is converted with the
    module-level inflect engine ``p`` only when ``str.isdigit()`` is true for
    the whole token, so a number with punctuation attached (e.g. "12.") is
    passed through unchanged. The tokens are re-joined with single spaces.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

input_str = 'There are 3 balls in this bag, and 12 in the other one.'
convert_number(input_str)

'There are three balls in this bag, and twelve in the other one.'

#### Remove punctuation:

We remove punctuation so that we don't end up with different forms of the same word. If we don't remove the punctuation, then `been.`, `been,` and `been!` will each be treated as a separate word.

In [5]:
# Strip punctuation characters
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Nothing is inserted in place of a deleted character, so whitespace that
    surrounded it is kept.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_table)

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
remove_punctuation(input_str)

'Hey did you know that the summer break is coming Amazing right  Its only 5 more days '

#### Remove whitespace

In [6]:
# Normalise the whitespace in a piece of text
def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

input_str = "   we don't need   the given questions"
remove_whitespace(input_str)

"we don't need the given questions"

### Remove default stopwords

In [7]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
# Build the stop-word set once instead of re-reading the corpus on every call.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

# remove stopwords function
def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words, ignoring case.

    Args:
        text: The string to filter.

    Returns:
        A list of the remaining tokens, in their original order and casing.
        Punctuation tokens are not stop words and are therefore kept.
    """
    # The stop-word list is lowercase, so compare the lowercased token;
    # otherwise capitalised stop words such as "This" slip through.
    return [
        token
        for token in word_tokenize(text)
        if token.lower() not in ENGLISH_STOP_WORDS
    ]

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)

['sample', 'sentence', 'going', 'remove', 'stopwords', '.']

### Stemming

In [8]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

# Reduce every token of a text to its stem
def stem_words(text):
    """Tokenize ``text`` and return the stem of each token, in order.

    Stemming is done by the module-level ``stemmer``; the result is a list
    with one entry per token.
    """
    return list(map(stemmer.stem, word_tokenize(text)))


text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)

['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'type',
 'of',
 'process']

### Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
# Lemmatize every token of a text
def lemmatize_word(text):
    """Tokenize ``text`` and lemmatize each token as a verb.

    Every token is handed to the module-level ``lemmatizer`` with
    ``pos='v'``. Tokens it does not reduce under that part of speech (for
    instance 'methods' in the example below) come back unchanged.
    """
    lemmas = []
    for token in word_tokenize(text):
        # pos='v' supplies the part-of-speech context: treat the token as a verb
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize_word(text)

['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']