# Text Processing


This notebook contains an example of text processing stage of NLP pipeline

In [1]:
import re
import nltk
import string
import itertools
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

from nltk import pos_tag, ne_chunk

import warnings

warnings.filterwarnings('ignore')

[nltk_data] Downloading package words to
[nltk_data]     /Users/kc.kasaraneni/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kc.kasaraneni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kc.kasaraneni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/kc.kasaraneni/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [2]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

## 1. Cleaning 

In [3]:
def clean_text(text):

    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    
    # Remove emails
    text = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', text, flags=re.MULTILINE)
    
    print('\nCleaning output:\n')
    print(text)
    
    return text

## 2. Normalization

In [4]:
def normailze_text(text):
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra characters
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    
    # Remove punctuation characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
    
    # Remove symbols
    text = re.sub(r'[^A-Za-z\s]',r'',text)
    text = re.sub(r'\n',r'',text)
    
    print('\nNormalization output:\n')
    print(text)
    
    return text

## 3. Tokenization

In [5]:
def tokenize_text(text):
    
    #Tokenize words
    tokens = nltk.word_tokenize(text)
    print('\nTokenization output:\n')
    print(tokens)
    
    return tokens

## 4. Stop words removal

In [6]:
def remove_stopwords(tokens):
    
    stop_words = stopwords.words('english')
    token_list = []
    
    for word in tokens:
        if not word in stop_words:
            token_list.append(word)
            
    print('\nOutput after removing stop words:\n')
    
    return token_list

## 5. Parts of speech tagging & Named Entity Recognition

In [7]:
def pos_ner(tokens):
    
    #POS tagging
    pos = pos_tag(tokens)
    
    print('\nParts of Speech Tagging:\n')
    print(pos)
    
    #NER
    ner = ne_chunk(pos)
    
    print('\nNamed Entity Recognition:\n')
    print(ner)

## 6. Stemming and Lemmatizing Tokens

In [8]:
def stem_lem_words(tokens):
    
    # Stemming tokens
    tokens = [stemmer.stem(token) for token in tokens]
    
    print('\nStemming Output:\n')
    print(tokens)
    
    #Lemmatizing tokens
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]

    print('\nLemmatizing Output:\n')
    print(tokens)
    
    return tokens

In [9]:
def main(text):
    
    # Clean Text
    text = clean_text(text)
    
    # Normalize Text
    text = normailze_text(text)
    
    # Tokenize Text
    tokens = tokenize_text(text)
    
    # Remove Stop words
    tokens = remove_stopwords(tokens)
    
    # Display POS & NER
    pos_ner(tokens)
    
    # Stem & Lemmatize Tokens
    tokens = stem_lem_words(tokens)

    print('\nThe input text after processing:\n')
    
    return tokens

In [10]:
inp_text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ? Learn more at https://www.ai.com/test"

print(main(inp_text))


Cleaning output:

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ? Learn more at 

Normalization output:

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part   it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing   learn more at 

Tokenization output:

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing', 'learn', 'more', 'at']

Output after removing stop words:


Parts of Speech Tag