In [28]:
import re 
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer 
from nltk import word_tokenize, pos_tag
import gensim
from gensim import corpora

Using Theano backend.


# 1. Text Preprocessing

![title](./images/pipeline.png)

Noise removal

In [1]:
noise_list = ["is", "a", "this", "..."] 
def _remove_noise(input_text):
    words = input_text.split() 
    noise_free_words = [word for word in words if word not in noise_list] 
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text

_remove_noise("this is a sample text")

'sample text'

using regexp

In [3]:
def _remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text) 
    for i in urls: 
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

regex_pattern = "#[\w]*"  

_remove_regex("remove this #hashtag from analytics vidhya", regex_pattern)

'remove this  from analytics vidhya'

In [12]:
lem = WordNetLemmatizer()
stem = PorterStemmer()

Obtaing the root of the word

In [11]:
word = "multiplying" 
lem.lemmatize(word, "v")

'multiply'

Strripping off the suffixes

In [10]:
stem.stem(word)

'multipli'

Object Standardization

In [16]:
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}
def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) 
        new_text = " ".join(new_words) 
    return new_text

_lookup_words("RT this is a retweeted tweet by Shivam Bansal")

'Retweet this is a retweeted tweet by Shivam Bansal'

# 2. Text to Features (Feature Engineering on text data)

Dependency Tree
![title](./images/trees.png)

Part of speech tagging 

In [26]:
text = "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)
print(pos_tag(tokens))

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('on', 'IN'), ('Analytics', 'NNP'), ('Vidhya', 'NNP')]


Entity Extraction (Entities as features)
![title](./images/entity.png)

Topic Modeling

In [30]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father." 
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
doc_clean = [doc.split() for doc in doc_complete]

# Creating the term dictionary of corpus, where every unique term is assigned an index.  
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above. 
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

# Results 
print(ldamodel.print_topics())

[(0, '0.060*"driving" + 0.060*"cause" + 0.060*"pressure." + 0.060*"blood" + 0.060*"increased" + 0.060*"stress" + 0.060*"suggest" + 0.060*"Doctors" + 0.060*"and" + 0.060*"that"'), (1, '0.053*"driving" + 0.053*"sister" + 0.053*"my" + 0.053*"My" + 0.053*"father" + 0.053*"of" + 0.053*"dance" + 0.053*"practice." + 0.053*"around" + 0.053*"time"'), (2, '0.089*"to" + 0.051*"My" + 0.051*"my" + 0.051*"sister" + 0.051*"not" + 0.051*"bad" + 0.051*"consume." + 0.051*"Sugar" + 0.051*"is" + 0.051*"father."')]


 N-Grams as Features

In [31]:
def generate_ngrams(text, n):
    words = text.split()
    output = []  
    for i in range(len(words)-n+1):
        output.append(words[i:i+n])
    return output

generate_ngrams('this is a sample text', 2)

[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]