In [24]:
import re 
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer 
from nltk import word_tokenize, pos_tag
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob
import sklearn.feature_extraction.text
from sklearn.metrics import classification_report
from sklearn import svm 
import math
from collections import Counter

# 1. Text Preprocessing

![title](./images/pipeline.png)

Noise removal

In [1]:
# Tokens treated as noise; matching is exact and case-sensitive.
noise_list = ["is", "a", "this", "..."]


def _remove_noise(input_text):
    """Return ``input_text`` without the tokens listed in ``noise_list``.

    The text is split on whitespace, every token that appears in
    ``noise_list`` is dropped, and the survivors are re-joined with single
    spaces (so runs of whitespace in the input are collapsed).

    Args:
        input_text: The string to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_tokens = (token for token in input_text.split() if token not in noise_list)
    return " ".join(kept_tokens)

_remove_noise("this is a sample text")

'sample text'

Noise removal using regular expressions

In [3]:
def _remove_regex(input_text, regex_pattern):
    """Remove the text matched by ``regex_pattern`` from ``input_text``.

    Every match found in the original text is stripped of surrounding
    whitespace and then deleted wherever that exact text occurs.

    Args:
        input_text: The string to clean.
        regex_pattern: Regular expression describing the text to remove.

    Returns:
        ``input_text`` with the matched text removed.
    """
    for match in re.finditer(regex_pattern, input_text):
        # Delete the matched text literally. Feeding it back into re.sub()
        # would re-interpret characters such as "+", "." or "(" as regex
        # syntax, removing the wrong text or raising re.error.
        input_text = input_text.replace(match.group().strip(), '')
    return input_text

# Raw string so the "\w" escape reaches the regex engine unchanged.
regex_pattern = r"#[\w]*"

_remove_regex("remove this #hashtag from analytics vidhya", regex_pattern)

'remove this  from analytics vidhya'

In [12]:
# Shared NLTK normalizer instances, reused by the lemmatization and stemming cells.
lem = WordNetLemmatizer()
stem = PorterStemmer()

Obtaining the root of the word (lemmatization)

In [11]:
# Sample word shared with the stemming cell.
word = "multiplying" 
# The second argument is the part of speech; "v" lemmatizes the word as a verb.
lem.lemmatize(word, "v")

'multiply'

Stripping off the suffixes (stemming)

In [10]:
# Reuses `word` from the lemmatization cell. The stemmer trims suffixes by
# rule, so the result ('multipli') need not be a dictionary word.
stem.stem(word)

'multipli'

Object Standardization

In [16]:
# Lower-cased shorthand -> standard form used when normalizing text.
lookup_dict = {'rt': 'Retweet', 'dm': 'direct message', "awsm": "awesome", "luv": "love"}


def _lookup_words(input_text):
    """Replace known shorthand tokens in ``input_text`` with their standard form.

    The text is split on whitespace. A token whose lower-cased form is a key
    of ``lookup_dict`` is replaced by the mapped value; every other token is
    kept unchanged. Tokens are re-joined with single spaces.

    Args:
        input_text: The string to standardize.

    Returns:
        The standardized text. Empty or whitespace-only input yields ``""``.
    """
    # Join once, after the loop: joining inside the loop left the result
    # unbound for empty input and rebuilt the string on every iteration.
    new_words = [lookup_dict.get(word.lower(), word) for word in input_text.split()]
    return " ".join(new_words)

_lookup_words("RT this is a retweeted tweet by Shivam Bansal")

'Retweet this is a retweeted tweet by Shivam Bansal'

# 2. Text to Features (Feature Engineering on text data)

Dependency Tree
![title](./images/trees.png)

Part of speech tagging 

In [26]:
# Tokenize the sentence, then tag each token with its part of speech.
text = "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)
# pos_tag returns one (token, tag) pair per token.
print(pos_tag(tokens))

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('on', 'IN'), ('Analytics', 'NNP'), ('Vidhya', 'NNP')]


Entity Extraction (Entities as features)
![title](./images/entity.png)

Topic Modeling

In [30]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]
# Whitespace tokenization only: case and punctuation are left untouched, so
# "My"/"my" and "pressure." remain distinct terms in the dictionary.
doc_clean = [doc.split() for doc in doc_complete]

# Creating the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting the list of documents (corpus) into a document-term matrix using the dictionary above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for the LDA model using the gensim library.
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
# LDA training is stochastic; a fixed random_state makes the printed topics
# reproducible across "Restart & Run All".
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50, random_state=42)

# Results
print(ldamodel.print_topics())

[(0, '0.060*"driving" + 0.060*"cause" + 0.060*"pressure." + 0.060*"blood" + 0.060*"increased" + 0.060*"stress" + 0.060*"suggest" + 0.060*"Doctors" + 0.060*"and" + 0.060*"that"'), (1, '0.053*"driving" + 0.053*"sister" + 0.053*"my" + 0.053*"My" + 0.053*"father" + 0.053*"of" + 0.053*"dance" + 0.053*"practice." + 0.053*"around" + 0.053*"time"'), (2, '0.089*"to" + 0.051*"My" + 0.051*"my" + 0.051*"sister" + 0.051*"not" + 0.051*"bad" + 0.051*"consume." + 0.051*"Sugar" + 0.051*"is" + 0.051*"father."')]


 N-Grams as Features

In [31]:
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated tokens.

    Args:
        text: The string to split into tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-grams in order of appearance, each one a list of ``n``
        tokens. The list is empty when the text has fewer than ``n`` tokens.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]

generate_ngrams('this is a sample text', 2)

[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]

Statistical Features
![title](./images/tf.png)

In [2]:
# TF-IDF vectorizer with default settings.
obj = TfidfVectorizer()
corpus = ['This is sample document.', 'another random document.', 'third sample document text']
# Learn the vocabulary from the corpus and turn each document into a weighted vector.
X = obj.fit_transform(corpus)
# Printed as "(document index, term index)  weight" for every non-zero entry.
print(X)

  (0, 7)	0.58448290102
  (0, 2)	0.58448290102
  (0, 4)	0.444514311537
  (0, 1)	0.345205016865
  (1, 1)	0.385371627466
  (1, 0)	0.652490884513
  (1, 3)	0.652490884513
  (2, 4)	0.444514311537
  (2, 1)	0.345205016865
  (2, 6)	0.58448290102
  (2, 5)	0.58448290102


 Word Embedding (text vectors)

In [4]:
# Toy corpus: each inner list is one already-tokenized sentence.
sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'], ['machine', 'learning'], ['deep', 'learning']]

# Train the model on your corpus; min_count=1 keeps words that occur only once.
model = Word2Vec(sentences, min_count=1)

# Query the trained vectors through `model.wv` (KeyedVectors). Calling
# `model.similarity(...)` or indexing `model[...]` directly was deprecated
# and no longer works in gensim 4.
print(model.wv.similarity('data', 'science'))

print(model.wv['learning'])

0.0873396930529
[  1.14168017e-03  -4.94242134e-03   2.72238249e-04   3.97438370e-03
   1.69398112e-03   2.43922602e-03  -3.20519553e-04  -1.18722732e-03
  -4.35135467e-03  -3.56071885e-03   3.99002247e-03  -3.28068179e-03
  -1.49453568e-04  -3.82759469e-03  -2.43284900e-04   1.77918072e-03
   3.11065342e-05   3.49582359e-03   6.00333533e-06   2.08169455e-03
  -4.31511085e-03   2.12243013e-03   7.45210506e-04   3.46274348e-03
  -1.47648144e-03  -2.76382617e-03  -2.81338114e-03  -7.59243267e-04
   1.94972206e-04   3.54523258e-03  -3.22825788e-03   2.56676367e-03
   1.02296274e-03  -4.39176848e-03  -4.27832548e-03   6.48388872e-04
   5.47887932e-04  -2.09806324e-03  -4.96958056e-03   2.08194065e-03
  -2.02509505e-03  -5.73990343e-04   1.04251027e-03   4.81688976e-03
  -2.63691857e-03  -3.34192789e-03   4.24436264e-04  -2.71220435e-03
  -3.24598700e-03  -2.61668139e-03   6.59248501e-04   3.95958871e-03
  -2.47115782e-03  -4.93136141e-03  -3.11104301e-03  -7.91999686e-04
   1.96530949e-03 

# Important tasks of NLP

Text Classification
![title](./images/classification.png)

In [10]:
# Labelled examples as (sentence, class label) pairs used to train the classifier.
training_corpus = [
                   ('I am exhausted of this work.', 'Class_B'),
                   ("I can't cooperate with this", 'Class_B'),
                   ('He is my badest enemy!', 'Class_B'),
                   ('My management is poor.', 'Class_B'),
                   ('I love this burger.', 'Class_A'),
                   ('This is an brilliant place!', 'Class_A'),
                   ('I feel very good about these dates.', 'Class_A'),
                   ('This is my best work.', 'Class_A'),
                   ("What an awesome view", 'Class_A'),
                   ('I do not like this dish', 'Class_B')]
# Held-out (sentence, class label) pairs used only for scoring.
test_corpus = [
                ("I am not feeling well today.", 'Class_B'), 
                ("I feel brilliant!", 'Class_A'), 
                ('Gary is a friend of mine.', 'Class_A'), 
                ("I can't believe I'm doing this.", 'Class_B'), 
                ('The date was good.', 'Class_A'), ('I do not enjoy my job', 'Class_B')]

# NOTE: rebinds `model`, which the Word2Vec cell also assigns.
model = NBC(training_corpus) 
# Classify two unseen sentences.
print(model.classify("Their codes are amazing."))

print(model.classify("I don't like their computer."))

# Accuracy on the held-out test pairs.
print(model.accuracy(test_corpus)) 

Class_A
Class_B
0.8333333333333334


sklearn data pipeline

In [19]:
# Split the (sentence, label) pairs into parallel lists of documents and labels.
train_data = [pair[0] for pair in training_corpus]
train_labels = [pair[1] for pair in training_corpus]

test_data = [pair[0] for pair in test_corpus]
test_labels = [pair[1] for pair in test_corpus]

# Build TF-IDF features: keep terms found in at least 4 training documents
# and in no more than 90% of them.
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
# Learn the vocabulary and weights from the training documents only...
train_vectors = vectorizer.fit_transform(train_data)
# ...then project the test documents onto that same vocabulary.
test_vectors = vectorizer.transform(test_data)

# Linear-kernel SVM classifier (rebinds `model`).
model = svm.SVC(kernel='linear')
model.fit(train_vectors, train_labels)
prediction = model.predict(test_vectors)

print(classification_report(test_labels, prediction))

             precision    recall  f1-score   support

    Class_A       0.50      0.67      0.57         3
    Class_B       0.50      0.33      0.40         3

avg / total       0.50      0.50      0.49         6



Text Matching / Similarity

Levenshtein Distance 

In [20]:
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only one row of the dynamic-programming table is kept at a time, sized
    by the shorter input.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The edit distance as an integer.
    """
    # Keep the row as short as possible by iterating columns over the shorter input.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    previous_row = list(range(len(shorter) + 1))
    for row_number, longer_char in enumerate(longer, start=1):
        current_row = [row_number]
        for column, shorter_char in enumerate(shorter):
            if shorter_char == longer_char:
                # Matching characters cost nothing: carry the diagonal value.
                cost = previous_row[column]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cost = 1 + min(previous_row[column], previous_row[column + 1], current_row[-1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

print(levenshtein("analyze","analyse"))

1


Cosine Similarity

In [26]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        vec1: Mapping from term to numeric weight.
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, as a float. ``0.0`` when either norm is zero.
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2.keys()))
    magnitude = norm1 * norm2

    if magnitude:
        return float(dot_product) / magnitude
    # A zero-length vector has no direction; report no similarity.
    return 0.0

def text_to_vector(text):
    """Return a ``Counter`` mapping each whitespace-separated token of ``text`` to its count."""
    return Counter(text.split())

text1 = 'This is an article on analytics vidhya' 
text2 = 'article on analytics vidhya is about natural language processing'

# Turn each text into a token-count vector, then compare the two vectors.
vector1 = text_to_vector(text1) 
vector2 = text_to_vector(text2) 
cosine = get_cosine(vector1, vector2)
print(cosine)

0.629940788348712
