# Natural Language Processing Introduction


# Noise removal in sample text


In [1]:
# Sample code to remove noisy words from a text

noise_list = ["is", "a", "this", "..."] 
def _remove_noise(input_text):
    words = input_text.split() 
    noise_free_words = [word for word in words if word not in noise_list] 
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text

In [2]:
_remove_noise("this is a sample text")

'sample text'

In [3]:
# Sample code to remove a regex pattern 
import re 

def _remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text) 
    for i in urls: 
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

regex_pattern = "#[\w]*"  

_remove_regex("remove this #hashtag from sample text", regex_pattern)

'remove this  from sample text'

# Root word Extraction

In [4]:
# Finding root words
from nltk.stem.wordnet import WordNetLemmatizer 
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer 
stem = PorterStemmer()



In [5]:
word = "multiplying" 
lem.lemmatize(word, "v")

'multiply'

In [6]:
stem.stem(word)

'multipli'

# Object standardization

In [7]:
# Object standardization
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}
def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) 
        new_text = " ".join(new_words) 
        return new_text



In [8]:
_lookup_words("RT this is a retweeted tweet by User")

'Retweet'

# Parts of speech tagging

In [9]:
from nltk import word_tokenize, pos_tag
text = "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)
print(pos_tag(tokens))


[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('on', 'IN'), ('Analytics', 'NNP'), ('Vidhya', 'NNP')]


# Topic modelling

# n -gram generation

In [11]:
# n -gram generation
def generate_ngrams(text, n):
    words = text.split()
    output = []  
    for i in range(len(words)-n+1):
        output.append(words[i:i+n])
    return output



In [12]:
 generate_ngrams('this is a sample text', 2)


[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]

# Term Frequency – Inverse Document Frequency (TF – IDF)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
obj = TfidfVectorizer()
corpus = ['This is sample document.', 'another random document.', 'third sample document text']
X = obj.fit_transform(corpus)
print(X)


  (0, 1)	0.34520501686496574
  (0, 4)	0.444514311537431
  (0, 2)	0.5844829010200651
  (0, 7)	0.5844829010200651
  (1, 3)	0.652490884512534
  (1, 0)	0.652490884512534
  (1, 1)	0.3853716274664007
  (2, 5)	0.5844829010200651
  (2, 6)	0.5844829010200651
  (2, 1)	0.34520501686496574
  (2, 4)	0.444514311537431


# Word Embedding (text vectors)

In [14]:
from gensim.models import Word2Vec
sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'],['machine', 'learning'], ['deep', 'learning']]

# train the model on your corpus  
model = Word2Vec(sentences, min_count = 1)


In [15]:
print(model.similarity('data', 'science'))

-0.026081588


  """Entry point for launching an IPython kernel.


In [16]:
print(model['learning']  )

[-2.1114559e-03  4.0282998e-03 -2.4209418e-03 -2.3224731e-03
 -2.9035429e-03  2.3237576e-03  1.1352203e-03  9.7878603e-04
  2.6004343e-05 -4.6746302e-03  2.2118350e-03 -3.6649862e-03
  1.1161554e-03  2.7634865e-03  4.8253480e-03 -2.3518649e-03
  2.6311895e-03 -9.2142104e-04  1.6654573e-03 -3.2163823e-03
  4.0544942e-03 -4.6770307e-03 -3.2631867e-04 -1.1999932e-04
  4.7497069e-03 -2.2621672e-03 -9.8093401e-04  4.8115752e-03
 -7.0598020e-05 -3.1731846e-03  3.6439991e-03  2.3624599e-03
  1.6675318e-03 -1.1162723e-03 -2.2862034e-03  5.4479000e-04
  3.4477548e-03  6.7857566e-04  1.0293481e-03  4.5942471e-04
  3.7593981e-03 -1.3868172e-04 -2.7522233e-03 -2.6950150e-04
 -1.7405511e-03 -4.2016585e-03  1.4243968e-03 -3.6115085e-03
 -8.9955947e-04 -3.1237637e-03  1.7734072e-03 -1.3439535e-03
 -3.8077247e-03 -2.5034812e-04  4.6441434e-03 -6.4118707e-05
  2.5672978e-03  4.6155606e-03 -2.9132729e-03  9.2010832e-06
 -3.5237276e-03 -3.8213239e-03 -2.4839744e-03  1.4439465e-03
 -1.4836479e-03  4.48653

  """Entry point for launching an IPython kernel.


# Text Classification


In [24]:
# Here is a code that uses naive bayes classifier using text blob library (built on top of nltk).
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob
training_corpus = [
                   ('I am exhausted of this work.', 'Class_B'),
                   ("I can't cooperate with this", 'Class_B'),
                   ('He is my badest enemy!', 'Class_B'),
                   ('My management is poor.', 'Class_B'),
                   ('I love this burger.', 'Class_A'),
                   ('This is an brilliant place!', 'Class_A'),
                   ('I feel very good about these dates.', 'Class_A'),
                   ('This is my best work.', 'Class_A'),
                   ("What an awesome view", 'Class_A'),
                   ('I do not like this dish', 'Class_B')]
test_corpus = [
                ("I am not feeling well today.", 'Class_B'), 
                ("I feel brilliant!", 'Class_A'), 
                ('Gary is a friend of mine.', 'Class_A'), 
                ("I can't believe I'm doing this.", 'Class_B'), 
                ('The date was good.', 'Class_A'), ('I do not enjoy my job', 'Class_B')]

model = NBC(training_corpus) 


In [21]:
print(model.classify("Their codes are amazing."))

Class_A


In [22]:
print(model.classify("I don't like their computer."))



Class_B


In [23]:
print(model.accuracy(test_corpus))

0.8333333333333334


In [25]:
# Scikit.Learn also provides a pipeline framework for text classification:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm 

# preparing data for SVM model (using the same training_corpus, test_corpus from naive bayes example)
train_data = []
train_labels = []
for row in training_corpus:
    train_data.append(row[0])
    train_labels.append(row[1])

test_data = [] 
test_labels = [] 
for row in test_corpus:
    test_data.append(row[0]) 
    test_labels.append(row[1])

# Create feature vectors 
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
# Train the feature vectors
train_vectors = vectorizer.fit_transform(train_data)
# Apply model on test data 
test_vectors = vectorizer.transform(test_data)

# Perform classification with SVM, kernel=linear 
model = svm.SVC(kernel='linear') 
model.fit(train_vectors, train_labels) 
prediction = model.predict(test_vectors)

In [26]:
print (classification_report(test_labels, prediction))

              precision    recall  f1-score   support

     Class_A       0.50      0.67      0.57         3
     Class_B       0.50      0.33      0.40         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6



# Text Matching / Similarity

In [27]:
# levenshtein distance calculation example

def levenshtein(s1,s2): 
    if len(s1) > len(s2):
        s1,s2 = s2,s1 
    distances = range(len(s1) + 1) 
    for index2,char2 in enumerate(s2):
        newDistances = [index2+1]
        for index1,char1 in enumerate(s1):
            if char1 == char2:
                newDistances.append(distances[index1]) 
            else:
                 newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1]))) 
        distances = newDistances 
    return distances[-1]

print(levenshtein("analyze","analyse"))

1


In [28]:
import fuzzy 
soundex = fuzzy.Soundex(4) 
print(soundex('swathi'))

S300


In [29]:
print(soundex('sweety'))

S300


In [30]:
# Cosine Similarity 
import math
from collections import Counter
def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in common])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 
    sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
   
    if not denominator:
        return 0.0 
    else:
        return float(numerator) / denominator

def text_to_vector(text): 
    words = text.split() 
    return Counter(words)

text1 = 'This is an article on analytics vidhya' 
text2 = 'article on analytics vidhya is about natural language processing'

vector1 = text_to_vector(text1) 
vector2 = text_to_vector(text2) 
cosine = get_cosine(vector1, vector2)


In [31]:
cosine

0.629940788348712

# Other NLP Use Cases