# Basic NLP Pipeline

* Data Collection
* Tokenization, Stopword Removal, Stemming
* Building a common vocab
* Vectorizing the documents
* Performing Classification/Clustering

In [248]:
from nltk.corpus import brown

## 1. Data Collection

In [249]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [250]:
# Load the sentences of the 'editorial' category; each sentence is a list of word tokens.
data = brown.sents(categories = 'editorial') # gives list of sentences

In [251]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [252]:
print(data[0])

['Assembly', 'session', 'brought', 'much', 'good']


In [253]:
print(data[1])

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


In [254]:
len(data)

2997

## 2. (a) Tokenization

In [255]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [256]:
text = "hello world, It is a beautiful day. Time to work, give 100% and achieve greatness. Mail at abc@def.com"

In [257]:
sents = sent_tokenize(text) # get sentences as list
print(sents)

['hello world, It is a beautiful day.', 'Time to work, give 100% and achieve greatness.', 'Mail at abc@def.com']


In [258]:
word_list = word_tokenize(sents[0].lower()) # get words as list from sentence
print(word_list)

['hello', 'world', ',', 'it', 'is', 'a', 'beautiful', 'day', '.']


## 2. (b) Stopword Removal

In [259]:
from nltk.corpus import stopwords

In [260]:
# Keep the English stopwords in a set: filter_words tests membership once per token.
sw = set(stopwords.words('english'))

In [261]:
print(sw)

{'been', 'yourself', 'this', 'over', 'they', 'themselves', 'above', "haven't", "aren't", 'did', 'be', 'had', "you've", 'because', 'in', 'how', 'if', 'through', 'there', 'these', 'of', 'such', "hasn't", 'hadn', "isn't", 'can', 'for', 'haven', 'ours', 've', 'that', 'on', "didn't", 'the', 'own', 'it', 'into', "couldn't", 'below', "weren't", 'from', 'her', 'only', 'she', 'ma', 'an', 'off', 'i', 'a', 'until', 'while', 'against', 'was', "shan't", "wasn't", 'theirs', 'after', 'does', "don't", 'itself', 'is', 'aren', "it's", 'having', 'isn', 'weren', 'wouldn', 'with', 'them', 'once', 'now', 'to', 'no', 'those', 'about', 'wasn', 'some', 'then', 't', 'don', "should've", 'both', 'yours', 'and', 'our', 'down', 'between', 'when', "won't", 'y', 'few', 'o', 'but', 'or', 'not', 'further', 'before', 'were', 'too', 're', 'each', 'needn', 'doesn', 'under', 'didn', "you'll", 'has', 'should', 'ain', 'here', 'so', 'won', "doesn't", 'more', 'up', 'doing', 'do', 'just', 'their', 'as', 'my', 'who', 'during', '

In [262]:
print(len(sw))

179


### Filter the words from your sentence

In [263]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords.

    Args:
        word_list: Iterable of tokens. Tokens are compared as-is, so
            lowercase them first when matching against a lowercase
            stopword collection.
        stop_words: Optional collection of tokens to drop. When omitted,
            the module-level ``sw`` set is used.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = sw
    return [w for w in word_list if w not in stop_words]

    
useful_words= filter_words(word_list)
print(useful_words)

['hello', 'world', ',', 'beautiful', 'day', '.']


### Tokenization using Regular Expression
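#### Problem with the word tokenizer: it keeps punctuation such as `,` and `.` as separate tokens and gives no control over what counts as a token. NLTK's `RegexpTokenizer` class lets us define tokens with a regular expression instead.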

In [264]:
from nltk.tokenize import RegexpTokenizer

In [265]:
# Tokens are runs of ASCII letters and '@'. The upper-case range must be
# written A-Z: "A-z" also spans the ASCII characters between 'Z' and 'a'
# ([ \ ] ^ _ `), which would let that punctuation leak into tokens.
tokenizer = RegexpTokenizer(r"[a-zA-Z@]+")

In [266]:
text = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com."
print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


## 2. (c) Stemming

* Process that transforms inflected words (verb forms, plurals) into their root (stem) form
* Preserve the semantics of the sentence without increasing the number of unique tokens
* jumps, jumping, jumped, jump ==> jump


In [267]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""
words_list = tokenizer.tokenize(text.lower())
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [268]:
word_list = filter_words(words_list) # Remove the stopwords
print(word_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


### Stemming
* 1) Snowball Stemmer (Multilingual)
* 2) Porter Stemmer
* 3) LancasterStemmer

In [269]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()

In [270]:
ps.stem("jumped")

'jump'

In [271]:
ps.stem("jumping")

'jump'

In [272]:
ps.stem("jump")

'jump'

In [273]:
ls = LancasterStemmer()
# Lancaster stemmer on "teeth"
print(ls.stem("teeth"))

# Same word through both stemmers, to compare their output
print(ps.stem("teenager")) # Porter stemmer
print(ls.stem("teenager")) # Lancaster stemmer

tee
teenag
teen


In [274]:
# Snowball stemmer
ss = SnowballStemmer('english')
print(ss.stem('lovely'))
print(ss.stem('teenager'))

love
teenag


### Lemmatization

In [275]:
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()
# lemmatize() treats the word as a noun unless a part of speech is given;
# "crying" is a verb form here, so pass pos="v" to reduce it to "cry".
l.lemmatize("crying", pos="v")

'cry'

## 3. Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model)

In [276]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [277]:
from sklearn.feature_extraction.text import CountVectorizer # helps in counting the frequency of each word

In [278]:
cv = CountVectorizer()

In [279]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [280]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0
  1]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0
  0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 1
  0]
 [1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0
  0]]
37


In [281]:
print(cv.vocabulary_) #Dictionary - Word -> Index

{'indian': 9, 'cricket': 4, 'team': 26, 'will': 32, 'wins': 34, 'world': 36, 'cup': 5, 'says': 23, 'capt': 2, 'virat': 30, 'kohli': 11, 'we': 31, 'win': 33, 'next': 15, 'lok': 13, 'sabha': 22, 'elections': 6, 'confident': 3, 'pm': 19, 'the': 27, 'nobel': 16, 'laurate': 12, 'won': 35, 'hearts': 8, 'of': 17, 'people': 18, 'movie': 14, 'raazi': 20, 'is': 10, 'an': 0, 'exciting': 7, 'spy': 24, 'thriller': 28, 'based': 1, 'upon': 29, 'real': 21, 'story': 25}


In [282]:
# Given a vector what is the sentence
import numpy as np

# One slot per vocabulary entry, rather than a hard-coded length.
vector = np.ones((len(cv.vocabulary_),))
vector[3:7] = 0

print(vector)
# inverse_transform expects a 2-D (n_samples, n_features) array, so a single
# document vector is reshaped into one row before decoding.
print(cv.inverse_transform(vector.reshape(1, -1)))
print(vectorized_corpus[0])
print(cv.inverse_transform(vectorized_corpus[0].reshape(1, -1)))

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[array(['an', 'based', 'capt', 'exciting', 'hearts', 'indian', 'is',
       'kohli', 'laurate', 'lok', 'movie', 'next', 'nobel', 'of',
       'people', 'pm', 'raazi', 'real', 'sabha', 'says', 'spy', 'story',
       'team', 'the', 'thriller', 'upon', 'virat', 'we', 'will', 'win',
       'wins', 'won', 'world'], dtype='<U9')]
[0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1]
[array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
       'virat', 'will', 'wins', 'world'], dtype='<U9')]


In [283]:
print(cv.vocabulary_['capt']) # index of word in dictionary
print(cv.vocabulary_['an'])
print(cv.vocabulary_['world'])

2
0
36


In [284]:
### Effectively reduce the size of the vector

def myTokenizer(sentence):
    """Lowercase ``sentence``, split it with ``tokenizer`` and drop the stopwords."""
    lowered = sentence.lower()
    tokens = tokenizer.tokenize(lowered)
    # Stopword removal is delegated to filter_words
    return filter_words(tokens)

myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'wins',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [285]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))
v = vc[0]
# inverse_transform expects a 2-D (n_samples, n_features) array, so the
# single document vector is reshaped into one row before decoding.
print(cv.inverse_transform(v.reshape(1, -1)))
# Switch on feature 0 for the first document to show how the decoded terms
# change; v is a view of this row, so it sees the update as well.
vc[0][0] = 1
print(vc[0])
cv.inverse_transform(v.reshape(1, -1))

[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1]
30
[array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
       'virat', 'wins', 'world'], dtype='<U9')]
[1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1]


[array(['based', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says',
        'team', 'virat', 'wins', 'world'], dtype='<U9')]

## Features in Bag of Words Model
* Unigrams
* Bigrams, Trigrams
* N-Grams (an n-gram is a contiguous sequence of n items from a given sample of text or speech.)

In [286]:
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range = (1,2)) # combine unigram and bigram
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(cv.vocabulary_)

{'indian': 16, 'cricket': 6, 'team': 45, 'wins': 55, 'world': 57, 'cup': 8, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 20, 'indian cricket': 17, 'cricket team': 7, 'team wins': 46, 'wins world': 56, 'world cup': 58, 'cup says': 9, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'win': 53, 'next': 27, 'lok': 23, 'sabha': 37, 'elections': 10, 'confident': 4, 'pm': 32, 'win next': 54, 'next lok': 28, 'lok sabha': 24, 'sabha elections': 38, 'elections says': 11, 'says confident': 41, 'confident indian': 5, 'indian pm': 18, 'nobel': 29, 'laurate': 21, 'hearts': 14, 'people': 31, 'nobel laurate': 30, 'laurate hearts': 22, 'hearts people': 15, 'movie': 25, 'raazi': 33, 'exciting': 12, 'spy': 42, 'thriller': 47, 'based': 0, 'upon': 49, 'real': 35, 'story': 44, 'movie raazi': 26, 'raazi exciting': 34, 'exciting indian': 13, 'indian spy': 19, 'spy thriller': 43, 'thriller based': 48, 'based upon': 1, 'upon real': 50, 'real story': 36}


In [287]:
print(vc)
print(vc[0].size)

[[0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1]
 [0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0
  0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1
  1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0]]
59


## Tf-idf Normalisation
* Avoid features that occur very often, because they contain less information
* Information decreases as the number of occurrences increases across different types of documents
* So we define a weight for every term - the term frequency-inverse document frequency (tf-idf) - which is lower for terms that appear in many documents

In [288]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [289]:
# Tf-idf weighted features built with the custom tokenizer; norm='l2' requests L2 normalisation of the vectors.
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,2),norm='l2') # combine unigram and bigram

# Convert the result to a dense array so the weights can be printed directly.
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.         0.2355126  0.2355126  0.         0.
  0.2355126  0.2355126  0.2355126  0.2355126  0.         0.
  0.         0.         0.         0.         0.15032464 0.2355126
  0.         0.         0.2355126  0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.18568084 0.2355126  0.
  0.         0.         0.         0.2355126  0.2355126  0.
  0.         0.         0.         0.2355126  0.2355126  0.
  0.         0.2355126  0.2355126  0.2355126  0.2355126 ]
 [0.         0.         0.         0.         0.24977372 0.24977372
  0.         0.         0.         0.         0.24977372 0.24977372
  0.         0.         0.         0.         0.15942733 0.
  0.24977372 0.         0.         0.         0.         0.24977372
  0.24977372 0.         0.         0.24977372 0.24977372 0.
  0.         0.         0.24977372 0.         0.         0.
  0.       

In [290]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 16, 'cricket': 6, 'team': 45, 'wins': 55, 'world': 57, 'cup': 8, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 20, 'indian cricket': 17, 'cricket team': 7, 'team wins': 46, 'wins world': 56, 'world cup': 58, 'cup says': 9, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'win': 53, 'next': 27, 'lok': 23, 'sabha': 37, 'elections': 10, 'confident': 4, 'pm': 32, 'win next': 54, 'next lok': 28, 'lok sabha': 24, 'sabha elections': 38, 'elections says': 11, 'says confident': 41, 'confident indian': 5, 'indian pm': 18, 'nobel': 29, 'laurate': 21, 'hearts': 14, 'people': 31, 'nobel laurate': 30, 'laurate hearts': 22, 'hearts people': 15, 'movie': 25, 'raazi': 33, 'exciting': 12, 'spy': 42, 'thriller': 47, 'based': 0, 'upon': 49, 'real': 35, 'story': 44, 'movie raazi': 26, 'raazi exciting': 34, 'exciting indian': 13, 'indian spy': 19, 'spy thriller': 43, 'thriller based': 48, 'based upon': 1, 'upon real': 50, 'real story': 36}


In [291]:
print(vectorized_corpus[0])

[0.         0.         0.2355126  0.2355126  0.         0.
 0.2355126  0.2355126  0.2355126  0.2355126  0.         0.
 0.         0.         0.         0.         0.15032464 0.2355126
 0.         0.         0.2355126  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.18568084 0.2355126  0.
 0.         0.         0.         0.2355126  0.2355126  0.
 0.         0.         0.         0.2355126  0.2355126  0.
 0.         0.2355126  0.2355126  0.2355126  0.2355126 ]


In [292]:
print(vectorized_corpus[0].size)

59
