# Basic NLP Pipeline
1. Data Collection
2. Tokenization, Stopwords, Stemming
3. Building a common vocabulary
4. Vectorizing the documents
5. Performing classification/clustering

## 1. Data Collection

In [2]:
from nltk.corpus import brown

In [3]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
data = brown.sents(categories="editorial")
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [5]:
print(len(data))

2997


## 2. Tokenization, Stopwords, Stemming

### Tokenization

In [6]:
text = "It was a very pleasant day, weather was cool and there were light showers. I went to the market to buy some fruits."
print(text)

It was a very pleasant day, weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [11]:
print(sents[0])
words = word_tokenize(sents[0].lower())
print(words)

It was a very pleasant day, weather was cool and there were light showers.
['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


### Stopwords removal

In [14]:
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
print(sw)
print(len(sw))

{'this', "shan't", 'myself', 'why', 'was', 'mustn', 'i', 'is', 'couldn', 'not', "needn't", "you'll", 'me', 'be', 'aren', 'his', 'about', "doesn't", "hadn't", "don't", "haven't", 'and', 'nor', 'as', "she's", 'who', "didn't", 'didn', 'that', 'all', "mustn't", 'itself', 'off', 'only', 'shan', 'were', "should've", 'from', 'doesn', 'more', 'than', 'theirs', 'how', 'or', 'above', 'now', "it's", 'in', 'if', 'into', 'very', 'am', 'are', 'he', 'until', 'being', "you've", 'does', 'most', 'shouldn', 'because', 'ain', 'my', 'do', 'can', 'needn', 'himself', 'for', 'again', 'what', 'wouldn', 'up', 'no', "isn't", 've', 'by', 'mightn', 'been', 'an', 'some', 'ma', "that'll", 'our', 'yourself', 'against', 'below', 'where', 'so', 'after', 'any', 'each', 'own', 'we', 'will', 're', "weren't", 'same', "won't", 'your', 'ours', 'other', 'further', 'during', 'themselves', 'doing', 's', 'whom', "mightn't", "shouldn't", 'when', 'haven', 'under', 'of', 'don', 'have', 'here', 'isn', 'over', 'a', 'but', 'him', 'wit

### Filter words from sentence

In [26]:
def filterWords(words, stopWords=None):
    """Return the tokens of `words` that are not stopwords, in their original order.

    Matching is exact (case-sensitive), so callers should normalise case first.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.
    stopWords : collection of str, optional
        Tokens to drop. When omitted, the notebook-level `sw` set is used, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    list of str
        The tokens of `words` that are absent from the stopword collection.
    """
    if stopWords is None:
        # Fall back to the notebook-wide stopword set.
        stopWords = sw
    return [w for w in words if w not in stopWords]

print(words)
usefulWords = filterWords(words)
print(usefulWords)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']
['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


### Tokenization using Regular Expression
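- Problem with the default word tokenizer: it applies fixed rules, so it can't handle custom tokenization needs (e.g. dropping numbers and punctuation)
- So we use the `RegexpTokenizer` class in NLTK, which keeps only the tokens matching a regular expression we supply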

In [16]:
from nltk.tokenize import RegexpTokenizer

In [18]:
text1 = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com ."
print(text1)
# A token is any run of ASCII letters and '@'. Everything else (digits, commas,
# periods, spaces) is not matched and is therefore dropped from the result.
# Note: '.' is not part of the character class, so the e-mail address is split
# into two tokens, 'abc@xyz' and 'com'.
regularExp = "[a-zA-Z@]+"
# This `tokenizer` object is reused by later cells in the notebook.
tokenizer = RegexpTokenizer(regularExp)
print(tokenizer.tokenize(text1))

Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com .
['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


### Stemming
- Process that reduces inflected words (verb forms, plurals) to their root form (stem)
- Preserves the meaning of the sentence while keeping the number of unique tokens small
- jumps, jumped, jumping => jump

In [29]:
text3 = "Foxes love to make jumps.The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall"
wordsList = tokenizer.tokenize(text3.lower())
print(wordsList)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [30]:
wordsList = filterWords(wordsList)
print(wordsList)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


### Stemmers
- SnowballStemmer (multilingual)
- PorterStemmer
- LancasterStemmer

In [32]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer
ss = SnowballStemmer('english')
ssFrench = SnowballStemmer('french')
ps = PorterStemmer()
ls = LancasterStemmer()

In [33]:
# PorterStemmer
print(ps.stem("jumped"))
print(ps.stem("jumping"))
print(ps.stem("lovely"))
print(ps.stem("awesome"))

jump
jump
love
awesom


In [34]:
# PorterStemmer vs. LancasterStemmer on the same word
print(ps.stem("teenager"))  # Porter
print(ls.stem("teenager"))  # Lancaster

teenag
teen


In [37]:
# SnowballStemmer
print(ss.stem('lovely'))
print(ss.stem('teenager'))
# French
print(ssFrench.stem('courais'))
print(ssFrench.stem('courir'))

love
teenag
cour
cour


### Lemmatization
Similar to stemming, but instead of stripping suffixes it maps each word to its dictionary form (lemma) using WordNet.

In [42]:
from nltk.stem import WordNetLemmatizer
# lemmatize() treats the word as a noun unless a part of speech is given
# (pos defaults to "n"), and as a noun "crying" comes back unchanged.
# Tagging it as a verb yields the lemma "cry".
l = WordNetLemmatizer()
l.lemmatize("crying", pos="v")

'cry'

## 3. Building a common vocabulary (based on the Bag of Words model)

In [43]:
corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the hearts of the people',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
cv = CountVectorizer()

In [53]:
# Learn the vocabulary from the corpus and count every word per document.
# fit_transform() returns a sparse matrix; toarray() expands it into a dense
# NumPy array with one row per document and one column per vocabulary word.
vectorizedCorpus = cv.fit_transform(corpus).toarray()
print(vectorizedCorpus)
# Number of columns == size of the learned vocabulary
print(vectorizedCorpus.shape[1])

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
  2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1
  1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0
  0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0 0
  0 0 0 0 0]]
42


In [51]:
# Map word->index
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [55]:
# Given a vector find sentence
import numpy as np

# Size the vector from the fitted vocabulary instead of hard-coding its length,
# so this cell keeps working if the corpus (and hence the vocabulary) changes.
vector = np.ones((len(cv.vocabulary_),))
# Switch off the words stored at column indices 3, 4, 5 and 6.
vector[3:7] = 0
print(vector)

[ 1.  1.  1.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.]


In [58]:
# inverse_transform() works on 2-D (n_samples, n_features) input, so present the
# single vector as a one-row matrix; recent scikit-learn releases reject 1-D input.
# The result is computed once and reused for both prints.
recoveredWords = cv.inverse_transform(vector.reshape(1, -1))
print(recoveredWords)
# Length = 42 - (7-3) = 38
print(len(recoveredWords[0]))

[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok', 'movie',
       'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real', 'sabha',
       'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller', 'upon',
       'virat', 'we', 'will', 'win', 'wins', 'won', 'world'],
      dtype='<U9')]
38


In [60]:
print(cv.vocabulary_['an'])
print(cv.vocabulary_['at'])

0
1


In [62]:
# Effectively reduce the size of vector

def myTokenizer(sentence):
    """Lower-case `sentence`, split it with the notebook's regexp `tokenizer`,
    and return only the tokens that `filterWords` keeps."""
    return filterWords(tokenizer.tokenize(sentence.lower()))

print(myTokenizer(corpus[0]))

['indian', 'cricket', 'team', 'wins', 'world', 'cup', 'says', 'capt', 'virat', 'kohli', 'world', 'cup', 'held', 'sri', 'lanka']


In [64]:
cv = CountVectorizer(tokenizer=myTokenizer)
newVectorizedCorpus = cv.fit_transform(corpus)
vc = newVectorizedCorpus.toarray()
print(len(vc[0]))
print(vc)

33
[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [65]:
# Pass a one-row 2-D slice (shape (1, n_features)) rather than the 1-D row vc[0]:
# inverse_transform() works on (n_samples, n_features) input.
print(cv.inverse_transform(vc[:1]))

[array(['capt', 'cricket', 'cup', 'held', 'indian', 'kohli', 'lanka',
       'says', 'sri', 'team', 'virat', 'wins', 'world'],
      dtype='<U9')]


## Features in Bag of Words Model
- Unigrams (Done above)
- Bigrams, Trigrams
- N-Grams

In [66]:
# Use unigrams, bigrams and trigrams (n from 1 to 3) as features
cv = CountVectorizer(tokenizer=myTokenizer, ngram_range=(1,3))
nGramVectorizedCorpus = cv.fit_transform(corpus)
vc = nGramVectorizedCorpus.toarray()
# The feature vector grows: every distinct 1-, 2- and 3-gram found in the corpus
# becomes its own column. The total depends on the documents themselves (how many
# n-grams each one yields and how many of them repeat), so it cannot be derived
# from the unigram count alone. For this corpus it is 96, up from 33 with
# unigrams only.
print(len(vc[0]))
print(vc)

96
[[0 0 0 1 1 1 0 0 0 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0
  1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 2 2 1 1]
 [0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
  0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0
  0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1
  0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### Tf-idf Normalisation
- Avoid features that occur very often, because they carry less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another measure, tf-idf (term frequency–inverse document frequency), which associates a weight with every term

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
tfIdfVectorizer = TfidfVectorizer(tokenizer=myTokenizer, ngram_range=(1,2), norm='l2')

In [70]:
tfIdfVectorizedCorpus = tfIdfVectorizer.fit_transform(corpus).toarray()
print(tfIdfVectorizedCorpus)

[[ 0.          0.          0.17142549  0.17142549  0.          0.
   0.17142549  0.17142549  0.34285097  0.17142549  0.17142549  0.          0.
   0.          0.          0.          0.          0.17142549  0.17142549
   0.10941867  0.17142549  0.          0.          0.17142549  0.17142549
   0.17142549  0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.13515382
   0.17142549  0.          0.          0.          0.17142549  0.17142549
   0.          0.17142549  0.17142549  0.          0.          0.          0.
   0.17142549  0.17142549  0.          0.          0.17142549  0.17142549
   0.34285097  0.34285097]
 [ 0.          0.          0.          0.          0.24977372  0.24977372
   0.          0.          0.          0.          0.          0.24977372
   0.24977372  0.          0.          0.          0.          0.          0.

In [71]:
print(tfIdfVectorizer.vocabulary_)

{'indian': 19, 'cricket': 6, 'team': 52, 'wins': 62, 'world': 64, 'cup': 8, 'says': 44, 'capt': 2, 'virat': 58, 'kohli': 23, 'held': 17, 'sri': 49, 'lanka': 25, 'indian cricket': 20, 'cricket team': 7, 'team wins': 53, 'wins world': 63, 'world cup': 65, 'cup says': 10, 'says capt': 45, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 24, 'cup held': 9, 'held sri': 18, 'sri lanka': 50, 'win': 60, 'next': 32, 'lok': 28, 'sabha': 42, 'elections': 11, 'confident': 4, 'pm': 37, 'win next': 61, 'next lok': 33, 'lok sabha': 29, 'sabha elections': 43, 'elections says': 12, 'says confident': 46, 'confident indian': 5, 'indian pm': 21, 'nobel': 34, 'laurate': 26, 'hearts': 15, 'people': 36, 'nobel laurate': 35, 'laurate hearts': 27, 'hearts people': 16, 'movie': 30, 'raazi': 38, 'exciting': 13, 'spy': 47, 'thriller': 54, 'based': 0, 'upon': 56, 'real': 40, 'story': 51, 'movie raazi': 31, 'raazi exciting': 39, 'exciting indian': 14, 'indian spy': 22, 'spy thriller': 48, 'thriller based': 55, 'b