# Introduction to Natural Language Processing

### Install NLTK
```pip install nltk```

In [2]:
import nltk

In [9]:
# Fetch only the NLTK resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which blocks "Restart Kernel -> Run All".
#   brown             -> nltk.corpus.brown
#   punkt / punkt_tab -> sent_tokenize / word_tokenize (name depends on NLTK version)
#   stopwords         -> nltk.corpus.stopwords
#   wordnet           -> WordNetLemmatizer
for resource in ('brown', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Corpus- A large collection of text
from nltk.corpus import brown

In [5]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
print(len(brown.categories()))

15


In [8]:
#let say i want sentences from adventure category
data = brown.sents(categories='adventure')

In [9]:
data

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [10]:
len(data) #number of sentences present in adventure cat

4637

In [12]:
print(data[0])

['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.']


In [13]:
' '.join(data[1])

'He was well rid of her .'

# Bag of Words Pipeline
- Get the Data/Corpus
- Tokenisation (break the document into sentences, and further break the sentences into words)
- Stopword Removal
- Stemming
- Building a Vocab
- Vectorization
- Classification

### Tokenisation & Stopword Removal

In [17]:
document = """It was a very pleasant day. The weather was cool and there were light showers.
I went to the market to but some fruits."""

sentence = "Send all the documents related to chapters 1,2,3 at vivekkrrai.bxr@gmail.com"

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [13]:
sents = sent_tokenize(document) #break a document into list of sentences
print(sents)

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to but some fruits.']


In [14]:
len(sents)

3

In [15]:
sents[0]

'It was a very pleasant day.'

In [18]:
sentence.split() 

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'vivekkrrai.bxr@gmail.com']

In [19]:
# .split() cannot split 1,2,3

In [20]:
#now let us use word tokeniser
words = word_tokenize(sentence)
print(words)

['Send', 'all', 'the', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'vivekkrrai.bxr', '@', 'gmail.com']


In [21]:
# We can also see that word_tokenize splits at special characters: the e-mail address is broken around '@'

### Stopword removal

In [29]:
from nltk.corpus import stopwords

In [30]:
sw = set(stopwords.words('english'))

In [31]:
print(sw)

{'themselves', 'these', 'itself', 'further', 'been', 'only', 'but', 'all', 'too', 'do', "needn't", 'her', 'over', "it's", "should've", 'was', 'yourselves', 'from', 'herself', 'out', 'for', 'in', 'there', 'each', "doesn't", 'yours', 'needn', 'if', "you've", 'my', 'will', 'me', 'we', 'during', 's', 'nor', 'above', 'himself', 'no', 'of', 'being', 'to', 'under', "wouldn't", 'it', 'below', 'both', "you're", 'down', 'while', 'more', 'won', 'into', 'up', 'them', 'just', 'didn', 'll', 'were', 'this', 'don', 'ma', 'theirs', 'him', 'ourselves', 'that', 'm', 'ain', 'between', 'through', 'again', 'ours', 'its', 'had', 'does', 'whom', 'because', 'here', 'as', 'than', "mightn't", "hasn't", 'be', 'has', 'his', 'can', 'd', 'how', 'doing', 'myself', "couldn't", 're', 'against', "aren't", 'the', "won't", 'where', 'their', 'off', "weren't", 'by', 't', 'mustn', 'she', 'at', 'now', 'aren', 'and', 'on', 'weren', "you'll", 'very', 'who', 'shan', "mustn't", 'an', "that'll", 'a', "didn't", 'not', 'why', 'he', 

In [32]:
#these are the words that can be removed

In [33]:
def remove_stopwords(text, stopwords):
    """Filter out every token of ``text`` that appears in ``stopwords``.

    Parameters
    ----------
    text : iterable
        Tokens to filter, visited in order.
    stopwords : container
        Collection checked with ``in``. The comparison is exact, so a
        lowercase stopword list does not remove capitalised tokens.

    Returns
    -------
    list
        The tokens that are not in ``stopwords``, in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [34]:
text = "I am not bothered about her very much.".split()
useful_text = remove_stopwords(text, sw)
print(useful_text)

['I', 'bothered', 'much.']


In [35]:
'not' in sw

True

### Tokenisation using Regular Expressions

In [36]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at vivekkrrai.bxr@gmail.com"

In [37]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'vivekkrrai.bxr@gmail.com']

In [38]:
from nltk.tokenize import RegexpTokenizer

In [39]:
# The pattern matches runs of letters, '@' and '.'; the '+' means "one or more"
# of those characters, so each match is a whole word. Digits and commas are
# not in the character class and are therefore dropped.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [40]:
useful_text #ALl numbers are removed

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'vivekkrrai.bxr@gmail.com']

## Stemming
- Process that transforms words (verbs, plurals) into their radical (root) form
- Preserves the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump ==> jump

In [41]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the
lovely dog from a 6ft high wall"""


In [42]:
# Snowball Stemmer, Porter Stemmer, Lancaster Stemmer
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [43]:
ps = PorterStemmer()

In [44]:
ps.stem('jumpping')

'jump'

In [45]:
ps.stem('jumps')

'jump'

In [46]:
ps.stem('lovely')

'love'

In [47]:
ps.stem('loving')

'love'

In [48]:
# It removes suffixes such as -s, -ing, -ly from the end of a word

In [49]:
# Snowball Stemmer - a multilingual stemmer that works for many languages
ss = SnowballStemmer('english')

In [50]:
ss.stem('lovely')

'love'

In [51]:
ss.stem('jumping')

'jump'

In [52]:
## Lemmatization
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

## Building a Vocab and Vectorisation

In [53]:
#Sample corpus contains 4 documents, each document can have 1 or more sentences

corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
cv = CountVectorizer()

In [56]:
vectorized_corpus = cv.fit_transform(corpus)

In [57]:
vectorized_corpus = vectorized_corpus.toarray()

In [58]:
print(len(vectorized_corpus[0]))
print(vectorized_corpus[0])

42
[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [59]:
# According to this mapping the word 'cricket' has index 6, which is why
# position 6 of vectorized_corpus[0] (the vector of the 1st doc) holds a 1.
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [60]:
len(cv.vocabulary_.keys())


42

We can see that each row of vectorized_corpus has the same number of elements as there are
key-value pairs in the vocabulary. Each position holds the frequency of the corresponding word; if a word from the entire data is
not present in the 1st doc, its entry in that vector is 0.
For example, 'cup' has frequency 2 because it occurs 2 times in the 1st doc, and 'an' has frequency 0 because it doesn't occur in the 1st doc.

In [61]:
# to convert this dict mapping back to the sentence
#reverse mapping
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [62]:
# inverse_transform expects a 2-D document-term matrix (one row per document),
# so present the single count vector as a 1-row matrix; passing the bare 1-D
# array raises ValueError in current scikit-learn releases.
s = cv.inverse_transform(numbers.reshape(1, -1))
s

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

## Vectorisation with Stopword removal

In [75]:
def myTokenizer(document):
    """Tokenise ``document`` into lowercase tokens with stopwords removed.

    The text is lowercased, split by the notebook-level ``tokenizer``, and
    the resulting tokens are filtered through ``remove_stopwords`` using the
    notebook-level stopword set ``sw``.
    """
    lowered = document.lower()  # lowercase first so tokens match the lowercase stopword list
    tokens = tokenizer.tokenize(lowered)
    return remove_stopwords(tokens, sw)

In [76]:
myTokenizer(sentence)
#print(sentence)

['send', 'documents', 'related', 'chapters', 'vivekkrrai.bxr@gmail.com']

In [77]:
cv = CountVectorizer(tokenizer = myTokenizer)

In [78]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [79]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [80]:
print(len(vectorized_corpus[0]))

33


In [81]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

We can see that not all words are present. Some of them were removed during stopword removal.

In [82]:
# For test data

test_corpus = [
    'Indian cricket rocks!',
]
# We do not call fit_transform on test data: that would refit the vectorizer and replace the vocabulary learned from the training corpus.
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

So for Train data we call ```fit_transform()``` but for test data we only call ```transform()```

## More ways to create Features
- Unigram - every word as a feature
- Bigrams - two consecutive words can be treated as a single feature
- Trigrams - three consecutive ....
- n-grams - and so on
- TF-IDF Normalisation

In [88]:
sent_1 = ["this is good movie"]
# sent_2 = ["this is good move but actor is not present"]
sent_3 = ["this is not good movie"]

In [101]:
cv = CountVectorizer(ngram_range=(1,3))

```ngram_range=(1,1)``` is for Unigram, ```ngram_range=(2,2)``` is for Bigrams
```ngram_range=(3,3)``` is for Trigrams and ```ngram_range=(1,3)``` means combinations of Unigram, Bigram and Trigrams

In [102]:
docs = [sent_1[0], sent_3[0]]
cv.fit_transform(docs).toarray()

array([[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]], dtype=int64)

In [103]:
cv.vocabulary_

{'this': 11,
 'is': 2,
 'good': 0,
 'movie': 7,
 'this is': 12,
 'is good': 3,
 'good movie': 1,
 'this is good': 13,
 'is good movie': 4,
 'not': 8,
 'is not': 5,
 'not good': 9,
 'this is not': 14,
 'is not good': 6,
 'not good movie': 10}

### Tf-idf Normalisation
- Avoid features that occur very often, because they contain less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term - inverse document frequency (idf) - which associates a weight with every term

In [104]:

sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [106]:
tfidf = TfidfVectorizer()

In [112]:
vc = tfidf.fit_transform(corpus)

In [113]:
print(vc)

  (0, 2)	0.4633342717458061
  (0, 0)	0.4633342717458061
  (0, 1)	0.5966272352795762
  (0, 4)	0.4633342717458061
  (1, 5)	0.6990303272568005
  (1, 2)	0.4128585720620119
  (1, 0)	0.4128585720620119
  (1, 4)	0.4128585720620119
  (2, 3)	0.6172273175654565
  (2, 2)	0.3645443967613799
  (2, 0)	0.3645443967613799
  (2, 1)	0.4694172843223779
  (2, 4)	0.3645443967613799


In [115]:
vc = vc.toarray()
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [114]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

- We can clearly see that 'not' has a higher tf-idf weight than the other words in the 3rd document because it appears in only one document.
- The word 'is' has a higher tf-idf value than 'this', 'good' and 'movie' because it occurs in only 2 of the 3 documents, while 'this', 'good' and 'movie' appear in all 3.
- Basically, the words which appear in more documents are given less weightage.