In [1]:
!pip install nltk

You should consider upgrading via the 'pip install --upgrade pip' command.


## Get the Data

In [133]:
import nltk
import numpy as np

In [2]:
# Request the Brown corpus by name. A bare nltk.download() opens the
# interactive downloader, which blocks an unattended Restart & Run All.
nltk.download('brown')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
from nltk.corpus import brown

In [5]:
# List the category labels available in the Brown corpus.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
# Sentences of the 'news' category; each item is a sequence of string tokens.
data = brown.sents(categories='news')

In [25]:
# Rebuild the first sentence as plain text by joining its tokens with spaces.
' '.join(data[0])

"The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place ."

In [27]:
# Flat sequence of word tokens from the 'news' category.
news_words = brown.words(categories='news')

In [28]:
# Total number of tokens in the news subset.
len(news_words)

100554

# NLP Pipeline
 - Data Collection
 - Tokenization, Stopword removal, stemming
 - Building a Vocabulary (BoW)
 - Vectorization of all documents (corpus)
 - Perform classification

In [29]:
# Toy two-sentence input used to show the limits of whitespace tokenization.
sen = "I love my country. I love to play cricket"

In [31]:
# Whitespace splitting leaves punctuation attached to words (e.g. 'country.').
sen.split()

['I', 'love', 'my', 'country.', 'I', 'love', 'to', 'play', 'cricket']

In [34]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [45]:
# Fetch the 'punkt' package - presumably the models behind
# sent_tokenize/word_tokenize (confirm against the NLTK docs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohituniyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
# Three-sentence sample text used by the tokenization and stopword examples.
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

In [38]:
# Word-level tokens; punctuation marks come out as separate tokens.
print(word_tokenize(document))

['It', 'was', 'a', 'very', 'pleasant', 'day', '.', 'The', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'fruits', '.']


In [39]:
# Split the sample text into its sentences.
sent_tokenize(document)

['It was a very pleasant day.',
 'The weather was cool and there were light showers.',
 'I went to the market to buy some fruits.']

In [41]:
# Naive split on '.' for comparison: pieces keep leading whitespace/newlines
# and the final element is an empty string.
document.split(".")

['It was a very pleasant day',
 ' The weather was cool and there were light showers',
 ' \nI went to the market to buy some fruits',
 '']

In [46]:
# Example containing an abbreviation ("Mr.") whose period is not a sentence end.
doc = "Mr. Modi is a good man. He is indian."

In [47]:
# The sentence tokenizer keeps "Mr. Modi ..." together as one sentence.
sent_tokenize(doc)

['Mr. Modi is a good man.', 'He is indian.']

In [50]:
# Splitting on '.' breaks right after "Mr", which is why it is not a usable
# sentence splitter.
doc.split(".")

['Mr', ' Modi is a good man', ' He is indian', '']

In [51]:
from nltk.corpus import stopwords

In [53]:
# Fetch the 'stopwords' package read through nltk.corpus.stopwords below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohituniyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
# English stopword list (a plain list: it is mutated with .remove() below).
sw = stopwords.words('english')

In [68]:
# Rebuild the list from the corpus instead of calling sw.remove('not'):
# remove() raises ValueError when this cell is executed a second time, whereas
# this form gives the same result on every run. 'not' is kept as a real token -
# presumably because negation matters for the later classification step (confirm).
sw = [word for word in stopwords.words('english') if word != 'not']

In [69]:
# Sanity check: 'not' must no longer be treated as a stopword.
'not' in sw

False

In [73]:
# Lower-case and tokenize the text, drop every token found in the stopword
# list, then stitch the survivors back into a single string.
tokens = word_tokenize(document.lower())
kept_tokens = [token for token in tokens if token not in sw]
' '.join(kept_tokens)

'pleasant day . weather cool light showers . went market buy fruits .'

In [74]:
from nltk.tokenize import RegexpTokenizer

In [75]:
# Tokens are runs of lowercase ASCII letters only, so input must be
# lower-cased first; digits, symbols and punctuation act as separators.
my_regex_tokenizer = RegexpTokenizer(pattern="[a-z]+")

In [78]:
# Noisy sample: digits, symbols, repeated punctuation and an e-mail address.
abc = "Foxes love to make jumps. The quick brown fox 76989i7yy 876908 %^^%$&*** was seen jumping over the lovely dog from a 6ft feet,,  high wall...... mohituniyal@gmail.com"

In [79]:
# Show the raw string before cleaning.
abc

'Foxes love to make jumps. The quick brown fox 76989i7yy 876908 %^^%$&*** was seen jumping over the lovely dog from a 6ft feet,,  high wall...... mohituniyal@gmail.com'

In [81]:
# The tokenizer pattern only matches a-z, so normalise the case first and then
# re-join the extracted letter runs with single spaces.
lowered = abc.lower()
letter_runs = my_regex_tokenizer.tokenize(lowered)
" ".join(letter_runs)

'foxes love to make jumps the quick brown fox i yy was seen jumping over the lovely dog from a ft feet high wall mohituniyal gmail com'

In [82]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer

In [84]:
# Snowball stemmer configured for English.
sb = SnowballStemmer(language='english')

In [85]:
# Suffix stripping: 'lovely' is reduced to 'love'.
sb.stem('lovely')

'love'

In [86]:
# A different inflection of the same word maps to the same stem.
sb.stem("loved")

'love'

In [87]:
# Irregular forms are not handled: 'ran' comes back unchanged, not as 'run'.
sb.stem('ran')

'ran'

In [96]:
# Stems need not be dictionary words: 'movie' becomes 'movi'.
sb.stem('movie')

'movi'

### Bag of Words (BoW)

In [101]:
# Four short example documents used for the bag-of-words and TF-IDF demos.
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'Former Indian president APJ Abdul Kalam won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [108]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
vectorized_corpus = cv.fit_transform(corpus)

In [109]:
# Sparse matrix summary: one row per document, one column per vocabulary term.
vectorized_corpus

<4x45 sparse matrix of type '<class 'numpy.int64'>'
	with 51 stored elements in Compressed Sparse Row format>

In [110]:
# Printing the sparse matrix lists "(row, column)  count" for each stored entry.
print(vectorized_corpus)

  (0, 15)	1
  (0, 8)	1
  (0, 34)	1
  (0, 40)	2
  (0, 42)	1
  (0, 44)	2
  (0, 9)	2
  (0, 30)	1
  (0, 6)	1
  (0, 38)	1
  (0, 18)	1
  (0, 5)	1
  (0, 14)	1
  (0, 3)	1
  (0, 32)	1
  (0, 19)	1
  (1, 15)	1
  (1, 40)	1
  (1, 30)	1
  (1, 39)	1
  (1, 41)	1
  (1, 22)	1
  (1, 20)	1
  (1, 29)	1
  (1, 10)	1
  :	:
  (1, 25)	1
  (2, 15)	1
  (2, 12)	1
  (2, 26)	1
  (2, 2)	1
  (2, 0)	1
  (2, 17)	1
  (2, 43)	1
  (2, 35)	2
  (2, 13)	1
  (2, 23)	1
  (2, 24)	1
  (3, 15)	1
  (3, 35)	1
  (3, 21)	1
  (3, 27)	1
  (3, 16)	1
  (3, 1)	1
  (3, 11)	1
  (3, 31)	1
  (3, 36)	1
  (3, 4)	1
  (3, 37)	1
  (3, 28)	1
  (3, 33)	1


In [111]:
# (number of documents, vocabulary size)
vectorized_corpus.shape

(4, 45)

In [118]:
# Dense copy of the count matrix for inspection; row 1 is the count vector of
# the second document.
vc_dense = vectorized_corpus.toarray()
vc_dense[1]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0])

In [119]:
# Number of explicitly stored entries in the sparse matrix.
vectorized_corpus.nnz

51

In [126]:
# Mapping from each learned term to its column index in the count matrix.
print(cv.vocabulary_)

{'indian': 15, 'cricket': 8, 'team': 34, 'will': 40, 'wins': 42, 'world': 44, 'cup': 9, 'says': 30, 'capt': 6, 'virat': 38, 'kohli': 18, 'be': 5, 'held': 14, 'at': 3, 'sri': 32, 'lanka': 19, 'we': 39, 'win': 41, 'next': 22, 'lok': 20, 'sabha': 29, 'elections': 10, 'confident': 7, 'pm': 25, 'former': 12, 'president': 26, 'apj': 2, 'abdul': 0, 'kalam': 17, 'won': 43, 'the': 35, 'hearts': 13, 'of': 23, 'people': 24, 'movie': 21, 'raazi': 27, 'is': 16, 'an': 1, 'exciting': 11, 'spy': 31, 'thriller': 36, 'based': 4, 'upon': 37, 'real': 28, 'story': 33}


In [130]:
# Count vector of the third document (1-D row).
vc_dense[2]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1,
       0])

In [129]:
# inverse_transform expects a 2-D (n_samples, n_features) input. The 2:3 slice
# keeps the third document as a 1 x n_features array; the 1-D vc_dense[2] is
# rejected by the input validation in current scikit-learn releases.
cv.inverse_transform(vc_dense[2:3])

[array(['abdul', 'apj', 'former', 'hearts', 'indian', 'kalam', 'of',
        'people', 'president', 'the', 'won'], dtype='<U9')]

In [131]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when available and fall back for old installs;
# the result is converted to a list so the displayed output keeps the same form.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
feature_names

['abdul',
 'an',
 'apj',
 'at',
 'based',
 'be',
 'capt',
 'confident',
 'cricket',
 'cup',
 'elections',
 'exciting',
 'former',
 'hearts',
 'held',
 'indian',
 'is',
 'kalam',
 'kohli',
 'lanka',
 'lok',
 'movie',
 'next',
 'of',
 'people',
 'pm',
 'president',
 'raazi',
 'real',
 'sabha',
 'says',
 'spy',
 'sri',
 'story',
 'team',
 'the',
 'thriller',
 'upon',
 'virat',
 'we',
 'will',
 'win',
 'wins',
 'won',
 'world']

In [141]:
# Unseen text wrapped in a list (one document). Most of its words are not in
# the fitted vocabulary.
my_text = ["my name is mohit, Indian president is a good person. Virat is Captian"]

In [143]:
# Vectorize with the already-fitted vocabulary: only known terms are counted,
# words not seen during fitting contribute nothing.
cv.transform(my_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0]])

### Other ways of creating features

In [147]:
# Unigrams and bigrams, with the vocabulary capped at 40 features.
# NOTE(review): this rebinds `cv`, replacing the unigram vectorizer fitted
# earlier; any earlier `cv` cell re-run after this one will silently use this
# vectorizer instead. Consider a distinct name.
cv = CountVectorizer(ngram_range=(1,2), max_features=40)

In [148]:
# Fit the n-gram vectorizer; the result is only displayed, not stored.
cv.fit_transform(corpus)

<4x40 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [149]:
# The vocabulary now contains two-word terms (e.g. 'world cup') next to single words.
cv.vocabulary_

{'indian': 1,
 'team': 20,
 'will': 28,
 'wins': 34,
 'world': 38,
 'cup': 0,
 'says': 13,
 'virat': 24,
 'sri': 18,
 'will wins': 31,
 'wins world': 35,
 'world cup': 39,
 'says capt': 14,
 'virat kohli': 25,
 'will be': 29,
 'sri lanka': 19,
 'we': 26,
 'win': 32,
 'sabha': 11,
 'pm': 4,
 'we will': 27,
 'will win': 30,
 'win next': 33,
 'sabha elections': 12,
 'says confident': 15,
 'president': 5,
 'won': 36,
 'the': 21,
 'people': 3,
 'president apj': 6,
 'won the': 37,
 'of the': 2,
 'raazi': 7,
 'spy': 16,
 'real': 9,
 'raazi is': 8,
 'spy thriller': 17,
 'thriller based': 22,
 'upon real': 23,
 'real story': 10}

# TF-IDF Normalisation

In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [151]:
# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

In [153]:
# Learn the vocabulary from the corpus and build the weighted document-term matrix.
vc = tfidf.fit_transform(corpus)

In [157]:
# Each stored entry is now a float weight rather than an integer count.
print(vc)

  (0, 19)	0.21138162089799728
  (0, 32)	0.21138162089799728
  (0, 3)	0.21138162089799728
  (0, 14)	0.21138162089799728
  (0, 5)	0.21138162089799728
  (0, 18)	0.21138162089799728
  (0, 38)	0.21138162089799728
  (0, 6)	0.21138162089799728
  (0, 30)	0.1666556978718695
  (0, 9)	0.42276324179599456
  (0, 44)	0.42276324179599456
  (0, 42)	0.21138162089799728
  (0, 40)	0.333311395743739
  (0, 34)	0.21138162089799728
  (0, 8)	0.21138162089799728
  (0, 15)	0.11030769881732065
  (1, 25)	0.32417842259348545
  (1, 7)	0.32417842259348545
  (1, 10)	0.32417842259348545
  (1, 29)	0.32417842259348545
  (1, 20)	0.32417842259348545
  (1, 22)	0.32417842259348545
  (1, 41)	0.32417842259348545
  (1, 39)	0.32417842259348545
  (1, 30)	0.255585991926846
  :	:
  (1, 15)	0.16916974924594824
  (2, 24)	0.29162217443775823
  (2, 23)	0.29162217443775823
  (2, 13)	0.29162217443775823
  (2, 35)	0.4598365438714178
  (2, 43)	0.29162217443775823
  (2, 17)	0.29162217443775823
  (2, 0)	0.29162217443775823
  (2, 2)	0.291622

In [155]:
# Term -> column index mapping learned by the TF-IDF vectorizer; the indices
# match those of the default CountVectorizer vocabulary printed earlier.
tfidf.vocabulary_

{'indian': 15,
 'cricket': 8,
 'team': 34,
 'will': 40,
 'wins': 42,
 'world': 44,
 'cup': 9,
 'says': 30,
 'capt': 6,
 'virat': 38,
 'kohli': 18,
 'be': 5,
 'held': 14,
 'at': 3,
 'sri': 32,
 'lanka': 19,
 'we': 39,
 'win': 41,
 'next': 22,
 'lok': 20,
 'sabha': 29,
 'elections': 10,
 'confident': 7,
 'pm': 25,
 'former': 12,
 'president': 26,
 'apj': 2,
 'abdul': 0,
 'kalam': 17,
 'won': 43,
 'the': 35,
 'hearts': 13,
 'of': 23,
 'people': 24,
 'movie': 21,
 'raazi': 27,
 'is': 16,
 'an': 1,
 'exciting': 11,
 'spy': 31,
 'thriller': 36,
 'based': 4,
 'upon': 37,
 'real': 28,
 'story': 33}