In [1]:
!pip install nltk



### Dataset

In [1]:
import nltk

In [7]:
# Download only the Punkt sentence-tokenizer models that sent_tokenize /
# word_tokenize rely on later in this notebook. A bare nltk.download() opens
# the interactive downloader, which blocks "Restart Kernel -> Run All".
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases look for this resource instead of 'punkt'

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [8]:
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/mohit/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [2]:
from nltk.corpus import brown

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
religion_data = brown.sents(categories='religion')

In [5]:
" ".join(religion_data[10])

'Demons , fairies , angels , and a host of other spiritual beings were as much a part of the experiential world of western man as were rocks and trees and stars .'

In [6]:
all_words= brown.words(categories='religion')

In [7]:
len(all_words)

39399

## Pipeline for NLP Task

- Data Collection (Raw Data)
- Data Cleaning 
 - Removing bad words.
 - Tokenization, Stemming/Lemmatization, Stopword removal
- Building a Common Vocabulary
- Vectorizing the documents (data)
- Perform Classification

Dataset - corpus (plural: corpora) <br>
1 Datapoint - document

In [9]:
document = """The movie was good. It was a pleasent day.
I went to market to buy some fruits...."""

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [12]:
sent_tokenize(document)

['The movie was good.',
 'It was a pleasent day.',
 'I went to market to buy some fruits....']

In [15]:
word_tokenize(document)

['The',
 'movie',
 'was',
 'good',
 '.',
 'It',
 'was',
 'a',
 'pleasent',
 'day',
 '.',
 'I',
 'went',
 'to',
 'market',
 'to',
 'buy',
 'some',
 'fruits',
 '....']

In [16]:
# one time download
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords

In [26]:
sw = set(stopwords.words('english'))

In [33]:
def remove_stopwords(text):
    """Return ``text`` lower-cased with every stopword token removed.

    The input string is lower-cased and split with ``word_tokenize``; each
    token that is a member of the module-level ``sw`` collection is dropped,
    and the remaining tokens are re-joined with single spaces. Tokens that
    are not in ``sw`` (punctuation included) are left in place.
    """
    tokens = word_tokenize(text.lower())
    kept = filter(lambda token: token not in sw, tokens)
    return " ".join(kept)

In [35]:
document

'The movie was good. It was a pleasent day.\nI went to market to buy some fruits....'

In [34]:
remove_stopwords(document)

'movie good . pleasent day . went market buy fruits ....'

In [37]:
"not" in sw

True

In [38]:
# Stemming


In [39]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer

In [40]:
stemmer = SnowballStemmer(language='english')

In [41]:
stemmer.stem("jumping")

'jump'

In [42]:
stemmer.stem('jumps')

'jump'

In [43]:
stemmer.stem('jumped')

'jump'

In [47]:
stemmer.stem('loves')

'love'

In [50]:
stemmer.stem('making')

'make'

#### Tokenization with Regex

In [53]:
import re

In [51]:
sentence = "Send all%$^&*()*&^%$#%^ the 50 documents >{})@!# related to chapter 1,2,3,5 to mohit@gmail.com."

In [54]:
from nltk.tokenize import RegexpTokenizer

In [62]:
# A token is one or more consecutive letters, digits, '@' or '.' characters;
# any other character is skipped and therefore acts as a separator.
my_tokenizer = RegexpTokenizer("[a-zA-Z0-9@.]+")

In [63]:
" ".join(my_tokenizer.tokenize(sentence))

'Send all the 50 documents @ related to chapter 1 2 3 5 to mohit@gmail.com.'

### Creating Vocabulary/ Vectorizing Documents - BoW

In [65]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [66]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
# ngram_range=(1,1): the vocabulary is built from unigrams (single words) only.
cv = CountVectorizer(ngram_range=(1,1))

In [117]:
vectorised_corpus = cv.fit_transform(corpus)

In [118]:
vectorised_corpus

<4x42 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>

In [119]:
# print(vectorised_corpus)

In [120]:
vectorised_corpus.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [121]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [122]:
# Vectorise a new sentence with the already-fitted vocabulary. Only words
# present in cv.vocabulary_ ("virat", "kohli", "is") are counted; the other
# words ("system", "very", "huge") are not in the vocabulary and are ignored.
s = ["virat kohli system is very huge."]
cv.transform(s).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

### nGrams
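- unigram: single tokens, e.g. "world"
- bigram: pairs of adjacent tokens, e.g. "world cup"
- trigram: triples of adjacent tokens, e.g. "world cup will"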

### TF-IDF

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [124]:
tfidf = TfidfVectorizer()

In [126]:
# NOTE(review): despite its name, this variable holds the value returned by
# fit_transform (the matrix that .toarray() is called on in a later cell),
# not the vectoriser itself -- the vectoriser object is `tfidf`.
tfidf_vectoriser = tfidf.fit_transform(corpus)

In [129]:
# print(tfidf_vectoriser)

In [130]:
tfidf_vectoriser.toarray()

array([[0.        , 0.21074652, 0.        , 0.21074652, 0.21074652,
        0.        , 0.21074652, 0.42149305, 0.        , 0.        ,
        0.        , 0.21074652, 0.13451678, 0.        , 0.21074652,
        0.21074652, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.16615498, 0.        , 0.21074652,
        0.        , 0.21074652, 0.        , 0.        , 0.        ,
        0.21074652, 0.        , 0.33230996, 0.        , 0.21074652,
        0.        , 0.42149305],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32190145, 0.        , 0.        , 0.32190145, 0.        ,
        0.        , 0.        , 0.20546553, 0.        , 0.        ,
        0.        , 0.        , 0.32190145, 0.        , 0.32190145,
        0.        , 0.        , 0.        , 0.32190145, 0.        ,
        0.        , 0.32190145, 0.2537908 , 0.        , 0.        ,
        0.     