#### Natural language preprocessing pipeline using the sklearn and custum code (Model is called Bag of Words)
- Data import (In our case we will ourself create some of the data)
- Data preprocessing (NLP pipeline)
- Toekinzation or word seperation 
- Stop words removal ( Removal of the unnecessary words)
- Lemmatization or the stemming (Using the sklearn library)

- Constructing the vocab
- Bigram and trigrams (Explanation and use)
- Tf-Idf normalization

In [3]:
### Exploring the different dataset in the corpus
from nltk.corpus import brown

print("Brown contain {} total is {}".format(brown.categories(),len(brown.categories())))

Brown contain ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] total is 15


In [7]:
data = brown.sents(categories = 'fiction')
print(len(data))

4249


In [9]:
#1 sentence in fiction category
' '.join(data[1])

'Scotty did not go back to school .'

In [12]:
### Data 
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [14]:
### Tokenization using the sklearn
from nltk.tokenize import sent_tokenize,word_tokenize
st = sent_tokenize(document)
wt = word_tokenize(document)

In [44]:
### Tokenization using the custum code

def tokenizeSent(data,typeOfTokenize):
    """Tokenize the sentense"""
    if(typeOfTokenize=="sentTokenize"):
        data = data.split('.')
        return data
    elif(typeOfTokenize=="wordTokenize"):
        data = data.split(' ')
        return data
    else:
        print("Tokenize type not available")
        return "Error"

TokenizedWord = tokenizeSent(document,"wordTokenize")

In [45]:
### StopWords removal using the inbuilt stopwords dataset
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

def stopWordsRemoval(document):
    data = tokenizeSent(document,"wordTokenize")
    useful_words = [i for i in data if i not in sw]
    return useful_words

print(stopWordsRemoval(document))

['It', 'pleasant', 'day.', 'The', 'weather', 'cool', 'light', 'showers.', '\nI', 'went', 'market', 'buy', 'fruits.']


In [53]:
## StopWords removal using the custum words
sw_custum = set(['is','It','There','there','was','were','very','a','The','the','to'])

def stopWordsRemoval_custum(document):
    data = tokenizeSent(document,"wordTokenize")
    useful_words = [i for i in data if i not in sw_custum]
    return useful_words

print(stopWordsRemoval_custum(document))

['pleasant', 'day.', 'weather', 'cool', 'and', 'light', 'showers.', '\nI', 'went', 'market', 'buy', 'some', 'fruits.']


In [58]:
### Stemming using the sklearn 
from nltk.stem import PorterStemmer,SnowballStemmer,LancasterStemmer

ps = PorterStemmer()
print(ps.stem('jumping'))

ss = SnowballStemmer('english')
print(ss.stem('jumping'))

jump
jump


In [59]:
## Lemmatization
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

In [78]:
### Building the vocab using the custum code
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

## Create the dict for the uncommon words
VocabDict = {}

count = 0

for i in corpus:
    for j in i.split(' '):
        if(j not in VocabDict):
            VocabDict[j] = count
            count += 1

print(VocabDict)
print("The length of the vocab is {}".format(len(VocabDict)))

vector_coupus = [0]*len(VocabDict)
for i in corpus[0].split(' '):
    vector_coupus[VocabDict[i]] += 1
    
print(vector_coupus)

{'Indian': 0, 'cricket': 1, 'team': 2, 'will': 3, 'wins': 4, 'World': 5, 'Cup,': 6, 'says': 7, 'Capt.': 8, 'Virat': 9, 'Kohli.': 10, 'cup': 11, 'be': 12, 'held': 13, 'at': 14, 'Sri': 15, 'Lanka.': 16, 'We': 17, 'win': 18, 'next': 19, 'Lok': 20, 'Sabha': 21, 'Elections,': 22, 'confident': 23, 'PM': 24, 'The': 25, 'nobel': 26, 'laurate': 27, 'won': 28, 'the': 29, 'hearts': 30, 'of': 31, 'people.': 32, 'movie': 33, 'Raazi': 34, 'is': 35, 'an': 36, 'exciting': 37, 'Spy': 38, 'thriller': 39, 'based': 40, 'upon': 41, 'a': 42, 'real': 43, 'story.': 44}
The length of the vocab is 45
[1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [69]:
##USing the sklearn

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus = vectorized_corpus.toarray()

print(vectorized_corpus[0])

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [71]:
print(cv.vocabulary_)
print(len(cv.vocabulary_))

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}
42


In [72]:
s = cv.inverse_transform(vectorized_corpus)
print(s)

[array(['at', 'be', 'capt', 'cricket', 'cup', 'held', 'indian', 'kohli',
       'lanka', 'says', 'sri', 'team', 'virat', 'will', 'wins', 'world'],
      dtype='<U9'), array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
       'says', 'we', 'will', 'win'], dtype='<U9'), array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9'), array(['an', 'based', 'exciting', 'indian', 'is', 'movie', 'raazi',
       'real', 'spy', 'story', 'the', 'thriller', 'upon'], dtype='<U9')]


In [84]:
### USe of ngrams and bgrams
cv = CountVectorizer(ngram_range=(1,3))
vectorized_corpus = cv.fit_transform(corpus).toarray()
print(vectorized_corpus[0])

[0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1
 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0
 0 0 2 1 1 0 0 1 1 0 0 0 1 1 1 0 0 0 2 2 1 1]


In [85]:
s = cv.inverse_transform(vectorized_corpus)
print(s)

[array(['at', 'at sri', 'at sri lanka', 'be', 'be held', 'be held at',
       'capt', 'capt virat', 'capt virat kohli', 'cricket',
       'cricket team', 'cricket team will', 'cup', 'cup says',
       'cup says capt', 'cup will', 'cup will be', 'held', 'held at',
       'held at sri', 'indian', 'indian cricket', 'indian cricket team',
       'kohli', 'kohli world', 'kohli world cup', 'lanka', 'says',
       'says capt', 'says capt virat', 'sri', 'sri lanka', 'team',
       'team will', 'team will wins', 'virat', 'virat kohli',
       'virat kohli world', 'will', 'will be', 'will be held',
       'will wins', 'will wins world', 'wins', 'wins world',
       'wins world cup', 'world', 'world cup', 'world cup says',
       'world cup will'], dtype='<U24'), array(['confident', 'confident indian', 'confident indian pm',
       'elections', 'elections says', 'elections says confident',
       'indian', 'indian pm', 'lok', 'lok sabha', 'lok sabha elections',
       'next', 'next lok', 'next lo

##### Tf-idf Normalisation
- Avoid features that occur very often, becauase they contain less information
- Information decreases as the number of occurences increases across different type of documents
- So we define another term - term-document-frequency which associates a weight with every term
- Formula used is log(N/1+count(t,D))

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
vc = tfidf.fit_transform(corpus).toarray()
print(vc)

[[0.         0.21074652 0.         0.21074652 0.21074652 0.
  0.21074652 0.42149305 0.         0.         0.         0.21074652
  0.13451678 0.         0.21074652 0.21074652 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.16615498 0.         0.21074652
  0.         0.21074652 0.         0.         0.         0.21074652
  0.         0.33230996 0.         0.21074652 0.         0.42149305]
 [0.         0.         0.         0.         0.         0.32190145
  0.         0.         0.32190145 0.         0.         0.
  0.20546553 0.         0.         0.         0.         0.32190145
  0.         0.32190145 0.         0.         0.         0.32190145
  0.         0.         0.32190145 0.2537908  0.         0.
  0.         0.         0.         0.         0.         0.
  0.32190145 0.2537908  0.32190145 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.        

In [88]:
print(tfidf.get_feature_names())

['an', 'at', 'based', 'be', 'capt', 'confident', 'cricket', 'cup', 'elections', 'exciting', 'hearts', 'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok', 'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real', 'sabha', 'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller', 'upon', 'virat', 'we', 'will', 'win', 'wins', 'won', 'world']
