## Text Preprocessing - Word embedding (without using a neural network) for NLP
#### We will be using NLTK, the Natural Language Toolkit

In [2]:
# Suppress every warning for the rest of the session so that warning text
# does not clutter the cell outputs. Note this also hides warnings that may matter.
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Install NLTK using the interpreter that runs this kernel (sys.executable),
# so the package is installed for the same Python the notebook imports from.
import sys
!{sys.executable} -m pip install nltk


In [32]:
# Sample corpus (text found via Google), stored as one triple-quoted string literal.
# It contains six sentences and is the input to every step that follows.
paragraph = """Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at the age of eight. His account of helping his father sell tea at the Vadnagar railway station has not been reliably corroborated. At age 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so. Modi became a full-time worker for the RSS in Gujarat in 1971. The RSS assigned him to the BJP in 1985 and he held several positions within the party hierarchy until 2001, rising to the rank of general secretary"""

In [33]:
# Echo the raw text to confirm it was stored as a single string.
paragraph   

'Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at the age of eight. His account of helping his father sell tea at the Vadnagar railway station has not been reliably corroborated. At age 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so. Modi became a full-time worker for the RSS in Gujarat in 1971. The RSS assigned him to the BJP in 1985 and he held several positions within the party hierarchy until 2001, rising to the rank of general secretary'

In [34]:
# Importing NLTK libraries for stemming and stop words
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


=======================================================================================

### 1.  Tokenization - converting paragraph to sentences then words

In [35]:
# sent_tokenize relies on the Punkt models, so fetch them first
# (the download is a no-op when the package is already up to date).
nltk.download('punkt')

# Split the paragraph into a list holding one string per sentence.
sentences = nltk.sent_tokenize(paragraph)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
sentences # the list of sentence strings produced by sent_tokenize from the paragraph

['Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education.',
 'He was introduced to the RSS at the age of eight.',
 'His account of helping his father sell tea at the Vadnagar railway station has not been reliably corroborated.',
 'At age 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so.',
 'Modi became a full-time worker for the RSS in Gujarat in 1971.',
 'The RSS assigned him to the BJP in 1985 and he held several positions within the party hierarchy until 2001, rising to the rank of general secretary']

In [37]:
# Index 5 is the sixth (and last) sentence of the paragraph.
sentences[5]

'The RSS assigned him to the BJP in 1985 and he held several positions within the party hierarchy until 2001, rising to the rank of general secretary'

=======================================================================================

### 2. Stemming - finding base root word from multiple words

In [38]:
# One Porter stemmer instance, reused by every stemming cell below.
stemmer=PorterStemmer()

In [39]:
stemmer.stem("going") # the stemmer strips the suffix and returns the base form "go"

'go'

In [40]:
# Same idea as above: "thinking" is reduced to "think".
stemmer.stem("thinking") 

'think'

=======================================================================================

### 2. Lemmatization - finding the meaningful dictionary form (lemma) of a word (an alternative to stemming)

In [None]:
from nltk.stem import WordNetLemmatizer
# WordNet is the dictionary data the WordNetLemmatizer looks words up in.
nltk.download('wordnet')
# The stop-word lists are needed by stopwords.words("english") in the cleaning steps below.
nltk.download('stopwords')

In [42]:
# One lemmatizer instance, reused by every lemmatization cell below.
lemmatizer=WordNetLemmatizer() 

In [43]:
# Returned unchanged: lemmatize() treats the word as a noun unless a part of speech
# is passed, and as a noun "thinking" is already a dictionary form.
# NOTE(review): lemmatize('thinking', pos='v') should give 'think' — confirm against the NLTK docs.
lemmatizer.lemmatize('thinking')

'thinking'

In [44]:
# Unlike a stemmer, the lemmatizer leaves "finally" intact (see the output).
lemmatizer.lemmatize('finally')

'finally'

=======================================================================================

### 3. Cleaning the entire text like special char using regex
- Applying regex
- Applying Lowering case
- Applying Lemmatization
- Applying stopwords

In [45]:
# Number of sentences that the cleaning loop below will process.
len(sentences)

6

In [46]:
import re

# Build the stop-word set once. The previous version rebuilt it from the NLTK
# list for every single word, which is loop-invariant work.
stop_words = set(stopwords.words("english"))

corpus = []
for sentence in sentences:
    # Replace every character that is not an ASCII letter (digits, punctuation,
    # hyphens, ...) with a space, so "full-time" becomes two words and "1971" vanishes.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    # Lower-case so the stop-word lookup and the later vocabulary are case-insensitive.
    review = review.lower()
    tokens = review.split()
    # Drop stop words, lemmatize what is left and rebuild the cleaned sentence.
    # NOTE(review): the saved output shows "rss" coming out as "r", presumably because
    # the noun lemmatizer strips the trailing "s" — confirm if acronyms matter.
    review = " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )
    corpus.append(review)

corpus


['modi born raised vadnagar northeastern gujarat completed secondary education',
 'introduced r age eight',
 'account helping father sell tea vadnagar railway station reliably corroborated',
 'age married jashodaben modi abandoned soon publicly acknowledging four decade later legally required',
 'modi became full time worker r gujarat',
 'r assigned bjp held several position within party hierarchy rising rank general secretary']

In [47]:
stopwords.words('english') # inspect the English stop words: frequent words that carry little meaning on their own

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

=======================================================================================

### 4. Applying Stemming


In [48]:
# Build the stop-word set once instead of once per token (it never changes in the loop).
stop_words = set(stopwords.words("english"))

for sentence in sentences:
    # word_tokenize splits a sentence into word tokens; punctuation marks come out
    # as tokens of their own (see the "," and "." entries in the output).
    for word in nltk.word_tokenize(sentence):
        # Stem and print every token that is not a stop word.
        # NOTE: the check is case-sensitive and the NLTK list is lower-case, so
        # capitalised stop words ("He", "His", "At", "The") are not filtered out.
        if word not in stop_words:
            print(stemmer.stem(word))

modi
born
rais
vadnagar
northeastern
gujarat
,
complet
secondari
educ
.
he
introduc
rss
age
eight
.
hi
account
help
father
sell
tea
vadnagar
railway
station
reliabl
corrobor
.
at
age
18
,
marri
jashodaben
modi
,
abandon
soon
,
publicli
acknowledg
four
decad
later
legal
requir
.
modi
becam
full-tim
worker
rss
gujarat
1971
.
the
rss
assign
bjp
1985
held
sever
posit
within
parti
hierarchi
2001
,
rise
rank
gener
secretari


=======================================================================================

### 5. Applying Lemmatization

In [49]:
# Build the stop-word set once instead of once per token (it never changes in the loop).
stop_words = set(stopwords.words("english"))

for sentence in sentences:
    # word_tokenize splits a sentence into word tokens; punctuation marks come out
    # as tokens of their own (see the "," and "." entries in the output).
    for word in nltk.word_tokenize(sentence):
        # Lemmatize and print every token that is not a stop word.
        # NOTE: the check is case-sensitive and the NLTK list is lower-case, so
        # capitalised stop words ("He", "His", "At", "The") are not filtered out.
        if word not in stop_words:
            print(lemmatizer.lemmatize(word))

Modi
born
raised
Vadnagar
northeastern
Gujarat
,
completed
secondary
education
.
He
introduced
RSS
age
eight
.
His
account
helping
father
sell
tea
Vadnagar
railway
station
reliably
corroborated
.
At
age
18
,
married
Jashodaben
Modi
,
abandoned
soon
,
publicly
acknowledging
four
decade
later
legally
required
.
Modi
became
full-time
worker
RSS
Gujarat
1971
.
The
RSS
assigned
BJP
1985
held
several
position
within
party
hierarchy
2001
,
rising
rank
general
secretary


=======================================================================================

### 6. Bag of Words 

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Plain bag-of-words vectorizer with default settings (single-word features, raw counts).
cv = CountVectorizer()

In [51]:
# Learn the vocabulary from the cleaned corpus and encode each sentence as a row of word counts.
X= cv.fit_transform(corpus)

##### 6.1 Checking the vocabulary with index 

A unique index is assigned to each word so that it can be identified.

In [52]:
cv.vocabulary_ # maps each word to its column index (feature number f0, f1, ..., fn), not to its frequency

{'modi': 26,
 'born': 7,
 'raised': 32,
 'vadnagar': 45,
 'northeastern': 27,
 'gujarat': 17,
 'completed': 8,
 'secondary': 37,
 'education': 11,
 'introduced': 21,
 'age': 3,
 'eight': 12,
 'account': 1,
 'helping': 19,
 'father': 13,
 'sell': 39,
 'tea': 43,
 'railway': 31,
 'station': 42,
 'reliably': 34,
 'corroborated': 9,
 'married': 25,
 'jashodaben': 22,
 'abandoned': 0,
 'soon': 41,
 'publicly': 30,
 'acknowledging': 2,
 'four': 14,
 'decade': 10,
 'later': 23,
 'legally': 24,
 'required': 35,
 'became': 5,
 'full': 15,
 'time': 44,
 'worker': 47,
 'assigned': 4,
 'bjp': 6,
 'held': 18,
 'several': 40,
 'position': 29,
 'within': 46,
 'party': 28,
 'hierarchy': 20,
 'rising': 36,
 'rank': 33,
 'general': 16,
 'secretary': 38}

##### 6.2 Checking bag of words for first sentence

In [53]:
corpus[0] # first sentence after cleaning (stop words removed, lower-cased, lemmatized)

'modi born raised vadnagar northeastern gujarat completed secondary education'

In [54]:
# Width of the count matrix = number of vocabulary features per sentence vector.
length = X.shape[1]
length

48


These 48 words are the unigram features. Wherever a feature word occurs in the first sentence, its position in the vector holds that word's count.

In the first sentence we have the word "born", whose index is 7, so position 7 holds a count of 1; likewise "raised" has index 32 and a count of 1. No word is repeated in this sentence after cleaning, so no count exceeds 1.

In [55]:
# Dense view of the first sentence's count vector (one entry per vocabulary word).
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0]], dtype=int64)

##### 6.3 Converting to a binary Bag of Words: any word appearing more than once is recorded as 1

In [56]:

# binary=True records presence/absence only: any count above 1 is stored as 1.
cv_binary = CountVectorizer(binary=True)
X_binray = cv_binary.fit_transform(corpus)

# First sentence as a dense 0/1 vector.
X_binray[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0]], dtype=int64)

- This is the problem of sparsity (most entries are 0), which is really bad for large data sets.

##### 6.4 Using Ngram in Bag of words

Creating only trigram

In [59]:
# ngram_range=(3, 3): the vocabulary consists of three-word sequences only.
cv_trigram = CountVectorizer(
    binary=True,
    ngram_range=(3, 3),
)
X_trigram = cv_trigram.fit_transform(corpus)

# Mapping from each trigram to its column index.
cv_trigram.vocabulary_

{'modi born raised': 23,
 'born raised vadnagar': 7,
 'raised vadnagar northeastern': 29,
 'vadnagar northeastern gujarat': 38,
 'northeastern gujarat completed': 24,
 'gujarat completed secondary': 13,
 'completed secondary education': 8,
 'introduced age eight': 17,
 'account helping father': 1,
 'helping father sell': 15,
 'father sell tea': 10,
 'sell tea vadnagar': 32,
 'tea vadnagar railway': 36,
 'vadnagar railway station': 39,
 'railway station reliably': 28,
 'station reliably corroborated': 35,
 'age married jashodaben': 3,
 'married jashodaben modi': 20,
 'jashodaben modi abandoned': 18,
 'modi abandoned soon': 21,
 'abandoned soon publicly': 0,
 'soon publicly acknowledging': 34,
 'publicly acknowledging four': 27,
 'acknowledging four decade': 2,
 'four decade later': 11,
 'decade later legally': 9,
 'later legally required': 19,
 'modi became full': 22,
 'became full time': 5,
 'full time worker': 12,
 'time worker gujarat': 37,
 'assigned bjp held': 4,
 'bjp held several

Creating both Bi and Tri gram   

In [60]:
# ngram_range=(2, 3): the vocabulary holds both two-word and three-word sequences.
cv_bi_trigram = CountVectorizer(
    binary=True,
    ngram_range=(2, 3),
)
X_bi_trigram = cv_bi_trigram.fit_transform(corpus)

# Mapping from each bigram/trigram to its column index.
cv_bi_trigram.vocabulary_

{'modi born': 49,
 'born raised': 15,
 'raised vadnagar': 61,
 'vadnagar northeastern': 81,
 'northeastern gujarat': 51,
 'gujarat completed': 28,
 'completed secondary': 17,
 'secondary education': 68,
 'modi born raised': 50,
 'born raised vadnagar': 16,
 'raised vadnagar northeastern': 62,
 'vadnagar northeastern gujarat': 82,
 'northeastern gujarat completed': 52,
 'gujarat completed secondary': 29,
 'completed secondary education': 18,
 'introduced age': 36,
 'age eight': 6,
 'introduced age eight': 37,
 'account helping': 2,
 'helping father': 32,
 'father sell': 21,
 'sell tea': 69,
 'tea vadnagar': 77,
 'vadnagar railway': 83,
 'railway station': 59,
 'station reliably': 75,
 'reliably corroborated': 65,
 'account helping father': 3,
 'helping father sell': 33,
 'father sell tea': 22,
 'sell tea vadnagar': 70,
 'tea vadnagar railway': 78,
 'vadnagar railway station': 84,
 'railway station reliably': 60,
 'station reliably corroborated': 76,
 'age married': 7,
 'married jashodab

==================================================================================

### 7. TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over single words: same vocabulary as the bag of words, but weighted values instead of counts.
cv_tfidf = TfidfVectorizer()
X_tfidf = cv_tfidf.fit_transform(corpus)

In [62]:
# The cleaned first sentence, shown again for comparison with its TF-IDF vector below.
corpus[0]

'modi born raised vadnagar northeastern gujarat completed secondary education'

In [63]:
# Dense TF-IDF vector of the first sentence (one weight per vocabulary word).
X_tfidf[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.35750457, 0.35750457, 0.        ,
        0.        , 0.35750457, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.29315886, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.24750486, 0.35750457, 0.        , 0.        ,
        0.        , 0.        , 0.35750457, 0.        , 0.        ,
        0.        , 0.        , 0.35750457, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.29315886, 0.        , 0.        ]])

- Here we can see different weights: words that appear in several sentences get a lower weight (e.g. "modi", index 26, at 0.2475), words that appear rarely get a higher weight (0.3575), and words absent from this sentence have a weight of 0.

##### 7.1 Using Ngram

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF where every feature is a three-word sequence.
cv_tfidf_tri = TfidfVectorizer(
    ngram_range=(3, 3),
)
X_tfidf_tri = cv_tfidf_tri.fit_transform(corpus)

In [65]:
# Dense trigram TF-IDF vector of the first sentence.
X_tfidf_tri[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.37796447, 0.37796447, 0.        ,
        0.        , 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.37796447, 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.37796447, 0.        ,
        0.        ]])

- Here the weights are different because they are now computed per trigram rather than per word.

##### 7.2 Using `max_features`

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF with the vocabulary capped at 10 features.
cv_tfidf_max_feature = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=10,
)
X_tfidf_max_feature = cv_tfidf_max_feature.fit_transform(corpus)

In [69]:
# Dense TF-IDF vector of the first sentence; only 10 entries because of max_features=10.
X_tfidf_max_feature[0].toarray()

array([[0.        , 0.        , 0.48795307, 0.41196351, 0.59505434,
        0.        , 0.        , 0.        , 0.        , 0.48795307]])

- The vector is now reduced to 10 entries: because we set `max_features=10`, only the 10 most frequent terms in the corpus are kept as features.