<a href="https://colab.research.google.com/github/gokul8747/NLP/blob/main/BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag Of Words

In [1]:
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# Fetch the NLTK resources the cells below rely on.
nltk.download('punkt')      # sentence tokenizer models used by sent_tokenize
nltk.download('punkt_tab')  # recent NLTK releases load the sentence tokenizer from this resource instead of 'punkt'
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data accompanying 'wordnet'
nltk.download("stopwords")  # stop-word lists read via stopwords.words("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Sample corpus**

In [2]:
# Toy corpus for the bag-of-words demo: eight English sentences, one per line.
corpus = '''Bhagat Singh was a young revolutionary freedom fighter who had sacrificed his life for making India a free nation.
This young freedom fighter was born on 28th September in the year 1907 in the Punjab region that lies in Pakistan at present.
Bhagat Singh was initially impressed by the Gandhian principles and thus supported the Swadeshi movement.
Later, he thought of dropping the idea of non-violence because his mind totally changed after the Chauri-Chaura incident during the Non-cooperation movement.
He thought that revolt is necessary and only the way to drive out the Britishers from the nation.
This is the point where the ideology of Mahatma Gandhi and Bhagat Singh changed.
He had been imprisoned several times but never slowed down his struggle for independence.
The movies made on Bhagat Singh reveal the life events of this great hero of the nation.
'''

**Text preprocessing**

In [3]:
# Split the raw corpus into a list of sentence strings; each sentence is treated as one document later on.
sent_token = sent_tokenize(corpus)
sent_token

['Bhagat Singh was a young revolutionary freedom fighter who had sacrificed his life for making India a free nation.',
 'This young freedom fighter was born on 28th September in the year 1907 in the Punjab region that lies in Pakistan at present.',
 'Bhagat Singh was initially impressed by the Gandhian principles and thus supported the Swadeshi movement.',
 'Later, he thought of dropping the idea of non-violence because his mind totally changed after the Chauri-Chaura incident during the Non-cooperation movement.',
 'He thought that revolt is necessary and only the way to drive out the Britishers from the nation.',
 'This is the point where the ideology of Mahatma Gandhi and Bhagat Singh changed.',
 'He had been imprisoned several times but never slowed down his struggle for independence.',
 'The movies made on Bhagat Singh reveal the life events of this great hero of the nation.']

In [4]:
# Build the punctuation-deleting translation table once instead of once per sentence.
punctuation_table = str.maketrans("", "", string.punctuation)

# Strip every ASCII punctuation character from each sentence.
# Note: punctuation is deleted, not replaced by a space, so hyphenated words are
# merged ("non-violence" -> "nonviolence", "Chauri-Chaura" -> "ChauriChaura").
token = [sent.translate(punctuation_table) for sent in sent_token]
token

['Bhagat Singh was a young revolutionary freedom fighter who had sacrificed his life for making India a free nation',
 'This young freedom fighter was born on 28th September in the year 1907 in the Punjab region that lies in Pakistan at present',
 'Bhagat Singh was initially impressed by the Gandhian principles and thus supported the Swadeshi movement',
 'Later he thought of dropping the idea of nonviolence because his mind totally changed after the ChauriChaura incident during the Noncooperation movement',
 'He thought that revolt is necessary and only the way to drive out the Britishers from the nation',
 'This is the point where the ideology of Mahatma Gandhi and Bhagat Singh changed',
 'He had been imprisoned several times but never slowed down his struggle for independence',
 'The movies made on Bhagat Singh reveal the life events of this great hero of the nation']

In [11]:
# Normalise each punctuation-free sentence: keep letters only, lowercase,
# drop English stop words, and lemmatize the remaining words as verbs.
processed_token = []
lemmatizer = WordNetLemmatizer()
# Load the stop-word list once; rebuilding the set for every word re-reads the
# corpus file each time and is loop-invariant work.
english_stopwords = set(stopwords.words("english"))
for sent in token:
  # Any non-letter (digits included) becomes a space, so "28th" is reduced to "th"
  # and "1907" disappears entirely.
  raw_sent = re.sub('[^a-zA-Z]'," ",sent)
  raw_sent = raw_sent.lower()
  raw_sent = raw_sent.split()
  # pos="v" lemmatizes every word as a verb (e.g. "sacrificed" -> "sacrifice"),
  # which leaves plural nouns such as "movies" untouched.
  lemm_sent_list = [lemmatizer.lemmatize(word,pos="v") for word in raw_sent if word not in english_stopwords]
  lemm_sent = " ".join(lemm_sent_list)
  processed_token.append(lemm_sent)

In [12]:
# Inspect the cleaned sentences: lowercase, stop words removed, words lemmatized.
processed_token

['bhagat singh young revolutionary freedom fighter sacrifice life make india free nation',
 'young freedom fighter bear th september year punjab region lie pakistan present',
 'bhagat singh initially impress gandhian principles thus support swadeshi movement',
 'later think drop idea nonviolence mind totally change chaurichaura incident noncooperation movement',
 'think revolt necessary way drive britishers nation',
 'point ideology mahatma gandhi bhagat singh change',
 'imprison several time never slow struggle independence',
 'movies make bhagat singh reveal life events great hero nation']

**Bag of Words** representation of the sample corpus above

In [13]:
# Learn the vocabulary from the cleaned sentences and count how often each term occurs in each sentence.
vectorizer = CountVectorizer()
x_vect = vectorizer.fit_transform(processed_token)

In [15]:
# Convert the count matrix to a dense array for display: one row per sentence, one column per vocabulary term.
x_vect.toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 