<a href="https://colab.research.google.com/github/chaturvediajay/NLP/blob/main/final%2Bbag%2Bof%2Bwords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Bag of words model

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

import nltk
words=nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Let's build a basic bag of words model on three sample documents

In [9]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [13]:
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    # words = word_tokenize(document)


    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

documents = [preprocess(document) for document in documents]
print(documents)


UnboundLocalError: ignored

#### Creating bag of words model using count vectorizer function

In [None]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # returns the row number and column number of the cells which have 1 as value

In [None]:
# print the full sparse matrix
print(bow_model.toarray())

In [None]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

### Let's create a bag of words model on the spam dataset.

In [None]:
# load data
spam = pd.read_csv("/content/drive/MyDrive/al_ml_project/NLP/SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

##### Let's take a subset of data (first 50 rows only) and create bag of word model on that.

In [None]:
spam = spam.iloc[0:50,:]
print(spam)

In [None]:
# extract the messages from the dataframe
messages = spam.message
print(messages)

In [None]:
# convert messages into list
messages = [message for message in messages]
print(messages)

In [None]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(messages)

In [None]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)
print(bow_model.toarray())

In [None]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

* A lot of duplicate tokens such as 'win'and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc. 

## Stemming and lemmatising

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    
    return document

### Bag of words model on stemmed messages

In [None]:
# stem messages
messages = [preprocess(message, stem=True) for message in spam.message]

# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [None]:
# look at the dataframe
pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [None]:
# token names
print(vectorizer.get_feature_names())

### 359 tokens after stemming the messages as compared to 381 tokens without stemming.

### Let's try lemmatizing the messages.

In [None]:
# lemmatise messages
messages = [preprocess(message, stem=False) for message in spam.message]

# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [None]:
# look at the dataframe
pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [None]:
# token names
print(vectorizer.get_feature_names())

### 363 tokens after lemmatizing the messages as compared to 381 tokens without lemmatising. But, on the other hand, stemmer reduces the token count to 359. Lemmatization doesn't work as expected because the data is very unclean.